Total coverage: 102499 (6%) of 1872620
/*
   BlueZ - Bluetooth protocol stack for Linux
   Copyright (C) 2000-2001 Qualcomm Incorporated
   Copyright (C) 2011 ProFUSION Embedded Systems

   Written 2000,2001 by Maxim Krasnyansky <maxk@qualcomm.com>

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License version 2 as
   published by the Free Software Foundation;

   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
   OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS.
   IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) AND AUTHOR(S) BE LIABLE FOR ANY
   CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES
   WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
   ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
   OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.

   ALL LIABILITY, INCLUDING LIABILITY FOR INFRINGEMENT OF ANY PATENTS,
   COPYRIGHTS, TRADEMARKS OR OTHER RIGHTS, RELATING TO USE OF THIS
   SOFTWARE IS DISCLAIMED.
*/

/* Bluetooth HCI core.
*/ #include <linux/export.h> #include <linux/rfkill.h> #include <linux/debugfs.h> #include <linux/crypto.h> #include <linux/kcov.h> #include <linux/property.h> #include <linux/suspend.h> #include <linux/wait.h> #include <linux/unaligned.h> #include <net/bluetooth/bluetooth.h> #include <net/bluetooth/hci_core.h> #include <net/bluetooth/l2cap.h> #include <net/bluetooth/mgmt.h> #include "hci_debugfs.h" #include "smp.h" #include "leds.h" #include "msft.h" #include "aosp.h" #include "hci_codec.h" static void hci_rx_work(struct work_struct *work); static void hci_cmd_work(struct work_struct *work); static void hci_tx_work(struct work_struct *work); /* HCI device list */ LIST_HEAD(hci_dev_list); DEFINE_RWLOCK(hci_dev_list_lock); /* HCI callback list */ LIST_HEAD(hci_cb_list); DEFINE_MUTEX(hci_cb_list_lock); /* HCI ID Numbering */ static DEFINE_IDA(hci_index_ida); /* Get HCI device by index. * Device is held on return. */ static struct hci_dev *__hci_dev_get(int index, int *srcu_index) { struct hci_dev *hdev = NULL, *d; BT_DBG("%d", index); if (index < 0) return NULL; read_lock(&hci_dev_list_lock); list_for_each_entry(d, &hci_dev_list, list) { if (d->id == index) { hdev = hci_dev_hold(d); if (srcu_index) *srcu_index = srcu_read_lock(&d->srcu); break; } } read_unlock(&hci_dev_list_lock); return hdev; } struct hci_dev *hci_dev_get(int index) { return __hci_dev_get(index, NULL); } static struct hci_dev *hci_dev_get_srcu(int index, int *srcu_index) { return __hci_dev_get(index, srcu_index); } static void hci_dev_put_srcu(struct hci_dev *hdev, int srcu_index) { srcu_read_unlock(&hdev->srcu, srcu_index); hci_dev_put(hdev); } /* ---- Inquiry support ---- */ bool hci_discovery_active(struct hci_dev *hdev) { struct discovery_state *discov = &hdev->discovery; switch (discov->state) { case DISCOVERY_FINDING: case DISCOVERY_RESOLVING: return true; default: return false; } } void hci_discovery_set_state(struct hci_dev *hdev, int state) { int old_state = hdev->discovery.state; if (old_state == state) return; hdev->discovery.state = state; switch (state) { case DISCOVERY_STOPPED: hci_update_passive_scan(hdev); if (old_state != DISCOVERY_STARTING) mgmt_discovering(hdev, 0); break; case DISCOVERY_STARTING: break; case DISCOVERY_FINDING: mgmt_discovering(hdev, 1); break; case DISCOVERY_RESOLVING: break; case DISCOVERY_STOPPING: break; } bt_dev_dbg(hdev, "state %u -> %u", old_state, state); } void hci_inquiry_cache_flush(struct hci_dev *hdev) { struct discovery_state *cache = &hdev->discovery; struct inquiry_entry *p, *n; list_for_each_entry_safe(p, n, &cache->all, all) { list_del(&p->all); kfree(p); } INIT_LIST_HEAD(&cache->unknown); INIT_LIST_HEAD(&cache->resolve); } struct inquiry_entry *hci_inquiry_cache_lookup(struct hci_dev *hdev, bdaddr_t *bdaddr) { struct discovery_state *cache = &hdev->discovery; struct inquiry_entry *e; BT_DBG("cache %p, %pMR", cache, bdaddr); list_for_each_entry(e, &cache->all, all) { if (!bacmp(&e->data.bdaddr, bdaddr)) return e; } return NULL; } struct inquiry_entry *hci_inquiry_cache_lookup_unknown(struct hci_dev *hdev, bdaddr_t *bdaddr) { struct discovery_state *cache = &hdev->discovery; struct inquiry_entry *e; BT_DBG("cache %p, %pMR", cache, bdaddr); list_for_each_entry(e, &cache->unknown, list) { if (!bacmp(&e->data.bdaddr, bdaddr)) return e; } return NULL; } struct inquiry_entry *hci_inquiry_cache_lookup_resolve(struct hci_dev *hdev, bdaddr_t *bdaddr, int state) { struct discovery_state *cache = &hdev->discovery; struct inquiry_entry *e; BT_DBG("cache %p bdaddr %pMR state %d", 
cache, bdaddr, state); list_for_each_entry(e, &cache->resolve, list) { if (!bacmp(bdaddr, BDADDR_ANY) && e->name_state == state) return e; if (!bacmp(&e->data.bdaddr, bdaddr)) return e; } return NULL; } void hci_inquiry_cache_update_resolve(struct hci_dev *hdev, struct inquiry_entry *ie) { struct discovery_state *cache = &hdev->discovery; struct list_head *pos = &cache->resolve; struct inquiry_entry *p; list_del(&ie->list); list_for_each_entry(p, &cache->resolve, list) { if (p->name_state != NAME_PENDING && abs(p->data.rssi) >= abs(ie->data.rssi)) break; pos = &p->list; } list_add(&ie->list, pos); } u32 hci_inquiry_cache_update(struct hci_dev *hdev, struct inquiry_data *data, bool name_known) { struct discovery_state *cache = &hdev->discovery; struct inquiry_entry *ie; u32 flags = 0; BT_DBG("cache %p, %pMR", cache, &data->bdaddr); hci_remove_remote_oob_data(hdev, &data->bdaddr, BDADDR_BREDR); if (!data->ssp_mode) flags |= MGMT_DEV_FOUND_LEGACY_PAIRING; ie = hci_inquiry_cache_lookup(hdev, &data->bdaddr); if (ie) { if (!ie->data.ssp_mode) flags |= MGMT_DEV_FOUND_LEGACY_PAIRING; if (ie->name_state == NAME_NEEDED && data->rssi != ie->data.rssi) { ie->data.rssi = data->rssi; hci_inquiry_cache_update_resolve(hdev, ie); } goto update; } /* Entry not in the cache. Add new one. */ ie = kzalloc(sizeof(*ie), GFP_KERNEL); if (!ie) { flags |= MGMT_DEV_FOUND_CONFIRM_NAME; goto done; } list_add(&ie->all, &cache->all); if (name_known) { ie->name_state = NAME_KNOWN; } else { ie->name_state = NAME_NOT_KNOWN; list_add(&ie->list, &cache->unknown); } update: if (name_known && ie->name_state != NAME_KNOWN && ie->name_state != NAME_PENDING) { ie->name_state = NAME_KNOWN; list_del(&ie->list); } memcpy(&ie->data, data, sizeof(*data)); ie->timestamp = jiffies; cache->timestamp = jiffies; if (ie->name_state == NAME_NOT_KNOWN) flags |= MGMT_DEV_FOUND_CONFIRM_NAME; done: return flags; } static int inquiry_cache_dump(struct hci_dev *hdev, int num, __u8 *buf) { struct discovery_state *cache = &hdev->discovery; struct inquiry_info *info = (struct inquiry_info *) buf; struct inquiry_entry *e; int copied = 0; list_for_each_entry(e, &cache->all, all) { struct inquiry_data *data = &e->data; if (copied >= num) break; bacpy(&info->bdaddr, &data->bdaddr); info->pscan_rep_mode = data->pscan_rep_mode; info->pscan_period_mode = data->pscan_period_mode; info->pscan_mode = data->pscan_mode; memcpy(info->dev_class, data->dev_class, 3); info->clock_offset = data->clock_offset; info++; copied++; } BT_DBG("cache %p, copied %d", cache, copied); return copied; } int hci_inquiry(void __user *arg) { __u8 __user *ptr = arg; struct hci_inquiry_req ir; struct hci_dev *hdev; int err = 0, do_inquiry = 0, max_rsp; __u8 *buf; if (copy_from_user(&ir, ptr, sizeof(ir))) return -EFAULT; hdev = hci_dev_get(ir.dev_id); if (!hdev) return -ENODEV; if (hci_dev_test_flag(hdev, HCI_USER_CHANNEL)) { err = -EBUSY; goto done; } if (hci_dev_test_flag(hdev, HCI_UNCONFIGURED)) { err = -EOPNOTSUPP; goto done; } if (!hci_dev_test_flag(hdev, HCI_BREDR_ENABLED)) { err = -EOPNOTSUPP; goto done; } /* Restrict maximum inquiry length to 60 seconds */ if (ir.length > 60) { err = -EINVAL; goto done; } hci_dev_lock(hdev); if (inquiry_cache_age(hdev) > INQUIRY_CACHE_AGE_MAX || inquiry_cache_empty(hdev) || ir.flags & IREQ_CACHE_FLUSH) { hci_inquiry_cache_flush(hdev); do_inquiry = 1; } hci_dev_unlock(hdev); if (do_inquiry) { hci_req_sync_lock(hdev); err = hci_inquiry_sync(hdev, ir.length, ir.num_rsp); hci_req_sync_unlock(hdev); if (err < 0) goto done; /* Wait until Inquiry 
procedure finishes (HCI_INQUIRY flag is * cleared). If it is interrupted by a signal, return -EINTR. */ if (wait_on_bit(&hdev->flags, HCI_INQUIRY, TASK_INTERRUPTIBLE)) { err = -EINTR; goto done; } } /* for unlimited number of responses we will use buffer with * 255 entries */ max_rsp = (ir.num_rsp == 0) ? 255 : ir.num_rsp; /* cache_dump can't sleep. Therefore we allocate temp buffer and then * copy it to the user space. */ buf = kmalloc_array(max_rsp, sizeof(struct inquiry_info), GFP_KERNEL); if (!buf) { err = -ENOMEM; goto done; } hci_dev_lock(hdev); ir.num_rsp = inquiry_cache_dump(hdev, max_rsp, buf); hci_dev_unlock(hdev); BT_DBG("num_rsp %d", ir.num_rsp); if (!copy_to_user(ptr, &ir, sizeof(ir))) { ptr += sizeof(ir); if (copy_to_user(ptr, buf, sizeof(struct inquiry_info) * ir.num_rsp)) err = -EFAULT; } else err = -EFAULT; kfree(buf); done: hci_dev_put(hdev); return err; } static int hci_dev_do_open(struct hci_dev *hdev) { int ret = 0; BT_DBG("%s %p", hdev->name, hdev); hci_req_sync_lock(hdev); ret = hci_dev_open_sync(hdev); hci_req_sync_unlock(hdev); return ret; } /* ---- HCI ioctl helpers ---- */ int hci_dev_open(__u16 dev) { struct hci_dev *hdev; int err; hdev = hci_dev_get(dev); if (!hdev) return -ENODEV; /* Devices that are marked as unconfigured can only be powered * up as user channel. Trying to bring them up as normal devices * will result into a failure. Only user channel operation is * possible. * * When this function is called for a user channel, the flag * HCI_USER_CHANNEL will be set first before attempting to * open the device. */ if (hci_dev_test_flag(hdev, HCI_UNCONFIGURED) && !hci_dev_test_flag(hdev, HCI_USER_CHANNEL)) { err = -EOPNOTSUPP; goto done; } /* We need to ensure that no other power on/off work is pending * before proceeding to call hci_dev_do_open. This is * particularly important if the setup procedure has not yet * completed. */ if (hci_dev_test_and_clear_flag(hdev, HCI_AUTO_OFF)) cancel_delayed_work(&hdev->power_off); /* After this call it is guaranteed that the setup procedure * has finished. This means that error conditions like RFKILL * or no valid public or static random address apply. */ flush_workqueue(hdev->req_workqueue); /* For controllers not using the management interface and that * are brought up using legacy ioctl, set the HCI_BONDABLE bit * so that pairing works for them. Once the management interface * is in use this bit will be cleared again and userspace has * to explicitly enable it. 
*/ if (!hci_dev_test_flag(hdev, HCI_USER_CHANNEL) && !hci_dev_test_flag(hdev, HCI_MGMT)) hci_dev_set_flag(hdev, HCI_BONDABLE); err = hci_dev_do_open(hdev); done: hci_dev_put(hdev); return err; } int hci_dev_do_close(struct hci_dev *hdev) { int err; BT_DBG("%s %p", hdev->name, hdev); hci_req_sync_lock(hdev); err = hci_dev_close_sync(hdev); hci_req_sync_unlock(hdev); return err; } int hci_dev_close(__u16 dev) { struct hci_dev *hdev; int err; hdev = hci_dev_get(dev); if (!hdev) return -ENODEV; if (hci_dev_test_flag(hdev, HCI_USER_CHANNEL)) { err = -EBUSY; goto done; } cancel_work_sync(&hdev->power_on); if (hci_dev_test_and_clear_flag(hdev, HCI_AUTO_OFF)) cancel_delayed_work(&hdev->power_off); err = hci_dev_do_close(hdev); done: hci_dev_put(hdev); return err; } static int hci_dev_do_reset(struct hci_dev *hdev) { int ret; BT_DBG("%s %p", hdev->name, hdev); hci_req_sync_lock(hdev); /* Drop queues */ skb_queue_purge(&hdev->rx_q); skb_queue_purge(&hdev->cmd_q); /* Cancel these to avoid queueing non-chained pending work */ hci_dev_set_flag(hdev, HCI_CMD_DRAIN_WORKQUEUE); /* Wait for * * if (!hci_dev_test_flag(hdev, HCI_CMD_DRAIN_WORKQUEUE)) * queue_delayed_work(&hdev->{cmd,ncmd}_timer) * * inside RCU section to see the flag or complete scheduling. */ synchronize_rcu(); /* Explicitly cancel works in case scheduled after setting the flag. */ cancel_delayed_work(&hdev->cmd_timer); cancel_delayed_work(&hdev->ncmd_timer); /* Avoid potential lockdep warnings from the *_flush() calls by * ensuring the workqueue is empty up front. */ drain_workqueue(hdev->workqueue); hci_dev_lock(hdev); hci_inquiry_cache_flush(hdev); hci_conn_hash_flush(hdev); hci_dev_unlock(hdev); if (hdev->flush) hdev->flush(hdev); hci_dev_clear_flag(hdev, HCI_CMD_DRAIN_WORKQUEUE); atomic_set(&hdev->cmd_cnt, 1); hdev->acl_cnt = 0; hdev->sco_cnt = 0; hdev->le_cnt = 0; hdev->iso_cnt = 0; ret = hci_reset_sync(hdev); hci_req_sync_unlock(hdev); return ret; } int hci_dev_reset(__u16 dev) { struct hci_dev *hdev; int err, srcu_index; hdev = hci_dev_get_srcu(dev, &srcu_index); if (!hdev) return -ENODEV; if (!test_bit(HCI_UP, &hdev->flags)) { err = -ENETDOWN; goto done; } if (hci_dev_test_flag(hdev, HCI_USER_CHANNEL)) { err = -EBUSY; goto done; } if (hci_dev_test_flag(hdev, HCI_UNCONFIGURED)) { err = -EOPNOTSUPP; goto done; } err = hci_dev_do_reset(hdev); done: hci_dev_put_srcu(hdev, srcu_index); return err; } int hci_dev_reset_stat(__u16 dev) { struct hci_dev *hdev; int ret = 0; hdev = hci_dev_get(dev); if (!hdev) return -ENODEV; if (hci_dev_test_flag(hdev, HCI_USER_CHANNEL)) { ret = -EBUSY; goto done; } if (hci_dev_test_flag(hdev, HCI_UNCONFIGURED)) { ret = -EOPNOTSUPP; goto done; } memset(&hdev->stat, 0, sizeof(struct hci_dev_stats)); done: hci_dev_put(hdev); return ret; } static void hci_update_passive_scan_state(struct hci_dev *hdev, u8 scan) { bool conn_changed, discov_changed; BT_DBG("%s scan 0x%02x", hdev->name, scan); if ((scan & SCAN_PAGE)) conn_changed = !hci_dev_test_and_set_flag(hdev, HCI_CONNECTABLE); else conn_changed = hci_dev_test_and_clear_flag(hdev, HCI_CONNECTABLE); if ((scan & SCAN_INQUIRY)) { discov_changed = !hci_dev_test_and_set_flag(hdev, HCI_DISCOVERABLE); } else { hci_dev_clear_flag(hdev, HCI_LIMITED_DISCOVERABLE); discov_changed = hci_dev_test_and_clear_flag(hdev, HCI_DISCOVERABLE); } if (!hci_dev_test_flag(hdev, HCI_MGMT)) return; if (conn_changed || discov_changed) { /* In case this was disabled through mgmt */ hci_dev_set_flag(hdev, HCI_BREDR_ENABLED); if (hci_dev_test_flag(hdev, HCI_LE_ENABLED)) 
hci_update_adv_data(hdev, hdev->cur_adv_instance); mgmt_new_settings(hdev); } } int hci_dev_cmd(unsigned int cmd, void __user *arg) { struct hci_dev *hdev; struct hci_dev_req dr; __le16 policy; int err = 0; if (copy_from_user(&dr, arg, sizeof(dr))) return -EFAULT; hdev = hci_dev_get(dr.dev_id); if (!hdev) return -ENODEV; if (hci_dev_test_flag(hdev, HCI_USER_CHANNEL)) { err = -EBUSY; goto done; } if (hci_dev_test_flag(hdev, HCI_UNCONFIGURED)) { err = -EOPNOTSUPP; goto done; } if (!hci_dev_test_flag(hdev, HCI_BREDR_ENABLED)) { err = -EOPNOTSUPP; goto done; } switch (cmd) { case HCISETAUTH: err = hci_cmd_sync_status(hdev, HCI_OP_WRITE_AUTH_ENABLE, 1, &dr.dev_opt, HCI_CMD_TIMEOUT); break; case HCISETENCRYPT: if (!lmp_encrypt_capable(hdev)) { err = -EOPNOTSUPP; break; } if (!test_bit(HCI_AUTH, &hdev->flags)) { /* Auth must be enabled first */ err = hci_cmd_sync_status(hdev, HCI_OP_WRITE_AUTH_ENABLE, 1, &dr.dev_opt, HCI_CMD_TIMEOUT); if (err) break; } err = hci_cmd_sync_status(hdev, HCI_OP_WRITE_ENCRYPT_MODE, 1, &dr.dev_opt, HCI_CMD_TIMEOUT); break; case HCISETSCAN: err = hci_cmd_sync_status(hdev, HCI_OP_WRITE_SCAN_ENABLE, 1, &dr.dev_opt, HCI_CMD_TIMEOUT); /* Ensure that the connectable and discoverable states * get correctly modified as this was a non-mgmt change. */ if (!err) hci_update_passive_scan_state(hdev, dr.dev_opt); break; case HCISETLINKPOL: policy = cpu_to_le16(dr.dev_opt); err = hci_cmd_sync_status(hdev, HCI_OP_WRITE_DEF_LINK_POLICY, 2, &policy, HCI_CMD_TIMEOUT); break; case HCISETLINKMODE: hdev->link_mode = ((__u16) dr.dev_opt) & (HCI_LM_MASTER | HCI_LM_ACCEPT); break; case HCISETPTYPE: if (hdev->pkt_type == (__u16) dr.dev_opt) break; hdev->pkt_type = (__u16) dr.dev_opt; mgmt_phy_configuration_changed(hdev, NULL); break; case HCISETACLMTU: hdev->acl_mtu = *((__u16 *) &dr.dev_opt + 1); hdev->acl_pkts = *((__u16 *) &dr.dev_opt + 0); break; case HCISETSCOMTU: hdev->sco_mtu = *((__u16 *) &dr.dev_opt + 1); hdev->sco_pkts = *((__u16 *) &dr.dev_opt + 0); break; default: err = -EINVAL; break; } done: hci_dev_put(hdev); return err; } int hci_get_dev_list(void __user *arg) { struct hci_dev *hdev; struct hci_dev_list_req *dl; struct hci_dev_req *dr; int n = 0, err; __u16 dev_num; if (get_user(dev_num, (__u16 __user *) arg)) return -EFAULT; if (!dev_num || dev_num > (PAGE_SIZE * 2) / sizeof(*dr)) return -EINVAL; dl = kzalloc(struct_size(dl, dev_req, dev_num), GFP_KERNEL); if (!dl) return -ENOMEM; dl->dev_num = dev_num; dr = dl->dev_req; read_lock(&hci_dev_list_lock); list_for_each_entry(hdev, &hci_dev_list, list) { unsigned long flags = hdev->flags; /* When the auto-off is configured it means the transport * is running, but in that case still indicate that the * device is actually down. */ if (hci_dev_test_flag(hdev, HCI_AUTO_OFF)) flags &= ~BIT(HCI_UP); dr[n].dev_id = hdev->id; dr[n].dev_opt = flags; if (++n >= dev_num) break; } read_unlock(&hci_dev_list_lock); dl->dev_num = n; err = copy_to_user(arg, dl, struct_size(dl, dev_req, n)); kfree(dl); return err ? -EFAULT : 0; } int hci_get_dev_info(void __user *arg) { struct hci_dev *hdev; struct hci_dev_info di; unsigned long flags; int err = 0; if (copy_from_user(&di, arg, sizeof(di))) return -EFAULT; hdev = hci_dev_get(di.dev_id); if (!hdev) return -ENODEV; /* When the auto-off is configured it means the transport * is running, but in that case still indicate that the * device is actually down. 
*/ if (hci_dev_test_flag(hdev, HCI_AUTO_OFF)) flags = hdev->flags & ~BIT(HCI_UP); else flags = hdev->flags; strscpy(di.name, hdev->name, sizeof(di.name)); di.bdaddr = hdev->bdaddr; di.type = (hdev->bus & 0x0f); di.flags = flags; di.pkt_type = hdev->pkt_type; if (lmp_bredr_capable(hdev)) { di.acl_mtu = hdev->acl_mtu; di.acl_pkts = hdev->acl_pkts; di.sco_mtu = hdev->sco_mtu; di.sco_pkts = hdev->sco_pkts; } else { di.acl_mtu = hdev->le_mtu; di.acl_pkts = hdev->le_pkts; di.sco_mtu = 0; di.sco_pkts = 0; } di.link_policy = hdev->link_policy; di.link_mode = hdev->link_mode; memcpy(&di.stat, &hdev->stat, sizeof(di.stat)); memcpy(&di.features, &hdev->features, sizeof(di.features)); if (copy_to_user(arg, &di, sizeof(di))) err = -EFAULT; hci_dev_put(hdev); return err; } /* ---- Interface to HCI drivers ---- */ static int hci_dev_do_poweroff(struct hci_dev *hdev) { int err; BT_DBG("%s %p", hdev->name, hdev); hci_req_sync_lock(hdev); err = hci_set_powered_sync(hdev, false); hci_req_sync_unlock(hdev); return err; } static int hci_rfkill_set_block(void *data, bool blocked) { struct hci_dev *hdev = data; int err; BT_DBG("%p name %s blocked %d", hdev, hdev->name, blocked); if (hci_dev_test_flag(hdev, HCI_USER_CHANNEL)) return -EBUSY; if (blocked == hci_dev_test_flag(hdev, HCI_RFKILLED)) return 0; if (blocked) { hci_dev_set_flag(hdev, HCI_RFKILLED); if (!hci_dev_test_flag(hdev, HCI_SETUP) && !hci_dev_test_flag(hdev, HCI_CONFIG)) { err = hci_dev_do_poweroff(hdev); if (err) { bt_dev_err(hdev, "Error when powering off device on rfkill (%d)", err); /* Make sure the device is still closed even if * anything during power off sequence (eg. * disconnecting devices) failed. */ hci_dev_do_close(hdev); } } } else { hci_dev_clear_flag(hdev, HCI_RFKILLED); } return 0; } static const struct rfkill_ops hci_rfkill_ops = { .set_block = hci_rfkill_set_block, }; static void hci_power_on(struct work_struct *work) { struct hci_dev *hdev = container_of(work, struct hci_dev, power_on); int err; BT_DBG("%s", hdev->name); if (test_bit(HCI_UP, &hdev->flags) && hci_dev_test_flag(hdev, HCI_MGMT) && hci_dev_test_and_clear_flag(hdev, HCI_AUTO_OFF)) { cancel_delayed_work(&hdev->power_off); err = hci_powered_update_sync(hdev); mgmt_power_on(hdev, err); return; } err = hci_dev_do_open(hdev); if (err < 0) { hci_dev_lock(hdev); mgmt_set_powered_failed(hdev, err); hci_dev_unlock(hdev); return; } /* During the HCI setup phase, a few error conditions are * ignored and they need to be checked now. If they are still * valid, it is important to turn the device back off. */ if (hci_dev_test_flag(hdev, HCI_RFKILLED) || hci_dev_test_flag(hdev, HCI_UNCONFIGURED) || (!bacmp(&hdev->bdaddr, BDADDR_ANY) && !bacmp(&hdev->static_addr, BDADDR_ANY))) { hci_dev_clear_flag(hdev, HCI_AUTO_OFF); hci_dev_do_close(hdev); } else if (hci_dev_test_flag(hdev, HCI_AUTO_OFF)) { queue_delayed_work(hdev->req_workqueue, &hdev->power_off, HCI_AUTO_OFF_TIMEOUT); } if (hci_dev_test_and_clear_flag(hdev, HCI_SETUP)) { /* For unconfigured devices, set the HCI_RAW flag * so that userspace can easily identify them. */ if (hci_dev_test_flag(hdev, HCI_UNCONFIGURED)) set_bit(HCI_RAW, &hdev->flags); /* For fully configured devices, this will send * the Index Added event. For unconfigured devices, * it will send Unconfigued Index Added event. * * Devices with HCI_QUIRK_RAW_DEVICE are ignored * and no event will be send. 
*/ mgmt_index_added(hdev); } else if (hci_dev_test_and_clear_flag(hdev, HCI_CONFIG)) { /* When the controller is now configured, then it * is important to clear the HCI_RAW flag. */ if (!hci_dev_test_flag(hdev, HCI_UNCONFIGURED)) clear_bit(HCI_RAW, &hdev->flags); /* Powering on the controller with HCI_CONFIG set only * happens with the transition from unconfigured to * configured. This will send the Index Added event. */ mgmt_index_added(hdev); } } static void hci_power_off(struct work_struct *work) { struct hci_dev *hdev = container_of(work, struct hci_dev, power_off.work); BT_DBG("%s", hdev->name); hci_dev_do_close(hdev); } static void hci_error_reset(struct work_struct *work) { struct hci_dev *hdev = container_of(work, struct hci_dev, error_reset); hci_dev_hold(hdev); BT_DBG("%s", hdev->name); if (hdev->hw_error) hdev->hw_error(hdev, hdev->hw_error_code); else bt_dev_err(hdev, "hardware error 0x%2.2x", hdev->hw_error_code); if (!hci_dev_do_close(hdev)) hci_dev_do_open(hdev); hci_dev_put(hdev); } void hci_uuids_clear(struct hci_dev *hdev) { struct bt_uuid *uuid, *tmp; list_for_each_entry_safe(uuid, tmp, &hdev->uuids, list) { list_del(&uuid->list); kfree(uuid); } } void hci_link_keys_clear(struct hci_dev *hdev) { struct link_key *key, *tmp; list_for_each_entry_safe(key, tmp, &hdev->link_keys, list) { list_del_rcu(&key->list); kfree_rcu(key, rcu); } } void hci_smp_ltks_clear(struct hci_dev *hdev) { struct smp_ltk *k, *tmp; list_for_each_entry_safe(k, tmp, &hdev->long_term_keys, list) { list_del_rcu(&k->list); kfree_rcu(k, rcu); } } void hci_smp_irks_clear(struct hci_dev *hdev) { struct smp_irk *k, *tmp; list_for_each_entry_safe(k, tmp, &hdev->identity_resolving_keys, list) { list_del_rcu(&k->list); kfree_rcu(k, rcu); } } void hci_blocked_keys_clear(struct hci_dev *hdev) { struct blocked_key *b, *tmp; list_for_each_entry_safe(b, tmp, &hdev->blocked_keys, list) { list_del_rcu(&b->list); kfree_rcu(b, rcu); } } bool hci_is_blocked_key(struct hci_dev *hdev, u8 type, u8 val[16]) { bool blocked = false; struct blocked_key *b; rcu_read_lock(); list_for_each_entry_rcu(b, &hdev->blocked_keys, list) { if (b->type == type && !memcmp(b->val, val, sizeof(b->val))) { blocked = true; break; } } rcu_read_unlock(); return blocked; } struct link_key *hci_find_link_key(struct hci_dev *hdev, bdaddr_t *bdaddr) { struct link_key *k; rcu_read_lock(); list_for_each_entry_rcu(k, &hdev->link_keys, list) { if (bacmp(bdaddr, &k->bdaddr) == 0) { rcu_read_unlock(); if (hci_is_blocked_key(hdev, HCI_BLOCKED_KEY_TYPE_LINKKEY, k->val)) { bt_dev_warn_ratelimited(hdev, "Link key blocked for %pMR", &k->bdaddr); return NULL; } return k; } } rcu_read_unlock(); return NULL; } static bool hci_persistent_key(struct hci_dev *hdev, struct hci_conn *conn, u8 key_type, u8 old_key_type) { /* Legacy key */ if (key_type < 0x03) return true; /* Debug keys are insecure so don't store them persistently */ if (key_type == HCI_LK_DEBUG_COMBINATION) return false; /* Changed combination key and there's no previous one */ if (key_type == HCI_LK_CHANGED_COMBINATION && old_key_type == 0xff) return false; /* Security mode 3 case */ if (!conn) return true; /* BR/EDR key derived using SC from an LE link */ if (conn->type == LE_LINK) return true; /* Neither local nor remote side had no-bonding as requirement */ if (conn->auth_type > 0x01 && conn->remote_auth > 0x01) return true; /* Local side had dedicated bonding as requirement */ if (conn->auth_type == 0x02 || conn->auth_type == 0x03) return true; /* Remote side had dedicated bonding as requirement 
*/ if (conn->remote_auth == 0x02 || conn->remote_auth == 0x03) return true; /* If none of the above criteria match, then don't store the key * persistently */ return false; } static u8 ltk_role(u8 type) { if (type == SMP_LTK) return HCI_ROLE_MASTER; return HCI_ROLE_SLAVE; } struct smp_ltk *hci_find_ltk(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 addr_type, u8 role) { struct smp_ltk *k; rcu_read_lock(); list_for_each_entry_rcu(k, &hdev->long_term_keys, list) { if (addr_type != k->bdaddr_type || bacmp(bdaddr, &k->bdaddr)) continue; if (smp_ltk_is_sc(k) || ltk_role(k->type) == role) { rcu_read_unlock(); if (hci_is_blocked_key(hdev, HCI_BLOCKED_KEY_TYPE_LTK, k->val)) { bt_dev_warn_ratelimited(hdev, "LTK blocked for %pMR", &k->bdaddr); return NULL; } return k; } } rcu_read_unlock(); return NULL; } struct smp_irk *hci_find_irk_by_rpa(struct hci_dev *hdev, bdaddr_t *rpa) { struct smp_irk *irk_to_return = NULL; struct smp_irk *irk; rcu_read_lock(); list_for_each_entry_rcu(irk, &hdev->identity_resolving_keys, list) { if (!bacmp(&irk->rpa, rpa)) { irk_to_return = irk; goto done; } } list_for_each_entry_rcu(irk, &hdev->identity_resolving_keys, list) { if (smp_irk_matches(hdev, irk->val, rpa)) { bacpy(&irk->rpa, rpa); irk_to_return = irk; goto done; } } done: if (irk_to_return && hci_is_blocked_key(hdev, HCI_BLOCKED_KEY_TYPE_IRK, irk_to_return->val)) { bt_dev_warn_ratelimited(hdev, "Identity key blocked for %pMR", &irk_to_return->bdaddr); irk_to_return = NULL; } rcu_read_unlock(); return irk_to_return; } struct smp_irk *hci_find_irk_by_addr(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 addr_type) { struct smp_irk *irk_to_return = NULL; struct smp_irk *irk; /* Identity Address must be public or static random */ if (addr_type == ADDR_LE_DEV_RANDOM && (bdaddr->b[5] & 0xc0) != 0xc0) return NULL; rcu_read_lock(); list_for_each_entry_rcu(irk, &hdev->identity_resolving_keys, list) { if (addr_type == irk->addr_type && bacmp(bdaddr, &irk->bdaddr) == 0) { irk_to_return = irk; break; } } if (irk_to_return && hci_is_blocked_key(hdev, HCI_BLOCKED_KEY_TYPE_IRK, irk_to_return->val)) { bt_dev_warn_ratelimited(hdev, "Identity key blocked for %pMR", &irk_to_return->bdaddr); irk_to_return = NULL; } rcu_read_unlock(); return irk_to_return; } struct link_key *hci_add_link_key(struct hci_dev *hdev, struct hci_conn *conn, bdaddr_t *bdaddr, u8 *val, u8 type, u8 pin_len, bool *persistent) { struct link_key *key, *old_key; u8 old_key_type; old_key = hci_find_link_key(hdev, bdaddr); if (old_key) { old_key_type = old_key->type; key = old_key; } else { old_key_type = conn ? 
conn->key_type : 0xff; key = kzalloc(sizeof(*key), GFP_KERNEL); if (!key) return NULL; list_add_rcu(&key->list, &hdev->link_keys); } BT_DBG("%s key for %pMR type %u", hdev->name, bdaddr, type); /* Some buggy controller combinations generate a changed * combination key for legacy pairing even when there's no * previous key */ if (type == HCI_LK_CHANGED_COMBINATION && (!conn || conn->remote_auth == 0xff) && old_key_type == 0xff) { type = HCI_LK_COMBINATION; if (conn) conn->key_type = type; } bacpy(&key->bdaddr, bdaddr); memcpy(key->val, val, HCI_LINK_KEY_SIZE); key->pin_len = pin_len; if (type == HCI_LK_CHANGED_COMBINATION) key->type = old_key_type; else key->type = type; if (persistent) *persistent = hci_persistent_key(hdev, conn, type, old_key_type); return key; } struct smp_ltk *hci_add_ltk(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 addr_type, u8 type, u8 authenticated, u8 tk[16], u8 enc_size, __le16 ediv, __le64 rand) { struct smp_ltk *key, *old_key; u8 role = ltk_role(type); old_key = hci_find_ltk(hdev, bdaddr, addr_type, role); if (old_key) key = old_key; else { key = kzalloc(sizeof(*key), GFP_KERNEL); if (!key) return NULL; list_add_rcu(&key->list, &hdev->long_term_keys); } bacpy(&key->bdaddr, bdaddr); key->bdaddr_type = addr_type; memcpy(key->val, tk, sizeof(key->val)); key->authenticated = authenticated; key->ediv = ediv; key->rand = rand; key->enc_size = enc_size; key->type = type; return key; } struct smp_irk *hci_add_irk(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 addr_type, u8 val[16], bdaddr_t *rpa) { struct smp_irk *irk; irk = hci_find_irk_by_addr(hdev, bdaddr, addr_type); if (!irk) { irk = kzalloc(sizeof(*irk), GFP_KERNEL); if (!irk) return NULL; bacpy(&irk->bdaddr, bdaddr); irk->addr_type = addr_type; list_add_rcu(&irk->list, &hdev->identity_resolving_keys); } memcpy(irk->val, val, 16); bacpy(&irk->rpa, rpa); return irk; } int hci_remove_link_key(struct hci_dev *hdev, bdaddr_t *bdaddr) { struct link_key *key; key = hci_find_link_key(hdev, bdaddr); if (!key) return -ENOENT; BT_DBG("%s removing %pMR", hdev->name, bdaddr); list_del_rcu(&key->list); kfree_rcu(key, rcu); return 0; } int hci_remove_ltk(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 bdaddr_type) { struct smp_ltk *k, *tmp; int removed = 0; list_for_each_entry_safe(k, tmp, &hdev->long_term_keys, list) { if (bacmp(bdaddr, &k->bdaddr) || k->bdaddr_type != bdaddr_type) continue; BT_DBG("%s removing %pMR", hdev->name, bdaddr); list_del_rcu(&k->list); kfree_rcu(k, rcu); removed++; } return removed ? 
0 : -ENOENT; } void hci_remove_irk(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 addr_type) { struct smp_irk *k, *tmp; list_for_each_entry_safe(k, tmp, &hdev->identity_resolving_keys, list) { if (bacmp(bdaddr, &k->bdaddr) || k->addr_type != addr_type) continue; BT_DBG("%s removing %pMR", hdev->name, bdaddr); list_del_rcu(&k->list); kfree_rcu(k, rcu); } } bool hci_bdaddr_is_paired(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 type) { struct smp_ltk *k; struct smp_irk *irk; u8 addr_type; if (type == BDADDR_BREDR) { if (hci_find_link_key(hdev, bdaddr)) return true; return false; } /* Convert to HCI addr type which struct smp_ltk uses */ if (type == BDADDR_LE_PUBLIC) addr_type = ADDR_LE_DEV_PUBLIC; else addr_type = ADDR_LE_DEV_RANDOM; irk = hci_get_irk(hdev, bdaddr, addr_type); if (irk) { bdaddr = &irk->bdaddr; addr_type = irk->addr_type; } rcu_read_lock(); list_for_each_entry_rcu(k, &hdev->long_term_keys, list) { if (k->bdaddr_type == addr_type && !bacmp(bdaddr, &k->bdaddr)) { rcu_read_unlock(); return true; } } rcu_read_unlock(); return false; } /* HCI command timer function */ static void hci_cmd_timeout(struct work_struct *work) { struct hci_dev *hdev = container_of(work, struct hci_dev, cmd_timer.work); if (hdev->req_skb) { u16 opcode = hci_skb_opcode(hdev->req_skb); bt_dev_err(hdev, "command 0x%4.4x tx timeout", opcode); hci_cmd_sync_cancel_sync(hdev, ETIMEDOUT); } else { bt_dev_err(hdev, "command tx timeout"); } if (hdev->reset) hdev->reset(hdev); atomic_set(&hdev->cmd_cnt, 1); queue_work(hdev->workqueue, &hdev->cmd_work); } /* HCI ncmd timer function */ static void hci_ncmd_timeout(struct work_struct *work) { struct hci_dev *hdev = container_of(work, struct hci_dev, ncmd_timer.work); bt_dev_err(hdev, "Controller not accepting commands anymore: ncmd = 0"); /* During HCI_INIT phase no events can be injected if the ncmd timer * triggers since the procedure has its own timeout handling. 
*/ if (test_bit(HCI_INIT, &hdev->flags)) return; /* This is an irrecoverable state, inject hardware error event */ hci_reset_dev(hdev); } struct oob_data *hci_find_remote_oob_data(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 bdaddr_type) { struct oob_data *data; list_for_each_entry(data, &hdev->remote_oob_data, list) { if (bacmp(bdaddr, &data->bdaddr) != 0) continue; if (data->bdaddr_type != bdaddr_type) continue; return data; } return NULL; } int hci_remove_remote_oob_data(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 bdaddr_type) { struct oob_data *data; data = hci_find_remote_oob_data(hdev, bdaddr, bdaddr_type); if (!data) return -ENOENT; BT_DBG("%s removing %pMR (%u)", hdev->name, bdaddr, bdaddr_type); list_del(&data->list); kfree(data); return 0; } void hci_remote_oob_data_clear(struct hci_dev *hdev) { struct oob_data *data, *n; list_for_each_entry_safe(data, n, &hdev->remote_oob_data, list) { list_del(&data->list); kfree(data); } } int hci_add_remote_oob_data(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 bdaddr_type, u8 *hash192, u8 *rand192, u8 *hash256, u8 *rand256) { struct oob_data *data; data = hci_find_remote_oob_data(hdev, bdaddr, bdaddr_type); if (!data) { data = kmalloc(sizeof(*data), GFP_KERNEL); if (!data) return -ENOMEM; bacpy(&data->bdaddr, bdaddr); data->bdaddr_type = bdaddr_type; list_add(&data->list, &hdev->remote_oob_data); } if (hash192 && rand192) { memcpy(data->hash192, hash192, sizeof(data->hash192)); memcpy(data->rand192, rand192, sizeof(data->rand192)); if (hash256 && rand256) data->present = 0x03; } else { memset(data->hash192, 0, sizeof(data->hash192)); memset(data->rand192, 0, sizeof(data->rand192)); if (hash256 && rand256) data->present = 0x02; else data->present = 0x00; } if (hash256 && rand256) { memcpy(data->hash256, hash256, sizeof(data->hash256)); memcpy(data->rand256, rand256, sizeof(data->rand256)); } else { memset(data->hash256, 0, sizeof(data->hash256)); memset(data->rand256, 0, sizeof(data->rand256)); if (hash192 && rand192) data->present = 0x01; } BT_DBG("%s for %pMR", hdev->name, bdaddr); return 0; } /* This function requires the caller holds hdev->lock */ struct adv_info *hci_find_adv_instance(struct hci_dev *hdev, u8 instance) { struct adv_info *adv_instance; list_for_each_entry(adv_instance, &hdev->adv_instances, list) { if (adv_instance->instance == instance) return adv_instance; } return NULL; } /* This function requires the caller holds hdev->lock */ struct adv_info *hci_find_adv_sid(struct hci_dev *hdev, u8 sid) { struct adv_info *adv; list_for_each_entry(adv, &hdev->adv_instances, list) { if (adv->sid == sid) return adv; } return NULL; } /* This function requires the caller holds hdev->lock */ struct adv_info *hci_get_next_instance(struct hci_dev *hdev, u8 instance) { struct adv_info *cur_instance; cur_instance = hci_find_adv_instance(hdev, instance); if (!cur_instance) return NULL; if (cur_instance == list_last_entry(&hdev->adv_instances, struct adv_info, list)) return list_first_entry(&hdev->adv_instances, struct adv_info, list); else return list_next_entry(cur_instance, list); } /* This function requires the caller holds hdev->lock */ int hci_remove_adv_instance(struct hci_dev *hdev, u8 instance) { struct adv_info *adv_instance; adv_instance = hci_find_adv_instance(hdev, instance); if (!adv_instance) return -ENOENT; BT_DBG("%s removing %dMR", hdev->name, instance); if (hdev->cur_adv_instance == instance) { if (hdev->adv_instance_timeout) { cancel_delayed_work(&hdev->adv_instance_expire); hdev->adv_instance_timeout = 0; } 
hdev->cur_adv_instance = 0x00; } cancel_delayed_work_sync(&adv_instance->rpa_expired_cb); list_del(&adv_instance->list); kfree(adv_instance); hdev->adv_instance_cnt--; return 0; } void hci_adv_instances_set_rpa_expired(struct hci_dev *hdev, bool rpa_expired) { struct adv_info *adv_instance, *n; list_for_each_entry_safe(adv_instance, n, &hdev->adv_instances, list) adv_instance->rpa_expired = rpa_expired; } /* This function requires the caller holds hdev->lock */ void hci_adv_instances_clear(struct hci_dev *hdev) { struct adv_info *adv_instance, *n; if (hdev->adv_instance_timeout) { disable_delayed_work(&hdev->adv_instance_expire); hdev->adv_instance_timeout = 0; } list_for_each_entry_safe(adv_instance, n, &hdev->adv_instances, list) { disable_delayed_work_sync(&adv_instance->rpa_expired_cb); list_del(&adv_instance->list); kfree(adv_instance); } hdev->adv_instance_cnt = 0; hdev->cur_adv_instance = 0x00; } static void adv_instance_rpa_expired(struct work_struct *work) { struct adv_info *adv_instance = container_of(work, struct adv_info, rpa_expired_cb.work); BT_DBG(""); adv_instance->rpa_expired = true; } /* This function requires the caller holds hdev->lock */ struct adv_info *hci_add_adv_instance(struct hci_dev *hdev, u8 instance, u32 flags, u16 adv_data_len, u8 *adv_data, u16 scan_rsp_len, u8 *scan_rsp_data, u16 timeout, u16 duration, s8 tx_power, u32 min_interval, u32 max_interval, u8 mesh_handle) { struct adv_info *adv; adv = hci_find_adv_instance(hdev, instance); if (adv) { memset(adv->adv_data, 0, sizeof(adv->adv_data)); memset(adv->scan_rsp_data, 0, sizeof(adv->scan_rsp_data)); memset(adv->per_adv_data, 0, sizeof(adv->per_adv_data)); } else { if (hdev->adv_instance_cnt >= hdev->le_num_of_adv_sets || instance < 1 || instance > hdev->le_num_of_adv_sets + 1) return ERR_PTR(-EOVERFLOW); adv = kzalloc(sizeof(*adv), GFP_KERNEL); if (!adv) return ERR_PTR(-ENOMEM); adv->pending = true; adv->instance = instance; /* If controller support only one set and the instance is set to * 1 then there is no option other than using handle 0x00. */ if (hdev->le_num_of_adv_sets == 1 && instance == 1) adv->handle = 0x00; else adv->handle = instance; list_add(&adv->list, &hdev->adv_instances); hdev->adv_instance_cnt++; } adv->flags = flags; adv->min_interval = min_interval; adv->max_interval = max_interval; adv->tx_power = tx_power; /* Defining a mesh_handle changes the timing units to ms, * rather than seconds, and ties the instance to the requested * mesh_tx queue. 
*/ adv->mesh = mesh_handle; hci_set_adv_instance_data(hdev, instance, adv_data_len, adv_data, scan_rsp_len, scan_rsp_data); adv->timeout = timeout; adv->remaining_time = timeout; if (duration == 0) adv->duration = hdev->def_multi_adv_rotation_duration; else adv->duration = duration; INIT_DELAYED_WORK(&adv->rpa_expired_cb, adv_instance_rpa_expired); BT_DBG("%s for %dMR", hdev->name, instance); return adv; } /* This function requires the caller holds hdev->lock */ struct adv_info *hci_add_per_instance(struct hci_dev *hdev, u8 instance, u8 sid, u32 flags, u8 data_len, u8 *data, u32 min_interval, u32 max_interval) { struct adv_info *adv; adv = hci_add_adv_instance(hdev, instance, flags, 0, NULL, 0, NULL, 0, 0, HCI_ADV_TX_POWER_NO_PREFERENCE, min_interval, max_interval, 0); if (IS_ERR(adv)) return adv; adv->sid = sid; adv->periodic = true; adv->per_adv_data_len = data_len; if (data) memcpy(adv->per_adv_data, data, data_len); return adv; } /* This function requires the caller holds hdev->lock */ int hci_set_adv_instance_data(struct hci_dev *hdev, u8 instance, u16 adv_data_len, u8 *adv_data, u16 scan_rsp_len, u8 *scan_rsp_data) { struct adv_info *adv; adv = hci_find_adv_instance(hdev, instance); /* If advertisement doesn't exist, we can't modify its data */ if (!adv) return -ENOENT; if (adv_data_len && ADV_DATA_CMP(adv, adv_data, adv_data_len)) { memset(adv->adv_data, 0, sizeof(adv->adv_data)); memcpy(adv->adv_data, adv_data, adv_data_len); adv->adv_data_len = adv_data_len; adv->adv_data_changed = true; } if (scan_rsp_len && SCAN_RSP_CMP(adv, scan_rsp_data, scan_rsp_len)) { memset(adv->scan_rsp_data, 0, sizeof(adv->scan_rsp_data)); memcpy(adv->scan_rsp_data, scan_rsp_data, scan_rsp_len); adv->scan_rsp_len = scan_rsp_len; adv->scan_rsp_changed = true; } /* Mark as changed if there are flags which would affect it */ if (((adv->flags & MGMT_ADV_FLAG_APPEARANCE) && hdev->appearance) || adv->flags & MGMT_ADV_FLAG_LOCAL_NAME) adv->scan_rsp_changed = true; return 0; } /* This function requires the caller holds hdev->lock */ u32 hci_adv_instance_flags(struct hci_dev *hdev, u8 instance) { u32 flags; struct adv_info *adv; if (instance == 0x00) { /* Instance 0 always manages the "Tx Power" and "Flags" * fields */ flags = MGMT_ADV_FLAG_TX_POWER | MGMT_ADV_FLAG_MANAGED_FLAGS; /* For instance 0, the HCI_ADVERTISING_CONNECTABLE setting * corresponds to the "connectable" instance flag. */ if (hci_dev_test_flag(hdev, HCI_ADVERTISING_CONNECTABLE)) flags |= MGMT_ADV_FLAG_CONNECTABLE; if (hci_dev_test_flag(hdev, HCI_LIMITED_DISCOVERABLE)) flags |= MGMT_ADV_FLAG_LIMITED_DISCOV; else if (hci_dev_test_flag(hdev, HCI_DISCOVERABLE)) flags |= MGMT_ADV_FLAG_DISCOV; return flags; } adv = hci_find_adv_instance(hdev, instance); /* Return 0 when we got an invalid instance identifier. */ if (!adv) return 0; return adv->flags; } bool hci_adv_instance_is_scannable(struct hci_dev *hdev, u8 instance) { struct adv_info *adv; /* Instance 0x00 always set local name */ if (instance == 0x00) return true; adv = hci_find_adv_instance(hdev, instance); if (!adv) return false; if (adv->flags & MGMT_ADV_FLAG_APPEARANCE || adv->flags & MGMT_ADV_FLAG_LOCAL_NAME) return true; return adv->scan_rsp_len ? 
true : false; } /* This function requires the caller holds hdev->lock */ void hci_adv_monitors_clear(struct hci_dev *hdev) { struct adv_monitor *monitor; int handle; idr_for_each_entry(&hdev->adv_monitors_idr, monitor, handle) hci_free_adv_monitor(hdev, monitor); idr_destroy(&hdev->adv_monitors_idr); } /* Frees the monitor structure and do some bookkeepings. * This function requires the caller holds hdev->lock. */ void hci_free_adv_monitor(struct hci_dev *hdev, struct adv_monitor *monitor) { struct adv_pattern *pattern; struct adv_pattern *tmp; if (!monitor) return; list_for_each_entry_safe(pattern, tmp, &monitor->patterns, list) { list_del(&pattern->list); kfree(pattern); } if (monitor->handle) idr_remove(&hdev->adv_monitors_idr, monitor->handle); if (monitor->state != ADV_MONITOR_STATE_NOT_REGISTERED) hdev->adv_monitors_cnt--; kfree(monitor); } /* Assigns handle to a monitor, and if offloading is supported and power is on, * also attempts to forward the request to the controller. * This function requires the caller holds hci_req_sync_lock. */ int hci_add_adv_monitor(struct hci_dev *hdev, struct adv_monitor *monitor) { int min, max, handle; int status = 0; if (!monitor) return -EINVAL; hci_dev_lock(hdev); min = HCI_MIN_ADV_MONITOR_HANDLE; max = HCI_MIN_ADV_MONITOR_HANDLE + HCI_MAX_ADV_MONITOR_NUM_HANDLES; handle = idr_alloc(&hdev->adv_monitors_idr, monitor, min, max, GFP_KERNEL); hci_dev_unlock(hdev); if (handle < 0) return handle; monitor->handle = handle; if (!hdev_is_powered(hdev)) return status; switch (hci_get_adv_monitor_offload_ext(hdev)) { case HCI_ADV_MONITOR_EXT_NONE: bt_dev_dbg(hdev, "add monitor %d status %d", monitor->handle, status); /* Message was not forwarded to controller - not an error */ break; case HCI_ADV_MONITOR_EXT_MSFT: status = msft_add_monitor_pattern(hdev, monitor); bt_dev_dbg(hdev, "add monitor %d msft status %d", handle, status); break; } return status; } /* Attempts to tell the controller and free the monitor. If somehow the * controller doesn't have a corresponding handle, remove anyway. * This function requires the caller holds hci_req_sync_lock. 
*/ static int hci_remove_adv_monitor(struct hci_dev *hdev, struct adv_monitor *monitor) { int status = 0; int handle; switch (hci_get_adv_monitor_offload_ext(hdev)) { case HCI_ADV_MONITOR_EXT_NONE: /* also goes here when powered off */ bt_dev_dbg(hdev, "remove monitor %d status %d", monitor->handle, status); goto free_monitor; case HCI_ADV_MONITOR_EXT_MSFT: handle = monitor->handle; status = msft_remove_monitor(hdev, monitor); bt_dev_dbg(hdev, "remove monitor %d msft status %d", handle, status); break; } /* In case no matching handle registered, just free the monitor */ if (status == -ENOENT) goto free_monitor; return status; free_monitor: if (status == -ENOENT) bt_dev_warn(hdev, "Removing monitor with no matching handle %d", monitor->handle); hci_free_adv_monitor(hdev, monitor); return status; } /* This function requires the caller holds hci_req_sync_lock */ int hci_remove_single_adv_monitor(struct hci_dev *hdev, u16 handle) { struct adv_monitor *monitor = idr_find(&hdev->adv_monitors_idr, handle); if (!monitor) return -EINVAL; return hci_remove_adv_monitor(hdev, monitor); } /* This function requires the caller holds hci_req_sync_lock */ int hci_remove_all_adv_monitor(struct hci_dev *hdev) { struct adv_monitor *monitor; int idr_next_id = 0; int status = 0; while (1) { monitor = idr_get_next(&hdev->adv_monitors_idr, &idr_next_id); if (!monitor) break; status = hci_remove_adv_monitor(hdev, monitor); if (status) return status; idr_next_id++; } return status; } /* This function requires the caller holds hdev->lock */ bool hci_is_adv_monitoring(struct hci_dev *hdev) { return !idr_is_empty(&hdev->adv_monitors_idr); } int hci_get_adv_monitor_offload_ext(struct hci_dev *hdev) { if (msft_monitor_supported(hdev)) return HCI_ADV_MONITOR_EXT_MSFT; return HCI_ADV_MONITOR_EXT_NONE; } struct bdaddr_list *hci_bdaddr_list_lookup(struct list_head *bdaddr_list, bdaddr_t *bdaddr, u8 type) { struct bdaddr_list *b; list_for_each_entry(b, bdaddr_list, list) { if (!bacmp(&b->bdaddr, bdaddr) && b->bdaddr_type == type) return b; } return NULL; } struct bdaddr_list_with_irk *hci_bdaddr_list_lookup_with_irk( struct list_head *bdaddr_list, bdaddr_t *bdaddr, u8 type) { struct bdaddr_list_with_irk *b; list_for_each_entry(b, bdaddr_list, list) { if (!bacmp(&b->bdaddr, bdaddr) && b->bdaddr_type == type) return b; } return NULL; } struct bdaddr_list_with_flags * hci_bdaddr_list_lookup_with_flags(struct list_head *bdaddr_list, bdaddr_t *bdaddr, u8 type) { struct bdaddr_list_with_flags *b; list_for_each_entry(b, bdaddr_list, list) { if (!bacmp(&b->bdaddr, bdaddr) && b->bdaddr_type == type) return b; } return NULL; } void hci_bdaddr_list_clear(struct list_head *bdaddr_list) { struct bdaddr_list *b, *n; list_for_each_entry_safe(b, n, bdaddr_list, list) { list_del(&b->list); kfree(b); } } int hci_bdaddr_list_add(struct list_head *list, bdaddr_t *bdaddr, u8 type) { struct bdaddr_list *entry; if (!bacmp(bdaddr, BDADDR_ANY)) return -EBADF; if (hci_bdaddr_list_lookup(list, bdaddr, type)) return -EEXIST; entry = kzalloc(sizeof(*entry), GFP_KERNEL); if (!entry) return -ENOMEM; bacpy(&entry->bdaddr, bdaddr); entry->bdaddr_type = type; list_add(&entry->list, list); return 0; } int hci_bdaddr_list_add_with_irk(struct list_head *list, bdaddr_t *bdaddr, u8 type, u8 *peer_irk, u8 *local_irk) { struct bdaddr_list_with_irk *entry; if (!bacmp(bdaddr, BDADDR_ANY)) return -EBADF; if (hci_bdaddr_list_lookup(list, bdaddr, type)) return -EEXIST; entry = kzalloc(sizeof(*entry), GFP_KERNEL); if (!entry) return -ENOMEM; bacpy(&entry->bdaddr, 
bdaddr); entry->bdaddr_type = type; if (peer_irk) memcpy(entry->peer_irk, peer_irk, 16); if (local_irk) memcpy(entry->local_irk, local_irk, 16); list_add(&entry->list, list); return 0; } int hci_bdaddr_list_add_with_flags(struct list_head *list, bdaddr_t *bdaddr, u8 type, u32 flags) { struct bdaddr_list_with_flags *entry; if (!bacmp(bdaddr, BDADDR_ANY)) return -EBADF; if (hci_bdaddr_list_lookup(list, bdaddr, type)) return -EEXIST; entry = kzalloc(sizeof(*entry), GFP_KERNEL); if (!entry) return -ENOMEM; bacpy(&entry->bdaddr, bdaddr); entry->bdaddr_type = type; entry->flags = flags; list_add(&entry->list, list); return 0; } int hci_bdaddr_list_del(struct list_head *list, bdaddr_t *bdaddr, u8 type) { struct bdaddr_list *entry; if (!bacmp(bdaddr, BDADDR_ANY)) { hci_bdaddr_list_clear(list); return 0; } entry = hci_bdaddr_list_lookup(list, bdaddr, type); if (!entry) return -ENOENT; list_del(&entry->list); kfree(entry); return 0; } int hci_bdaddr_list_del_with_irk(struct list_head *list, bdaddr_t *bdaddr, u8 type) { struct bdaddr_list_with_irk *entry; if (!bacmp(bdaddr, BDADDR_ANY)) { hci_bdaddr_list_clear(list); return 0; } entry = hci_bdaddr_list_lookup_with_irk(list, bdaddr, type); if (!entry) return -ENOENT; list_del(&entry->list); kfree(entry); return 0; } /* This function requires the caller holds hdev->lock */ struct hci_conn_params *hci_conn_params_lookup(struct hci_dev *hdev, bdaddr_t *addr, u8 addr_type) { struct hci_conn_params *params; list_for_each_entry(params, &hdev->le_conn_params, list) { if (bacmp(&params->addr, addr) == 0 && params->addr_type == addr_type) { return params; } } return NULL; } /* This function requires the caller holds hdev->lock or rcu_read_lock */ struct hci_conn_params *hci_pend_le_action_lookup(struct list_head *list, bdaddr_t *addr, u8 addr_type) { struct hci_conn_params *param; rcu_read_lock(); list_for_each_entry_rcu(param, list, action) { if (bacmp(&param->addr, addr) == 0 && param->addr_type == addr_type) { rcu_read_unlock(); return param; } } rcu_read_unlock(); return NULL; } /* This function requires the caller holds hdev->lock */ void hci_pend_le_list_del_init(struct hci_conn_params *param) { if (list_empty(&param->action)) return; list_del_rcu(&param->action); synchronize_rcu(); INIT_LIST_HEAD(&param->action); } /* This function requires the caller holds hdev->lock */ void hci_pend_le_list_add(struct hci_conn_params *param, struct list_head *list) { list_add_rcu(&param->action, list); } /* This function requires the caller holds hdev->lock */ struct hci_conn_params *hci_conn_params_add(struct hci_dev *hdev, bdaddr_t *addr, u8 addr_type) { struct hci_conn_params *params; params = hci_conn_params_lookup(hdev, addr, addr_type); if (params) return params; params = kzalloc(sizeof(*params), GFP_KERNEL); if (!params) { bt_dev_err(hdev, "out of memory"); return NULL; } bacpy(&params->addr, addr); params->addr_type = addr_type; list_add(&params->list, &hdev->le_conn_params); INIT_LIST_HEAD(&params->action); params->conn_min_interval = hdev->le_conn_min_interval; params->conn_max_interval = hdev->le_conn_max_interval; params->conn_latency = hdev->le_conn_latency; params->supervision_timeout = hdev->le_supv_timeout; params->auto_connect = HCI_AUTO_CONN_DISABLED; BT_DBG("addr %pMR (type %u)", addr, addr_type); return params; } void hci_conn_params_free(struct hci_conn_params *params) { hci_pend_le_list_del_init(params); if (params->conn) { hci_conn_drop(params->conn); hci_conn_put(params->conn); } list_del(&params->list); kfree(params); } /* This function 
requires the caller holds hdev->lock */ void hci_conn_params_del(struct hci_dev *hdev, bdaddr_t *addr, u8 addr_type) { struct hci_conn_params *params; params = hci_conn_params_lookup(hdev, addr, addr_type); if (!params) return; hci_conn_params_free(params); hci_update_passive_scan(hdev); BT_DBG("addr %pMR (type %u)", addr, addr_type); } /* This function requires the caller holds hdev->lock */ void hci_conn_params_clear_disabled(struct hci_dev *hdev) { struct hci_conn_params *params, *tmp; list_for_each_entry_safe(params, tmp, &hdev->le_conn_params, list) { if (params->auto_connect != HCI_AUTO_CONN_DISABLED) continue; /* If trying to establish one time connection to disabled * device, leave the params, but mark them as just once. */ if (params->explicit_connect) { params->auto_connect = HCI_AUTO_CONN_EXPLICIT; continue; } hci_conn_params_free(params); } BT_DBG("All LE disabled connection parameters were removed"); } /* This function requires the caller holds hdev->lock */ static void hci_conn_params_clear_all(struct hci_dev *hdev) { struct hci_conn_params *params, *tmp; list_for_each_entry_safe(params, tmp, &hdev->le_conn_params, list) hci_conn_params_free(params); BT_DBG("All LE connection parameters were removed"); } /* Copy the Identity Address of the controller. * * If the controller has a public BD_ADDR, then by default use that one. * If this is a LE only controller without a public address, default to * the static random address. * * For debugging purposes it is possible to force controllers with a * public address to use the static random address instead. * * In case BR/EDR has been disabled on a dual-mode controller and * userspace has configured a static address, then that address * becomes the identity address instead of the public BR/EDR address. */ void hci_copy_identity_address(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 *bdaddr_type) { if (hci_dev_test_flag(hdev, HCI_FORCE_STATIC_ADDR) || !bacmp(&hdev->bdaddr, BDADDR_ANY) || (!hci_dev_test_flag(hdev, HCI_BREDR_ENABLED) && bacmp(&hdev->static_addr, BDADDR_ANY))) { bacpy(bdaddr, &hdev->static_addr); *bdaddr_type = ADDR_LE_DEV_RANDOM; } else { bacpy(bdaddr, &hdev->bdaddr); *bdaddr_type = ADDR_LE_DEV_PUBLIC; } } static void hci_clear_wake_reason(struct hci_dev *hdev) { hci_dev_lock(hdev); hdev->wake_reason = 0; bacpy(&hdev->wake_addr, BDADDR_ANY); hdev->wake_addr_type = 0; hci_dev_unlock(hdev); } static int hci_suspend_notifier(struct notifier_block *nb, unsigned long action, void *data) { struct hci_dev *hdev = container_of(nb, struct hci_dev, suspend_notifier); int ret = 0; /* Userspace has full control of this device. Do nothing. */ if (hci_dev_test_flag(hdev, HCI_USER_CHANNEL)) return NOTIFY_DONE; /* To avoid a potential race with hci_unregister_dev. */ hci_dev_hold(hdev); switch (action) { case PM_HIBERNATION_PREPARE: case PM_SUSPEND_PREPARE: ret = hci_suspend_dev(hdev); break; case PM_POST_HIBERNATION: case PM_POST_SUSPEND: ret = hci_resume_dev(hdev); break; } if (ret) bt_dev_err(hdev, "Suspend notifier action (%lu) failed: %d", action, ret); hci_dev_put(hdev); return NOTIFY_DONE; } /* Alloc HCI device */ struct hci_dev *hci_alloc_dev_priv(int sizeof_priv) { struct hci_dev *hdev; unsigned int alloc_size; alloc_size = sizeof(*hdev); if (sizeof_priv) { /* Fixme: May need ALIGN-ment? 
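 * sizeof_priv is driver-private storage carved out of the same allocation,
 * laid out immediately after struct hci_dev.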
*/ alloc_size += sizeof_priv; } hdev = kzalloc(alloc_size, GFP_KERNEL); if (!hdev) return NULL; if (init_srcu_struct(&hdev->srcu)) { kfree(hdev); return NULL; } hdev->pkt_type = (HCI_DM1 | HCI_DH1 | HCI_HV1); hdev->esco_type = (ESCO_HV1); hdev->link_mode = (HCI_LM_ACCEPT); hdev->num_iac = 0x01; /* One IAC support is mandatory */ hdev->io_capability = 0x03; /* No Input No Output */ hdev->manufacturer = 0xffff; /* Default to internal use */ hdev->inq_tx_power = HCI_TX_POWER_INVALID; hdev->adv_tx_power = HCI_TX_POWER_INVALID; hdev->adv_instance_cnt = 0; hdev->cur_adv_instance = 0x00; hdev->adv_instance_timeout = 0; hdev->advmon_allowlist_duration = 300; hdev->advmon_no_filter_duration = 500; hdev->enable_advmon_interleave_scan = 0x00; /* Default to disable */ hdev->sniff_max_interval = 800; hdev->sniff_min_interval = 80; hdev->le_adv_channel_map = 0x07; hdev->le_adv_min_interval = 0x0800; hdev->le_adv_max_interval = 0x0800; hdev->le_scan_interval = DISCOV_LE_SCAN_INT_FAST; hdev->le_scan_window = DISCOV_LE_SCAN_WIN_FAST; hdev->le_scan_int_suspend = DISCOV_LE_SCAN_INT_SLOW1; hdev->le_scan_window_suspend = DISCOV_LE_SCAN_WIN_SLOW1; hdev->le_scan_int_discovery = DISCOV_LE_SCAN_INT; hdev->le_scan_window_discovery = DISCOV_LE_SCAN_WIN; hdev->le_scan_int_adv_monitor = DISCOV_LE_SCAN_INT_FAST; hdev->le_scan_window_adv_monitor = DISCOV_LE_SCAN_WIN_FAST; hdev->le_scan_int_connect = DISCOV_LE_SCAN_INT_CONN; hdev->le_scan_window_connect = DISCOV_LE_SCAN_WIN_CONN; hdev->le_conn_min_interval = 0x0018; hdev->le_conn_max_interval = 0x0028; hdev->le_conn_latency = 0x0000; hdev->le_supv_timeout = 0x002a; hdev->le_def_tx_len = 0x001b; hdev->le_def_tx_time = 0x0148; hdev->le_max_tx_len = 0x001b; hdev->le_max_tx_time = 0x0148; hdev->le_max_rx_len = 0x001b; hdev->le_max_rx_time = 0x0148; hdev->le_max_key_size = SMP_MAX_ENC_KEY_SIZE; hdev->le_min_key_size = SMP_MIN_ENC_KEY_SIZE; hdev->le_tx_def_phys = HCI_LE_SET_PHY_1M; hdev->le_rx_def_phys = HCI_LE_SET_PHY_1M; hdev->le_num_of_adv_sets = HCI_MAX_ADV_INSTANCES; hdev->def_multi_adv_rotation_duration = HCI_DEFAULT_ADV_DURATION; hdev->def_le_autoconnect_timeout = HCI_LE_CONN_TIMEOUT; hdev->min_le_tx_power = HCI_TX_POWER_INVALID; hdev->max_le_tx_power = HCI_TX_POWER_INVALID; hdev->rpa_timeout = HCI_DEFAULT_RPA_TIMEOUT; hdev->discov_interleaved_timeout = DISCOV_INTERLEAVED_TIMEOUT; hdev->conn_info_min_age = DEFAULT_CONN_INFO_MIN_AGE; hdev->conn_info_max_age = DEFAULT_CONN_INFO_MAX_AGE; hdev->auth_payload_timeout = DEFAULT_AUTH_PAYLOAD_TIMEOUT; hdev->min_enc_key_size = HCI_MIN_ENC_KEY_SIZE; /* default 1.28 sec page scan */ hdev->def_page_scan_type = PAGE_SCAN_TYPE_STANDARD; hdev->def_page_scan_int = 0x0800; hdev->def_page_scan_window = 0x0012; mutex_init(&hdev->lock); mutex_init(&hdev->req_lock); mutex_init(&hdev->mgmt_pending_lock); ida_init(&hdev->unset_handle_ida); INIT_LIST_HEAD(&hdev->mesh_pending); INIT_LIST_HEAD(&hdev->mgmt_pending); INIT_LIST_HEAD(&hdev->reject_list); INIT_LIST_HEAD(&hdev->accept_list); INIT_LIST_HEAD(&hdev->uuids); INIT_LIST_HEAD(&hdev->link_keys); INIT_LIST_HEAD(&hdev->long_term_keys); INIT_LIST_HEAD(&hdev->identity_resolving_keys); INIT_LIST_HEAD(&hdev->remote_oob_data); INIT_LIST_HEAD(&hdev->le_accept_list); INIT_LIST_HEAD(&hdev->le_resolv_list); INIT_LIST_HEAD(&hdev->le_conn_params); INIT_LIST_HEAD(&hdev->pend_le_conns); INIT_LIST_HEAD(&hdev->pend_le_reports); INIT_LIST_HEAD(&hdev->conn_hash.list); INIT_LIST_HEAD(&hdev->adv_instances); INIT_LIST_HEAD(&hdev->blocked_keys); INIT_LIST_HEAD(&hdev->monitored_devices); 
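/* The remaining list head, the work items and the packet queues that drive
 * the rx, cmd and tx paths are initialized below.
 */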
INIT_LIST_HEAD(&hdev->local_codecs); INIT_WORK(&hdev->rx_work, hci_rx_work); INIT_WORK(&hdev->cmd_work, hci_cmd_work); INIT_WORK(&hdev->tx_work, hci_tx_work); INIT_WORK(&hdev->power_on, hci_power_on); INIT_WORK(&hdev->error_reset, hci_error_reset); hci_cmd_sync_init(hdev); INIT_DELAYED_WORK(&hdev->power_off, hci_power_off); skb_queue_head_init(&hdev->rx_q); skb_queue_head_init(&hdev->cmd_q); skb_queue_head_init(&hdev->raw_q); init_waitqueue_head(&hdev->req_wait_q); INIT_DELAYED_WORK(&hdev->cmd_timer, hci_cmd_timeout); INIT_DELAYED_WORK(&hdev->ncmd_timer, hci_ncmd_timeout); hci_devcd_setup(hdev); hci_init_sysfs(hdev); discovery_init(hdev); return hdev; } EXPORT_SYMBOL(hci_alloc_dev_priv); /* Free HCI device */ void hci_free_dev(struct hci_dev *hdev) { /* will free via device release */ put_device(&hdev->dev); } EXPORT_SYMBOL(hci_free_dev); /* Register HCI device */ int hci_register_dev(struct hci_dev *hdev) { int id, error; if (!hdev->open || !hdev->close || !hdev->send) return -EINVAL; id = ida_alloc_max(&hci_index_ida, HCI_MAX_ID - 1, GFP_KERNEL); if (id < 0) return id; error = dev_set_name(&hdev->dev, "hci%u", id); if (error) return error; hdev->name = dev_name(&hdev->dev); hdev->id = id; BT_DBG("%p name %s bus %d", hdev, hdev->name, hdev->bus); hdev->workqueue = alloc_ordered_workqueue("%s", WQ_HIGHPRI, hdev->name); if (!hdev->workqueue) { error = -ENOMEM; goto err; } hdev->req_workqueue = alloc_ordered_workqueue("%s", WQ_HIGHPRI, hdev->name); if (!hdev->req_workqueue) { destroy_workqueue(hdev->workqueue); error = -ENOMEM; goto err; } if (!IS_ERR_OR_NULL(bt_debugfs)) hdev->debugfs = debugfs_create_dir(hdev->name, bt_debugfs); error = device_add(&hdev->dev); if (error < 0) goto err_wqueue; hci_leds_init(hdev); hdev->rfkill = rfkill_alloc(hdev->name, &hdev->dev, RFKILL_TYPE_BLUETOOTH, &hci_rfkill_ops, hdev); if (hdev->rfkill) { if (rfkill_register(hdev->rfkill) < 0) { rfkill_destroy(hdev->rfkill); hdev->rfkill = NULL; } } if (hdev->rfkill && rfkill_blocked(hdev->rfkill)) hci_dev_set_flag(hdev, HCI_RFKILLED); hci_dev_set_flag(hdev, HCI_SETUP); hci_dev_set_flag(hdev, HCI_AUTO_OFF); /* Assume BR/EDR support until proven otherwise (such as * through reading supported features during init. */ hci_dev_set_flag(hdev, HCI_BREDR_ENABLED); write_lock(&hci_dev_list_lock); list_add(&hdev->list, &hci_dev_list); write_unlock(&hci_dev_list_lock); /* Devices that are marked for raw-only usage are unconfigured * and should not be included in normal operation. */ if (hci_test_quirk(hdev, HCI_QUIRK_RAW_DEVICE)) hci_dev_set_flag(hdev, HCI_UNCONFIGURED); /* Mark Remote Wakeup connection flag as supported if driver has wakeup * callback. 
*/ if (hdev->wakeup) hdev->conn_flags |= HCI_CONN_FLAG_REMOTE_WAKEUP; hci_sock_dev_event(hdev, HCI_DEV_REG); hci_dev_hold(hdev); error = hci_register_suspend_notifier(hdev); if (error) BT_WARN("register suspend notifier failed error:%d\n", error); queue_work(hdev->req_workqueue, &hdev->power_on); idr_init(&hdev->adv_monitors_idr); msft_register(hdev); return id; err_wqueue: debugfs_remove_recursive(hdev->debugfs); destroy_workqueue(hdev->workqueue); destroy_workqueue(hdev->req_workqueue); err: ida_free(&hci_index_ida, hdev->id); return error; } EXPORT_SYMBOL(hci_register_dev); /* Unregister HCI device */ void hci_unregister_dev(struct hci_dev *hdev) { BT_DBG("%p name %s bus %d", hdev, hdev->name, hdev->bus); mutex_lock(&hdev->unregister_lock); hci_dev_set_flag(hdev, HCI_UNREGISTER); mutex_unlock(&hdev->unregister_lock); write_lock(&hci_dev_list_lock); list_del(&hdev->list); write_unlock(&hci_dev_list_lock); synchronize_srcu(&hdev->srcu); cleanup_srcu_struct(&hdev->srcu); disable_work_sync(&hdev->rx_work); disable_work_sync(&hdev->cmd_work); disable_work_sync(&hdev->tx_work); disable_work_sync(&hdev->power_on); disable_work_sync(&hdev->error_reset); hci_cmd_sync_clear(hdev); hci_unregister_suspend_notifier(hdev); hci_dev_do_close(hdev); if (!test_bit(HCI_INIT, &hdev->flags) && !hci_dev_test_flag(hdev, HCI_SETUP) && !hci_dev_test_flag(hdev, HCI_CONFIG)) { hci_dev_lock(hdev); mgmt_index_removed(hdev); hci_dev_unlock(hdev); } /* mgmt_index_removed should take care of emptying the * pending list */ BUG_ON(!list_empty(&hdev->mgmt_pending)); hci_sock_dev_event(hdev, HCI_DEV_UNREG); if (hdev->rfkill) { rfkill_unregister(hdev->rfkill); rfkill_destroy(hdev->rfkill); } device_del(&hdev->dev); /* Actual cleanup is deferred until hci_release_dev(). */ hci_dev_put(hdev); } EXPORT_SYMBOL(hci_unregister_dev); /* Release HCI device */ void hci_release_dev(struct hci_dev *hdev) { debugfs_remove_recursive(hdev->debugfs); kfree_const(hdev->hw_info); kfree_const(hdev->fw_info); destroy_workqueue(hdev->workqueue); destroy_workqueue(hdev->req_workqueue); hci_dev_lock(hdev); hci_bdaddr_list_clear(&hdev->reject_list); hci_bdaddr_list_clear(&hdev->accept_list); hci_uuids_clear(hdev); hci_link_keys_clear(hdev); hci_smp_ltks_clear(hdev); hci_smp_irks_clear(hdev); hci_remote_oob_data_clear(hdev); hci_adv_instances_clear(hdev); hci_adv_monitors_clear(hdev); hci_bdaddr_list_clear(&hdev->le_accept_list); hci_bdaddr_list_clear(&hdev->le_resolv_list); hci_conn_params_clear_all(hdev); hci_discovery_filter_clear(hdev); hci_blocked_keys_clear(hdev); hci_codec_list_clear(&hdev->local_codecs); msft_release(hdev); hci_dev_unlock(hdev); ida_destroy(&hdev->unset_handle_ida); ida_free(&hci_index_ida, hdev->id); kfree_skb(hdev->sent_cmd); kfree_skb(hdev->req_skb); kfree_skb(hdev->recv_event); kfree(hdev); } EXPORT_SYMBOL(hci_release_dev); int hci_register_suspend_notifier(struct hci_dev *hdev) { int ret = 0; if (!hdev->suspend_notifier.notifier_call && !hci_test_quirk(hdev, HCI_QUIRK_NO_SUSPEND_NOTIFIER)) { hdev->suspend_notifier.notifier_call = hci_suspend_notifier; ret = register_pm_notifier(&hdev->suspend_notifier); } return ret; } int hci_unregister_suspend_notifier(struct hci_dev *hdev) { int ret = 0; if (hdev->suspend_notifier.notifier_call) { ret = unregister_pm_notifier(&hdev->suspend_notifier); if (!ret) hdev->suspend_notifier.notifier_call = NULL; } return ret; } /* Cancel ongoing command synchronously: * * - Cancel command timer * - Reset command counter * - Cancel command request */ static void hci_cancel_cmd_sync(struct 
hci_dev *hdev, int err) { bt_dev_dbg(hdev, "err 0x%2.2x", err); if (hci_dev_test_flag(hdev, HCI_UNREGISTER)) { disable_delayed_work_sync(&hdev->cmd_timer); disable_delayed_work_sync(&hdev->ncmd_timer); } else { cancel_delayed_work_sync(&hdev->cmd_timer); cancel_delayed_work_sync(&hdev->ncmd_timer); } atomic_set(&hdev->cmd_cnt, 1); hci_cmd_sync_cancel_sync(hdev, err); } /* Suspend HCI device */ int hci_suspend_dev(struct hci_dev *hdev) { int ret; bt_dev_dbg(hdev, ""); /* Suspend should only act on when powered. */ if (!hdev_is_powered(hdev) || hci_dev_test_flag(hdev, HCI_UNREGISTER)) return 0; /* If powering down don't attempt to suspend */ if (mgmt_powering_down(hdev)) return 0; /* Cancel potentially blocking sync operation before suspend */ hci_cancel_cmd_sync(hdev, EHOSTDOWN); hci_req_sync_lock(hdev); ret = hci_suspend_sync(hdev); hci_req_sync_unlock(hdev); hci_clear_wake_reason(hdev); mgmt_suspending(hdev, hdev->suspend_state); hci_sock_dev_event(hdev, HCI_DEV_SUSPEND); return ret; } EXPORT_SYMBOL(hci_suspend_dev); /* Resume HCI device */ int hci_resume_dev(struct hci_dev *hdev) { int ret; bt_dev_dbg(hdev, ""); /* Resume should only act on when powered. */ if (!hdev_is_powered(hdev) || hci_dev_test_flag(hdev, HCI_UNREGISTER)) return 0; /* If powering down don't attempt to resume */ if (mgmt_powering_down(hdev)) return 0; hci_req_sync_lock(hdev); ret = hci_resume_sync(hdev); hci_req_sync_unlock(hdev); mgmt_resuming(hdev, hdev->wake_reason, &hdev->wake_addr, hdev->wake_addr_type); hci_sock_dev_event(hdev, HCI_DEV_RESUME); return ret; } EXPORT_SYMBOL(hci_resume_dev); /* Reset HCI device */ int hci_reset_dev(struct hci_dev *hdev) { static const u8 hw_err[] = { HCI_EV_HARDWARE_ERROR, 0x01, 0x00 }; struct sk_buff *skb; skb = bt_skb_alloc(3, GFP_ATOMIC); if (!skb) return -ENOMEM; hci_skb_pkt_type(skb) = HCI_EVENT_PKT; skb_put_data(skb, hw_err, 3); bt_dev_err(hdev, "Injecting HCI hardware error event"); /* Send Hardware Error to upper stack */ return hci_recv_frame(hdev, skb); } EXPORT_SYMBOL(hci_reset_dev); static u8 hci_dev_classify_pkt_type(struct hci_dev *hdev, struct sk_buff *skb) { if (hdev->classify_pkt_type) return hdev->classify_pkt_type(hdev, skb); return hci_skb_pkt_type(skb); } /* Receive frame from HCI drivers */ int hci_recv_frame(struct hci_dev *hdev, struct sk_buff *skb) { u8 dev_pkt_type; if (!hdev || (!test_bit(HCI_UP, &hdev->flags) && !test_bit(HCI_INIT, &hdev->flags))) { kfree_skb(skb); return -ENXIO; } /* Check if the driver agree with packet type classification */ dev_pkt_type = hci_dev_classify_pkt_type(hdev, skb); if (hci_skb_pkt_type(skb) != dev_pkt_type) { hci_skb_pkt_type(skb) = dev_pkt_type; } switch (hci_skb_pkt_type(skb)) { case HCI_EVENT_PKT: break; case HCI_ACLDATA_PKT: /* Detect if ISO packet has been sent as ACL */ if (hci_conn_num(hdev, CIS_LINK) || hci_conn_num(hdev, BIS_LINK) || hci_conn_num(hdev, PA_LINK)) { __u16 handle = __le16_to_cpu(hci_acl_hdr(skb)->handle); __u8 type; type = hci_conn_lookup_type(hdev, hci_handle(handle)); if (type == CIS_LINK || type == BIS_LINK || type == PA_LINK) hci_skb_pkt_type(skb) = HCI_ISODATA_PKT; } break; case HCI_SCODATA_PKT: break; case HCI_ISODATA_PKT: break; case HCI_DRV_PKT: break; default: kfree_skb(skb); return -EINVAL; } /* Incoming skb */ bt_cb(skb)->incoming = 1; /* Time stamp */ __net_timestamp(skb); skb_queue_tail(&hdev->rx_q, skb); queue_work(hdev->workqueue, &hdev->rx_work); return 0; } EXPORT_SYMBOL(hci_recv_frame); /* Receive diagnostic message from HCI drivers */ int hci_recv_diag(struct hci_dev *hdev, struct 
sk_buff *skb) { /* Mark as diagnostic packet */ hci_skb_pkt_type(skb) = HCI_DIAG_PKT; /* Time stamp */ __net_timestamp(skb); skb_queue_tail(&hdev->rx_q, skb); queue_work(hdev->workqueue, &hdev->rx_work); return 0; } EXPORT_SYMBOL(hci_recv_diag); void hci_set_hw_info(struct hci_dev *hdev, const char *fmt, ...) { va_list vargs; va_start(vargs, fmt); kfree_const(hdev->hw_info); hdev->hw_info = kvasprintf_const(GFP_KERNEL, fmt, vargs); va_end(vargs); } EXPORT_SYMBOL(hci_set_hw_info); void hci_set_fw_info(struct hci_dev *hdev, const char *fmt, ...) { va_list vargs; va_start(vargs, fmt); kfree_const(hdev->fw_info); hdev->fw_info = kvasprintf_const(GFP_KERNEL, fmt, vargs); va_end(vargs); } EXPORT_SYMBOL(hci_set_fw_info); /* ---- Interface to upper protocols ---- */ int hci_register_cb(struct hci_cb *cb) { BT_DBG("%p name %s", cb, cb->name); mutex_lock(&hci_cb_list_lock); list_add_tail(&cb->list, &hci_cb_list); mutex_unlock(&hci_cb_list_lock); return 0; } EXPORT_SYMBOL(hci_register_cb); int hci_unregister_cb(struct hci_cb *cb) { BT_DBG("%p name %s", cb, cb->name); mutex_lock(&hci_cb_list_lock); list_del(&cb->list); mutex_unlock(&hci_cb_list_lock); return 0; } EXPORT_SYMBOL(hci_unregister_cb); static int hci_send_frame(struct hci_dev *hdev, struct sk_buff *skb) { int err; BT_DBG("%s type %d len %d", hdev->name, hci_skb_pkt_type(skb), skb->len); /* Time stamp */ __net_timestamp(skb); /* Send copy to monitor */ hci_send_to_monitor(hdev, skb); if (atomic_read(&hdev->promisc)) { /* Send copy to the sockets */ hci_send_to_sock(hdev, skb); } /* Get rid of skb owner, prior to sending to the driver. */ skb_orphan(skb); if (!test_bit(HCI_RUNNING, &hdev->flags)) { kfree_skb(skb); return -EINVAL; } if (hci_skb_pkt_type(skb) == HCI_DRV_PKT) { /* Intercept HCI Drv packet here and don't go with hdev->send * callback. */ err = hci_drv_process_cmd(hdev, skb); kfree_skb(skb); return err; } err = hdev->send(hdev, skb); if (err < 0) { bt_dev_err(hdev, "sending frame failed (%d)", err); kfree_skb(skb); return err; } return 0; } static int hci_send_conn_frame(struct hci_dev *hdev, struct hci_conn *conn, struct sk_buff *skb) { hci_conn_tx_queue(conn, skb); return hci_send_frame(hdev, skb); } /* Send HCI command */ int hci_send_cmd(struct hci_dev *hdev, __u16 opcode, __u32 plen, const void *param) { struct sk_buff *skb; BT_DBG("%s opcode 0x%4.4x plen %d", hdev->name, opcode, plen); skb = hci_cmd_sync_alloc(hdev, opcode, plen, param, NULL); if (!skb) { bt_dev_err(hdev, "no memory for command"); return -ENOMEM; } /* Stand-alone HCI commands must be flagged as * single-command requests. */ bt_cb(skb)->hci.req_flags |= HCI_REQ_START; skb_queue_tail(&hdev->cmd_q, skb); queue_work(hdev->workqueue, &hdev->cmd_work); return 0; } int __hci_cmd_send(struct hci_dev *hdev, u16 opcode, u32 plen, const void *param) { struct sk_buff *skb; if (hci_opcode_ogf(opcode) != 0x3f) { /* A controller receiving a command shall respond with either * a Command Status Event or a Command Complete Event. * Therefore, all standard HCI commands must be sent via the * standard API, using hci_send_cmd or hci_cmd_sync helpers. * Some vendors do not comply with this rule for vendor-specific * commands and do not return any event. We want to support * unresponded commands for such cases only. 
*/ bt_dev_err(hdev, "unresponded command not supported"); return -EINVAL; } skb = hci_cmd_sync_alloc(hdev, opcode, plen, param, NULL); if (!skb) { bt_dev_err(hdev, "no memory for command (opcode 0x%4.4x)", opcode); return -ENOMEM; } hci_send_frame(hdev, skb); return 0; } EXPORT_SYMBOL(__hci_cmd_send); /* Get data from the previously sent command */ static void *hci_cmd_data(struct sk_buff *skb, __u16 opcode) { struct hci_command_hdr *hdr; if (!skb || skb->len < HCI_COMMAND_HDR_SIZE) return NULL; hdr = (void *)skb->data; if (hdr->opcode != cpu_to_le16(opcode)) return NULL; return skb->data + HCI_COMMAND_HDR_SIZE; } /* Get data from the previously sent command */ void *hci_sent_cmd_data(struct hci_dev *hdev, __u16 opcode) { void *data; /* Check if opcode matches last sent command */ data = hci_cmd_data(hdev->sent_cmd, opcode); if (!data) /* Check if opcode matches last request */ data = hci_cmd_data(hdev->req_skb, opcode); return data; } /* Get data from last received event */ void *hci_recv_event_data(struct hci_dev *hdev, __u8 event) { struct hci_event_hdr *hdr; int offset; if (!hdev->recv_event) return NULL; hdr = (void *)hdev->recv_event->data; offset = sizeof(*hdr); if (hdr->evt != event) { /* In case of LE metaevent check the subevent match */ if (hdr->evt == HCI_EV_LE_META) { struct hci_ev_le_meta *ev; ev = (void *)hdev->recv_event->data + offset; offset += sizeof(*ev); if (ev->subevent == event) goto found; } return NULL; } found: bt_dev_dbg(hdev, "event 0x%2.2x", event); return hdev->recv_event->data + offset; } /* Send ACL data */ static void hci_add_acl_hdr(struct sk_buff *skb, __u16 handle, __u16 flags) { struct hci_acl_hdr *hdr; int len = skb->len; skb_push(skb, HCI_ACL_HDR_SIZE); skb_reset_transport_header(skb); hdr = (struct hci_acl_hdr *)skb_transport_header(skb); hdr->handle = cpu_to_le16(hci_handle_pack(handle, flags)); hdr->dlen = cpu_to_le16(len); } static void hci_queue_acl(struct hci_chan *chan, struct sk_buff_head *queue, struct sk_buff *skb, __u16 flags) { struct hci_conn *conn = chan->conn; struct hci_dev *hdev = conn->hdev; struct sk_buff *list; skb->len = skb_headlen(skb); skb->data_len = 0; hci_skb_pkt_type(skb) = HCI_ACLDATA_PKT; hci_add_acl_hdr(skb, conn->handle, flags); list = skb_shinfo(skb)->frag_list; if (!list) { /* Non fragmented */ BT_DBG("%s nonfrag skb %p len %d", hdev->name, skb, skb->len); skb_queue_tail(queue, skb); } else { /* Fragmented */ BT_DBG("%s frag %p len %d", hdev->name, skb, skb->len); skb_shinfo(skb)->frag_list = NULL; /* Queue all fragments atomically. We need to use spin_lock_bh * here because of 6LoWPAN links, as there this function is * called from softirq and using normal spin lock could cause * deadlocks. 
*/ spin_lock_bh(&queue->lock); __skb_queue_tail(queue, skb); flags &= ~ACL_START; flags |= ACL_CONT; do { skb = list; list = list->next; hci_skb_pkt_type(skb) = HCI_ACLDATA_PKT; hci_add_acl_hdr(skb, conn->handle, flags); BT_DBG("%s frag %p len %d", hdev->name, skb, skb->len); __skb_queue_tail(queue, skb); } while (list); spin_unlock_bh(&queue->lock); } bt_dev_dbg(hdev, "chan %p queued %d", chan, skb_queue_len(queue)); } void hci_send_acl(struct hci_chan *chan, struct sk_buff *skb, __u16 flags) { struct hci_dev *hdev = chan->conn->hdev; BT_DBG("%s chan %p flags 0x%4.4x", hdev->name, chan, flags); hci_queue_acl(chan, &chan->data_q, skb, flags); queue_work(hdev->workqueue, &hdev->tx_work); } /* Send SCO data */ void hci_send_sco(struct hci_conn *conn, struct sk_buff *skb) { struct hci_dev *hdev = conn->hdev; struct hci_sco_hdr hdr; BT_DBG("%s len %d", hdev->name, skb->len); hdr.handle = cpu_to_le16(conn->handle); hdr.dlen = skb->len; skb_push(skb, HCI_SCO_HDR_SIZE); skb_reset_transport_header(skb); memcpy(skb_transport_header(skb), &hdr, HCI_SCO_HDR_SIZE); hci_skb_pkt_type(skb) = HCI_SCODATA_PKT; skb_queue_tail(&conn->data_q, skb); bt_dev_dbg(hdev, "hcon %p queued %d", conn, skb_queue_len(&conn->data_q)); queue_work(hdev->workqueue, &hdev->tx_work); } /* Send ISO data */ static void hci_add_iso_hdr(struct sk_buff *skb, __u16 handle, __u8 flags) { struct hci_iso_hdr *hdr; int len = skb->len; skb_push(skb, HCI_ISO_HDR_SIZE); skb_reset_transport_header(skb); hdr = (struct hci_iso_hdr *)skb_transport_header(skb); hdr->handle = cpu_to_le16(hci_handle_pack(handle, flags)); hdr->dlen = cpu_to_le16(len); } static void hci_queue_iso(struct hci_conn *conn, struct sk_buff_head *queue, struct sk_buff *skb) { struct hci_dev *hdev = conn->hdev; struct sk_buff *list; __u16 flags; skb->len = skb_headlen(skb); skb->data_len = 0; hci_skb_pkt_type(skb) = HCI_ISODATA_PKT; list = skb_shinfo(skb)->frag_list; flags = hci_iso_flags_pack(list ? ISO_START : ISO_SINGLE, 0x00); hci_add_iso_hdr(skb, conn->handle, flags); if (!list) { /* Non fragmented */ BT_DBG("%s nonfrag skb %p len %d", hdev->name, skb, skb->len); skb_queue_tail(queue, skb); } else { /* Fragmented */ BT_DBG("%s frag %p len %d", hdev->name, skb, skb->len); skb_shinfo(skb)->frag_list = NULL; __skb_queue_tail(queue, skb); do { skb = list; list = list->next; hci_skb_pkt_type(skb) = HCI_ISODATA_PKT; flags = hci_iso_flags_pack(list ? ISO_CONT : ISO_END, 0x00); hci_add_iso_hdr(skb, conn->handle, flags); BT_DBG("%s frag %p len %d", hdev->name, skb, skb->len); __skb_queue_tail(queue, skb); } while (list); } bt_dev_dbg(hdev, "hcon %p queued %d", conn, skb_queue_len(queue)); } void hci_send_iso(struct hci_conn *conn, struct sk_buff *skb) { struct hci_dev *hdev = conn->hdev; BT_DBG("%s len %d", hdev->name, skb->len); hci_queue_iso(conn, &conn->data_q, skb); queue_work(hdev->workqueue, &hdev->tx_work); } /* ---- HCI TX task (outgoing data) ---- */ /* HCI Connection scheduler */ static inline void hci_quote_sent(struct hci_conn *conn, int num, int *quote) { struct hci_dev *hdev; int cnt, q; if (!conn) { *quote = 0; return; } hdev = conn->hdev; switch (conn->type) { case ACL_LINK: cnt = hdev->acl_cnt; break; case SCO_LINK: case ESCO_LINK: cnt = hdev->sco_cnt; break; case LE_LINK: cnt = hdev->le_mtu ? hdev->le_cnt : hdev->acl_cnt; break; case CIS_LINK: case BIS_LINK: case PA_LINK: cnt = hdev->iso_cnt; break; default: cnt = 0; bt_dev_err(hdev, "unknown link type %d", conn->type); } q = cnt / num; *quote = q ? 
q : 1; } static struct hci_conn *hci_low_sent(struct hci_dev *hdev, __u8 type, int *quote) { struct hci_conn_hash *h = &hdev->conn_hash; struct hci_conn *conn = NULL, *c; unsigned int num = 0, min = ~0; /* We don't have to lock device here. Connections are always * added and removed with TX task disabled. */ rcu_read_lock(); list_for_each_entry_rcu(c, &h->list, list) { if (c->type != type || skb_queue_empty(&c->data_q)) continue; bt_dev_dbg(hdev, "hcon %p state %s queued %d", c, state_to_string(c->state), skb_queue_len(&c->data_q)); if (c->state != BT_CONNECTED && c->state != BT_CONFIG) continue; num++; if (c->sent < min) { min = c->sent; conn = c; } if (hci_conn_num(hdev, type) == num) break; } rcu_read_unlock(); hci_quote_sent(conn, num, quote); BT_DBG("conn %p quote %d", conn, *quote); return conn; } static void hci_link_tx_to(struct hci_dev *hdev, __u8 type) { struct hci_conn_hash *h = &hdev->conn_hash; struct hci_conn *c; bt_dev_err(hdev, "link tx timeout"); hci_dev_lock(hdev); /* Kill stalled connections */ list_for_each_entry(c, &h->list, list) { if (c->type == type && c->sent) { bt_dev_err(hdev, "killing stalled connection %pMR", &c->dst); hci_disconnect(c, HCI_ERROR_REMOTE_USER_TERM); } } hci_dev_unlock(hdev); } static struct hci_chan *hci_chan_sent(struct hci_dev *hdev, __u8 type, int *quote) { struct hci_conn_hash *h = &hdev->conn_hash; struct hci_chan *chan = NULL; unsigned int num = 0, min = ~0, cur_prio = 0; struct hci_conn *conn; int conn_num = 0; BT_DBG("%s", hdev->name); rcu_read_lock(); list_for_each_entry_rcu(conn, &h->list, list) { struct hci_chan *tmp; if (conn->type != type) continue; if (conn->state != BT_CONNECTED && conn->state != BT_CONFIG) continue; conn_num++; list_for_each_entry_rcu(tmp, &conn->chan_list, list) { struct sk_buff *skb; if (skb_queue_empty(&tmp->data_q)) continue; skb = skb_peek(&tmp->data_q); if (skb->priority < cur_prio) continue; if (skb->priority > cur_prio) { num = 0; min = ~0; cur_prio = skb->priority; } num++; if (conn->sent < min) { min = conn->sent; chan = tmp; } } if (hci_conn_num(hdev, type) == conn_num) break; } rcu_read_unlock(); if (!chan) return NULL; hci_quote_sent(chan->conn, num, quote); BT_DBG("chan %p quote %d", chan, *quote); return chan; } static void hci_prio_recalculate(struct hci_dev *hdev, __u8 type) { struct hci_conn_hash *h = &hdev->conn_hash; struct hci_conn *conn; int num = 0; BT_DBG("%s", hdev->name); rcu_read_lock(); list_for_each_entry_rcu(conn, &h->list, list) { struct hci_chan *chan; if (conn->type != type) continue; if (conn->state != BT_CONNECTED && conn->state != BT_CONFIG) continue; num++; list_for_each_entry_rcu(chan, &conn->chan_list, list) { struct sk_buff *skb; if (chan->sent) { chan->sent = 0; continue; } if (skb_queue_empty(&chan->data_q)) continue; skb = skb_peek(&chan->data_q); if (skb->priority >= HCI_PRIO_MAX - 1) continue; skb->priority = HCI_PRIO_MAX - 1; BT_DBG("chan %p skb %p promoted to %d", chan, skb, skb->priority); } if (hci_conn_num(hdev, type) == num) break; } rcu_read_unlock(); } static void __check_timeout(struct hci_dev *hdev, unsigned int cnt, u8 type) { unsigned long timeout; if (hci_dev_test_flag(hdev, HCI_UNCONFIGURED)) return; switch (type) { case ACL_LINK: /* tx timeout must be longer than maximum link supervision * timeout (40.9 seconds) */ timeout = hdev->acl_last_tx + HCI_ACL_TX_TIMEOUT; break; case LE_LINK: /* tx timeout must be longer than maximum link supervision * timeout (40.9 seconds) */ timeout = hdev->le_last_tx + HCI_ACL_TX_TIMEOUT; break; case CIS_LINK: case BIS_LINK: 
case PA_LINK: /* tx timeout must be longer than the maximum transport latency * (8.388607 seconds) */ timeout = hdev->iso_last_tx + HCI_ISO_TX_TIMEOUT; break; default: return; } if (!cnt && time_after(jiffies, timeout)) hci_link_tx_to(hdev, type); } /* Schedule SCO */ static void hci_sched_sco(struct hci_dev *hdev, __u8 type) { struct hci_conn *conn; struct sk_buff *skb; int quote, *cnt; unsigned int pkts = hdev->sco_pkts; bt_dev_dbg(hdev, "type %u", type); if (!hci_conn_num(hdev, type) || !pkts) return; /* Use sco_pkts if flow control has not been enabled which will limit * the amount of buffer sent in a row. */ if (!hci_dev_test_flag(hdev, HCI_SCO_FLOWCTL)) cnt = &pkts; else cnt = &hdev->sco_cnt; while (*cnt && (conn = hci_low_sent(hdev, type, &quote))) { while (quote-- && (skb = skb_dequeue(&conn->data_q))) { BT_DBG("skb %p len %d", skb, skb->len); hci_send_conn_frame(hdev, conn, skb); conn->sent++; if (conn->sent == ~0) conn->sent = 0; (*cnt)--; } } /* Rescheduled if all packets were sent and flow control is not enabled * as there could be more packets queued that could not be sent and * since no HCI_EV_NUM_COMP_PKTS event will be generated the reschedule * needs to be forced. */ if (!pkts && !hci_dev_test_flag(hdev, HCI_SCO_FLOWCTL)) queue_work(hdev->workqueue, &hdev->tx_work); } static void hci_sched_acl_pkt(struct hci_dev *hdev) { unsigned int cnt = hdev->acl_cnt; struct hci_chan *chan; struct sk_buff *skb; int quote; __check_timeout(hdev, cnt, ACL_LINK); while (hdev->acl_cnt && (chan = hci_chan_sent(hdev, ACL_LINK, &quote))) { u32 priority = (skb_peek(&chan->data_q))->priority; while (quote-- && (skb = skb_peek(&chan->data_q))) { BT_DBG("chan %p skb %p len %d priority %u", chan, skb, skb->len, skb->priority); /* Stop if priority has changed */ if (skb->priority < priority) break; skb = skb_dequeue(&chan->data_q); hci_conn_enter_active_mode(chan->conn, bt_cb(skb)->force_active); hci_send_conn_frame(hdev, chan->conn, skb); hdev->acl_last_tx = jiffies; hdev->acl_cnt--; chan->sent++; chan->conn->sent++; /* Send pending SCO packets right away */ hci_sched_sco(hdev, SCO_LINK); hci_sched_sco(hdev, ESCO_LINK); } } if (cnt != hdev->acl_cnt) hci_prio_recalculate(hdev, ACL_LINK); } static void hci_sched_acl(struct hci_dev *hdev) { BT_DBG("%s", hdev->name); /* No ACL link over BR/EDR controller */ if (!hci_conn_num(hdev, ACL_LINK)) return; hci_sched_acl_pkt(hdev); } static void hci_sched_le(struct hci_dev *hdev) { struct hci_chan *chan; struct sk_buff *skb; int quote, *cnt, tmp; BT_DBG("%s", hdev->name); if (!hci_conn_num(hdev, LE_LINK)) return; cnt = hdev->le_pkts ? 
&hdev->le_cnt : &hdev->acl_cnt; __check_timeout(hdev, *cnt, LE_LINK); tmp = *cnt; while (*cnt && (chan = hci_chan_sent(hdev, LE_LINK, &quote))) { u32 priority = (skb_peek(&chan->data_q))->priority; while (quote-- && (skb = skb_peek(&chan->data_q))) { BT_DBG("chan %p skb %p len %d priority %u", chan, skb, skb->len, skb->priority); /* Stop if priority has changed */ if (skb->priority < priority) break; skb = skb_dequeue(&chan->data_q); hci_send_conn_frame(hdev, chan->conn, skb); hdev->le_last_tx = jiffies; (*cnt)--; chan->sent++; chan->conn->sent++; /* Send pending SCO packets right away */ hci_sched_sco(hdev, SCO_LINK); hci_sched_sco(hdev, ESCO_LINK); } } if (*cnt != tmp) hci_prio_recalculate(hdev, LE_LINK); } /* Schedule iso */ static void hci_sched_iso(struct hci_dev *hdev, __u8 type) { struct hci_conn *conn; struct sk_buff *skb; int quote, *cnt; BT_DBG("%s", hdev->name); if (!hci_conn_num(hdev, type)) return; cnt = &hdev->iso_cnt; __check_timeout(hdev, *cnt, type); while (*cnt && (conn = hci_low_sent(hdev, type, &quote))) { while (quote-- && (skb = skb_dequeue(&conn->data_q))) { BT_DBG("skb %p len %d", skb, skb->len); hci_send_conn_frame(hdev, conn, skb); hdev->iso_last_tx = jiffies; conn->sent++; if (conn->sent == ~0) conn->sent = 0; (*cnt)--; } } } static void hci_tx_work(struct work_struct *work) { struct hci_dev *hdev = container_of(work, struct hci_dev, tx_work); struct sk_buff *skb; BT_DBG("%s acl %d sco %d le %d iso %d", hdev->name, hdev->acl_cnt, hdev->sco_cnt, hdev->le_cnt, hdev->iso_cnt); if (!hci_dev_test_flag(hdev, HCI_USER_CHANNEL)) { /* Schedule queues and send stuff to HCI driver */ hci_sched_sco(hdev, SCO_LINK); hci_sched_sco(hdev, ESCO_LINK); hci_sched_iso(hdev, CIS_LINK); hci_sched_iso(hdev, BIS_LINK); hci_sched_iso(hdev, PA_LINK); hci_sched_acl(hdev); hci_sched_le(hdev); } /* Send next queued raw (unknown type) packet */ while ((skb = skb_dequeue(&hdev->raw_q))) hci_send_frame(hdev, skb); } /* ----- HCI RX task (incoming data processing) ----- */ /* ACL data packet */ static void hci_acldata_packet(struct hci_dev *hdev, struct sk_buff *skb) { struct hci_acl_hdr *hdr; __u16 handle, flags; int err; hdr = skb_pull_data(skb, sizeof(*hdr)); if (!hdr) { bt_dev_err(hdev, "ACL packet too small"); kfree_skb(skb); return; } handle = __le16_to_cpu(hdr->handle); flags = hci_flags(handle); handle = hci_handle(handle); bt_dev_dbg(hdev, "len %d handle 0x%4.4x flags 0x%4.4x", skb->len, handle, flags); hdev->stat.acl_rx++; err = l2cap_recv_acldata(hdev, handle, skb, flags); if (err == -ENOENT) bt_dev_err(hdev, "ACL packet for unknown connection handle %d", handle); else if (err) bt_dev_dbg(hdev, "ACL packet recv for handle %d failed: %d", handle, err); } /* SCO data packet */ static void hci_scodata_packet(struct hci_dev *hdev, struct sk_buff *skb) { struct hci_sco_hdr *hdr; __u16 handle, flags; int err; hdr = skb_pull_data(skb, sizeof(*hdr)); if (!hdr) { bt_dev_err(hdev, "SCO packet too small"); kfree_skb(skb); return; } handle = __le16_to_cpu(hdr->handle); flags = hci_flags(handle); handle = hci_handle(handle); bt_dev_dbg(hdev, "len %d handle 0x%4.4x flags 0x%4.4x", skb->len, handle, flags); hdev->stat.sco_rx++; hci_skb_pkt_status(skb) = flags & 0x03; err = sco_recv_scodata(hdev, handle, skb); if (err == -ENOENT) bt_dev_err_ratelimited(hdev, "SCO packet for unknown connection handle %d", handle); else if (err) bt_dev_dbg(hdev, "SCO packet recv for handle %d failed: %d", handle, err); } static void hci_isodata_packet(struct hci_dev *hdev, struct sk_buff *skb) { struct hci_iso_hdr 
*hdr; __u16 handle, flags; int err; hdr = skb_pull_data(skb, sizeof(*hdr)); if (!hdr) { bt_dev_err(hdev, "ISO packet too small"); kfree_skb(skb); return; } handle = __le16_to_cpu(hdr->handle); flags = hci_flags(handle); handle = hci_handle(handle); bt_dev_dbg(hdev, "len %d handle 0x%4.4x flags 0x%4.4x", skb->len, handle, flags); err = iso_recv(hdev, handle, skb, flags); if (err == -ENOENT) bt_dev_err(hdev, "ISO packet for unknown connection handle %d", handle); else if (err) bt_dev_dbg(hdev, "ISO packet recv for handle %d failed: %d", handle, err); } static bool hci_req_is_complete(struct hci_dev *hdev) { struct sk_buff *skb; skb = skb_peek(&hdev->cmd_q); if (!skb) return true; return (bt_cb(skb)->hci.req_flags & HCI_REQ_START); } static void hci_resend_last(struct hci_dev *hdev) { struct hci_command_hdr *sent; struct sk_buff *skb; u16 opcode; if (!hdev->sent_cmd) return; sent = (void *) hdev->sent_cmd->data; opcode = __le16_to_cpu(sent->opcode); if (opcode == HCI_OP_RESET) return; skb = skb_clone(hdev->sent_cmd, GFP_KERNEL); if (!skb) return; skb_queue_head(&hdev->cmd_q, skb); queue_work(hdev->workqueue, &hdev->cmd_work); } void hci_req_cmd_complete(struct hci_dev *hdev, u16 opcode, u8 status, hci_req_complete_t *req_complete, hci_req_complete_skb_t *req_complete_skb) { struct sk_buff *skb; unsigned long flags; BT_DBG("opcode 0x%04x status 0x%02x", opcode, status); /* If the completed command doesn't match the last one that was * sent we need to do special handling of it. */ if (!hci_sent_cmd_data(hdev, opcode)) { /* Some CSR based controllers generate a spontaneous * reset complete event during init and any pending * command will never be completed. In such a case we * need to resend whatever was the last sent * command. */ if (test_bit(HCI_INIT, &hdev->flags) && opcode == HCI_OP_RESET) hci_resend_last(hdev); return; } /* If we reach this point this event matches the last command sent */ hci_dev_clear_flag(hdev, HCI_CMD_PENDING); /* If the command succeeded and there's still more commands in * this request the request is not yet complete. */ if (!status && !hci_req_is_complete(hdev)) return; skb = hdev->req_skb; /* If this was the last command in a request the complete * callback would be found in hdev->req_skb instead of the * command queue (hdev->cmd_q). */ if (skb && bt_cb(skb)->hci.req_flags & HCI_REQ_SKB) { *req_complete_skb = bt_cb(skb)->hci.req_complete_skb; return; } if (skb && bt_cb(skb)->hci.req_complete) { *req_complete = bt_cb(skb)->hci.req_complete; return; } /* Remove all pending commands belonging to this request */ spin_lock_irqsave(&hdev->cmd_q.lock, flags); while ((skb = __skb_dequeue(&hdev->cmd_q))) { if (bt_cb(skb)->hci.req_flags & HCI_REQ_START) { __skb_queue_head(&hdev->cmd_q, skb); break; } if (bt_cb(skb)->hci.req_flags & HCI_REQ_SKB) *req_complete_skb = bt_cb(skb)->hci.req_complete_skb; else *req_complete = bt_cb(skb)->hci.req_complete; dev_kfree_skb_irq(skb); } spin_unlock_irqrestore(&hdev->cmd_q.lock, flags); } static void hci_rx_work(struct work_struct *work) { struct hci_dev *hdev = container_of(work, struct hci_dev, rx_work); struct sk_buff *skb; BT_DBG("%s", hdev->name); /* The kcov_remote functions used for collecting packet parsing * coverage information from this background thread and associate * the coverage with the syscall's thread which originally injected * the packet. This helps fuzzing the kernel. 
*/ for (; (skb = skb_dequeue(&hdev->rx_q)); kcov_remote_stop()) { kcov_remote_start_common(skb_get_kcov_handle(skb)); /* Send copy to monitor */ hci_send_to_monitor(hdev, skb); if (atomic_read(&hdev->promisc)) { /* Send copy to the sockets */ hci_send_to_sock(hdev, skb); } /* If the device has been opened in HCI_USER_CHANNEL, * the userspace has exclusive access to device. * When device is HCI_INIT, we still need to process * the data packets to the driver in order * to complete its setup(). */ if (hci_dev_test_flag(hdev, HCI_USER_CHANNEL) && !test_bit(HCI_INIT, &hdev->flags)) { kfree_skb(skb); continue; } if (test_bit(HCI_INIT, &hdev->flags)) { /* Don't process data packets in this states. */ switch (hci_skb_pkt_type(skb)) { case HCI_ACLDATA_PKT: case HCI_SCODATA_PKT: case HCI_ISODATA_PKT: kfree_skb(skb); continue; } } /* Process frame */ switch (hci_skb_pkt_type(skb)) { case HCI_EVENT_PKT: BT_DBG("%s Event packet", hdev->name); hci_event_packet(hdev, skb); break; case HCI_ACLDATA_PKT: BT_DBG("%s ACL data packet", hdev->name); hci_acldata_packet(hdev, skb); break; case HCI_SCODATA_PKT: BT_DBG("%s SCO data packet", hdev->name); hci_scodata_packet(hdev, skb); break; case HCI_ISODATA_PKT: BT_DBG("%s ISO data packet", hdev->name); hci_isodata_packet(hdev, skb); break; default: kfree_skb(skb); break; } } } static int hci_send_cmd_sync(struct hci_dev *hdev, struct sk_buff *skb) { int err; bt_dev_dbg(hdev, "skb %p", skb); kfree_skb(hdev->sent_cmd); hdev->sent_cmd = skb_clone(skb, GFP_KERNEL); if (!hdev->sent_cmd) { skb_queue_head(&hdev->cmd_q, skb); queue_work(hdev->workqueue, &hdev->cmd_work); return -EINVAL; } if (hci_skb_opcode(skb) != HCI_OP_NOP) { err = hci_send_frame(hdev, skb); if (err < 0) { hci_cmd_sync_cancel_sync(hdev, -err); return err; } atomic_dec(&hdev->cmd_cnt); } else { err = -ENODATA; kfree_skb(skb); } if (hdev->req_status == HCI_REQ_PEND && !hci_dev_test_and_set_flag(hdev, HCI_CMD_PENDING)) { kfree_skb(hdev->req_skb); hdev->req_skb = skb_clone(hdev->sent_cmd, GFP_KERNEL); } return err; } static void hci_cmd_work(struct work_struct *work) { struct hci_dev *hdev = container_of(work, struct hci_dev, cmd_work); struct sk_buff *skb; int err; BT_DBG("%s cmd_cnt %d cmd queued %d", hdev->name, atomic_read(&hdev->cmd_cnt), skb_queue_len(&hdev->cmd_q)); /* Send queued commands */ if (atomic_read(&hdev->cmd_cnt)) { skb = skb_dequeue(&hdev->cmd_q); if (!skb) return; err = hci_send_cmd_sync(hdev, skb); if (err) return; rcu_read_lock(); if (test_bit(HCI_RESET, &hdev->flags) || hci_dev_test_flag(hdev, HCI_CMD_DRAIN_WORKQUEUE)) cancel_delayed_work(&hdev->cmd_timer); else queue_delayed_work(hdev->workqueue, &hdev->cmd_timer, HCI_CMD_TIMEOUT); rcu_read_unlock(); } }
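/*
 * A minimal, hypothetical driver sketch showing how the allocation and
 * registration API above is typically used. The sketch_* names are invented
 * for illustration and assume the usual <net/bluetooth/bluetooth.h> and
 * <net/bluetooth/hci_core.h> definitions; hci_register_dev() rejects a device
 * that lacks open/close/send callbacks, and a real driver would feed received
 * packets back into the core with hci_recv_frame().
 */
static int sketch_open(struct hci_dev *hdev)
{
	/* Bring up the transport; nothing to do in this sketch. */
	return 0;
}

static int sketch_close(struct hci_dev *hdev)
{
	return 0;
}

static int sketch_send(struct hci_dev *hdev, struct sk_buff *skb)
{
	/* A real driver hands the frame to hardware; here it is just dropped. */
	kfree_skb(skb);
	return 0;
}

static struct hci_dev *sketch_register(void)
{
	struct hci_dev *hdev = hci_alloc_dev();

	if (!hdev)
		return NULL;

	hdev->bus = HCI_VIRTUAL;
	hdev->open = sketch_open;
	hdev->close = sketch_close;
	hdev->send = sketch_send;

	if (hci_register_dev(hdev) < 0) {
		hci_free_dev(hdev);
		return NULL;
	}

	return hdev;
}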
324 325 210 114 325 322 323 323 325 325 173 76 118 118 172 171 19 170 168 161 161 10 161 161 321 321 322 322 320 49 322 321 34 40 243 47 48 321 322 322 58 322 35 321 45 324 68 2 2 2 2 2 329 329 305 302 161 39 119 12 119 45 45 108 108 43 107 107 39 105 37 108 77 43 2 2 2 2 2 292 291 41 327 324 212 321 324 173 174 174 174 2 14 9 14 2 3 13 15 15 15 39 39 39 39 6 39 323 38 50 1 37 13 36 13 50 170 171 171 171 170 170 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 
1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 // SPDX-License-Identifier: GPL-2.0-or-later /* SCTP kernel implementation * (C) Copyright IBM Corp. 2001, 2004 * Copyright (c) 1999-2000 Cisco, Inc. * Copyright (c) 1999-2001 Motorola, Inc. * Copyright (c) 2001 Intel Corp. * Copyright (c) 2001 La Monte H.P. Yarroll * * This file is part of the SCTP kernel implementation * * This module provides the abstraction for an SCTP association. * * Please send any bug reports or fixes you make to the * email address(es): * lksctp developers <linux-sctp@vger.kernel.org> * * Written or modified by: * La Monte H.P. Yarroll <piggy@acm.org> * Karl Knutson <karl@athena.chicago.il.us> * Jon Grimm <jgrimm@us.ibm.com> * Xingang Guo <xingang.guo@intel.com> * Hui Huang <hui.huang@nokia.com> * Sridhar Samudrala <sri@us.ibm.com> * Daisy Chang <daisyc@us.ibm.com> * Ryan Layer <rmlayer@us.ibm.com> * Kevin Gao <kevin.gao@intel.com> */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/types.h> #include <linux/fcntl.h> #include <linux/poll.h> #include <linux/init.h> #include <linux/slab.h> #include <linux/in.h> #include <net/ipv6.h> #include <net/sctp/sctp.h> #include <net/sctp/sm.h> /* Forward declarations for internal functions. */ static void sctp_select_active_and_retran_path(struct sctp_association *asoc); static void sctp_assoc_bh_rcv(struct work_struct *work); static void sctp_assoc_free_asconf_acks(struct sctp_association *asoc); static void sctp_assoc_free_asconf_queue(struct sctp_association *asoc); /* 1st Level Abstractions. */ /* Initialize a new association from provided memory. */ static struct sctp_association *sctp_association_init( struct sctp_association *asoc, const struct sctp_endpoint *ep, const struct sock *sk, enum sctp_scope scope, gfp_t gfp) { struct sctp_sock *sp; struct sctp_paramhdr *p; int i; /* Retrieve the SCTP per socket area. */ sp = sctp_sk((struct sock *)sk); /* Discarding const is appropriate here. */ asoc->ep = (struct sctp_endpoint *)ep; asoc->base.sk = (struct sock *)sk; asoc->base.net = sock_net(sk); sctp_endpoint_hold(asoc->ep); sock_hold(asoc->base.sk); /* Initialize the common base substructure. */ asoc->base.type = SCTP_EP_TYPE_ASSOCIATION; /* Initialize the object handling fields. */ refcount_set(&asoc->base.refcnt, 1); /* Initialize the bind addr area. */ sctp_bind_addr_init(&asoc->base.bind_addr, ep->base.bind_addr.port); asoc->state = SCTP_STATE_CLOSED; asoc->cookie_life = ms_to_ktime(sp->assocparams.sasoc_cookie_life); asoc->user_frag = sp->user_frag; /* Set the association max_retrans and RTO values from the * socket values. 
*/ asoc->max_retrans = sp->assocparams.sasoc_asocmaxrxt; asoc->pf_retrans = sp->pf_retrans; asoc->ps_retrans = sp->ps_retrans; asoc->pf_expose = sp->pf_expose; asoc->rto_initial = msecs_to_jiffies(sp->rtoinfo.srto_initial); asoc->rto_max = msecs_to_jiffies(sp->rtoinfo.srto_max); asoc->rto_min = msecs_to_jiffies(sp->rtoinfo.srto_min); /* Initialize the association's heartbeat interval based on the * sock configured value. */ asoc->hbinterval = msecs_to_jiffies(sp->hbinterval); asoc->probe_interval = msecs_to_jiffies(sp->probe_interval); asoc->encap_port = sp->encap_port; /* Initialize path max retrans value. */ asoc->pathmaxrxt = sp->pathmaxrxt; asoc->flowlabel = sp->flowlabel; asoc->dscp = sp->dscp; /* Set association default SACK delay */ asoc->sackdelay = msecs_to_jiffies(sp->sackdelay); asoc->sackfreq = sp->sackfreq; /* Set the association default flags controlling * Heartbeat, SACK delay, and Path MTU Discovery. */ asoc->param_flags = sp->param_flags; /* Initialize the maximum number of new data packets that can be sent * in a burst. */ asoc->max_burst = sp->max_burst; asoc->subscribe = sp->subscribe; /* initialize association timers */ asoc->timeouts[SCTP_EVENT_TIMEOUT_T1_COOKIE] = asoc->rto_initial; asoc->timeouts[SCTP_EVENT_TIMEOUT_T1_INIT] = asoc->rto_initial; asoc->timeouts[SCTP_EVENT_TIMEOUT_T2_SHUTDOWN] = asoc->rto_initial; /* sctpimpguide Section 2.12.2 * If the 'T5-shutdown-guard' timer is used, it SHOULD be set to the * recommended value of 5 times 'RTO.Max'. */ asoc->timeouts[SCTP_EVENT_TIMEOUT_T5_SHUTDOWN_GUARD] = 5 * asoc->rto_max; asoc->timeouts[SCTP_EVENT_TIMEOUT_SACK] = asoc->sackdelay; asoc->timeouts[SCTP_EVENT_TIMEOUT_AUTOCLOSE] = (unsigned long)sp->autoclose * HZ; /* Initializes the timers */ for (i = SCTP_EVENT_TIMEOUT_NONE; i < SCTP_NUM_TIMEOUT_TYPES; ++i) timer_setup(&asoc->timers[i], sctp_timer_events[i], 0); /* Pull default initialization values from the sock options. * Note: This assumes that the values have already been * validated in the sock. */ asoc->c.sinit_max_instreams = sp->initmsg.sinit_max_instreams; asoc->c.sinit_num_ostreams = sp->initmsg.sinit_num_ostreams; asoc->max_init_attempts = sp->initmsg.sinit_max_attempts; asoc->max_init_timeo = msecs_to_jiffies(sp->initmsg.sinit_max_init_timeo); /* Set the local window size for receive. * This is also the rcvbuf space per association. * RFC 6 - A SCTP receiver MUST be able to receive a minimum of * 1500 bytes in one SCTP packet. */ if ((sk->sk_rcvbuf/2) < SCTP_DEFAULT_MINWINDOW) asoc->rwnd = SCTP_DEFAULT_MINWINDOW; else asoc->rwnd = sk->sk_rcvbuf/2; asoc->a_rwnd = asoc->rwnd; /* Use my own max window until I learn something better. */ asoc->peer.rwnd = SCTP_DEFAULT_MAXWINDOW; /* Initialize the receive memory counter */ atomic_set(&asoc->rmem_alloc, 0); init_waitqueue_head(&asoc->wait); asoc->c.my_vtag = sctp_generate_tag(ep); asoc->c.my_port = ep->base.bind_addr.port; asoc->c.initial_tsn = sctp_generate_tsn(ep); asoc->next_tsn = asoc->c.initial_tsn; asoc->ctsn_ack_point = asoc->next_tsn - 1; asoc->adv_peer_ack_point = asoc->ctsn_ack_point; asoc->highest_sacked = asoc->ctsn_ack_point; asoc->last_cwr_tsn = asoc->ctsn_ack_point; /* ADDIP Section 4.1 Asconf Chunk Procedures * * When an endpoint has an ASCONF signaled change to be sent to the * remote endpoint it should do the following: * ... * A2) a serial number should be assigned to the chunk. The serial * number SHOULD be a monotonically increasing number. 
The serial * numbers SHOULD be initialized at the start of the * association to the same value as the initial TSN. */ asoc->addip_serial = asoc->c.initial_tsn; asoc->strreset_outseq = asoc->c.initial_tsn; INIT_LIST_HEAD(&asoc->addip_chunk_list); INIT_LIST_HEAD(&asoc->asconf_ack_list); /* Make an empty list of remote transport addresses. */ INIT_LIST_HEAD(&asoc->peer.transport_addr_list); /* RFC 2960 5.1 Normal Establishment of an Association * * After the reception of the first data chunk in an * association the endpoint must immediately respond with a * sack to acknowledge the data chunk. Subsequent * acknowledgements should be done as described in Section * 6.2. * * [We implement this by telling a new association that it * already received one packet.] */ asoc->peer.sack_needed = 1; asoc->peer.sack_generation = 1; /* Create an input queue. */ sctp_inq_init(&asoc->base.inqueue); sctp_inq_set_th_handler(&asoc->base.inqueue, sctp_assoc_bh_rcv); /* Create an output queue. */ sctp_outq_init(asoc, &asoc->outqueue); sctp_ulpq_init(&asoc->ulpq, asoc); if (sctp_stream_init(&asoc->stream, asoc->c.sinit_num_ostreams, 0, gfp)) goto stream_free; /* Initialize default path MTU. */ asoc->pathmtu = sp->pathmtu; sctp_assoc_update_frag_point(asoc); /* Assume that peer would support both address types unless we are * told otherwise. */ asoc->peer.ipv4_address = 1; if (asoc->base.sk->sk_family == PF_INET6) asoc->peer.ipv6_address = 1; INIT_LIST_HEAD(&asoc->asocs); asoc->default_stream = sp->default_stream; asoc->default_ppid = sp->default_ppid; asoc->default_flags = sp->default_flags; asoc->default_context = sp->default_context; asoc->default_timetolive = sp->default_timetolive; asoc->default_rcv_context = sp->default_rcv_context; /* AUTH related initializations */ INIT_LIST_HEAD(&asoc->endpoint_shared_keys); if (sctp_auth_asoc_copy_shkeys(ep, asoc, gfp)) goto stream_free; asoc->active_key_id = ep->active_key_id; asoc->strreset_enable = ep->strreset_enable; /* Save the hmacs and chunks list into this association */ if (ep->auth_hmacs_list) memcpy(asoc->c.auth_hmacs, ep->auth_hmacs_list, ntohs(ep->auth_hmacs_list->param_hdr.length)); if (ep->auth_chunk_list) memcpy(asoc->c.auth_chunks, ep->auth_chunk_list, ntohs(ep->auth_chunk_list->param_hdr.length)); /* Get the AUTH random number for this association */ p = (struct sctp_paramhdr *)asoc->c.auth_random; p->type = SCTP_PARAM_RANDOM; p->length = htons(sizeof(*p) + SCTP_AUTH_RANDOM_LENGTH); get_random_bytes(p+1, SCTP_AUTH_RANDOM_LENGTH); return asoc; stream_free: sctp_stream_free(&asoc->stream); sock_put(asoc->base.sk); sctp_endpoint_put(asoc->ep); return NULL; } /* Allocate and initialize a new association */ struct sctp_association *sctp_association_new(const struct sctp_endpoint *ep, const struct sock *sk, enum sctp_scope scope, gfp_t gfp) { struct sctp_association *asoc; asoc = kzalloc(sizeof(*asoc), gfp); if (!asoc) goto fail; if (!sctp_association_init(asoc, ep, sk, scope, gfp)) goto fail_init; SCTP_DBG_OBJCNT_INC(assoc); pr_debug("Created asoc %p\n", asoc); return asoc; fail_init: kfree(asoc); fail: return NULL; } /* Free this association if possible. There may still be users, so * the actual deallocation may be delayed. */ void sctp_association_free(struct sctp_association *asoc) { struct sock *sk = asoc->base.sk; struct sctp_transport *transport; struct list_head *pos, *temp; int i; /* Only real associations count against the endpoint, so * don't bother for if this is a temporary association. 
*/ if (!list_empty(&asoc->asocs)) { list_del(&asoc->asocs); /* Decrement the backlog value for a TCP-style listening * socket. */ if (sctp_style(sk, TCP) && sctp_sstate(sk, LISTENING)) sk_acceptq_removed(sk); } /* Mark as dead, so other users can know this structure is * going away. */ asoc->base.dead = true; /* Dispose of any data lying around in the outqueue. */ sctp_outq_free(&asoc->outqueue); /* Dispose of any pending messages for the upper layer. */ sctp_ulpq_free(&asoc->ulpq); /* Dispose of any pending chunks on the inqueue. */ sctp_inq_free(&asoc->base.inqueue); sctp_tsnmap_free(&asoc->peer.tsn_map); /* Free stream information. */ sctp_stream_free(&asoc->stream); if (asoc->strreset_chunk) sctp_chunk_free(asoc->strreset_chunk); /* Clean up the bound address list. */ sctp_bind_addr_free(&asoc->base.bind_addr); /* Do we need to go through all of our timers and * delete them? To be safe we will try to delete all, but we * should be able to go through and make a guess based * on our state. */ for (i = SCTP_EVENT_TIMEOUT_NONE; i < SCTP_NUM_TIMEOUT_TYPES; ++i) { if (timer_delete(&asoc->timers[i])) sctp_association_put(asoc); } /* Free peer's cached cookie. */ kfree(asoc->peer.cookie); kfree(asoc->peer.peer_random); kfree(asoc->peer.peer_chunks); kfree(asoc->peer.peer_hmacs); /* Release the transport structures. */ list_for_each_safe(pos, temp, &asoc->peer.transport_addr_list) { transport = list_entry(pos, struct sctp_transport, transports); list_del_rcu(pos); sctp_unhash_transport(transport); sctp_transport_free(transport); } asoc->peer.transport_count = 0; sctp_asconf_queue_teardown(asoc); /* Free pending address space being deleted */ kfree(asoc->asconf_addr_del_pending); /* AUTH - Free the endpoint shared keys */ sctp_auth_destroy_keys(&asoc->endpoint_shared_keys); /* AUTH - Free the association shared key */ sctp_auth_key_put(asoc->asoc_shared_key); sctp_association_put(asoc); } /* Cleanup and free up an association. */ static void sctp_association_destroy(struct sctp_association *asoc) { if (unlikely(!asoc->base.dead)) { WARN(1, "Attempt to destroy undead association %p!\n", asoc); return; } sctp_endpoint_put(asoc->ep); sock_put(asoc->base.sk); if (asoc->assoc_id != 0) { spin_lock_bh(&sctp_assocs_id_lock); idr_remove(&sctp_assocs_id, asoc->assoc_id); spin_unlock_bh(&sctp_assocs_id_lock); } WARN_ON(atomic_read(&asoc->rmem_alloc)); kfree_rcu(asoc, rcu); SCTP_DBG_OBJCNT_DEC(assoc); } /* Change the primary destination address for the peer. */ void sctp_assoc_set_primary(struct sctp_association *asoc, struct sctp_transport *transport) { int changeover = 0; /* it's a changeover only if we already have a primary path * that we are changing */ if (asoc->peer.primary_path != NULL && asoc->peer.primary_path != transport) changeover = 1 ; asoc->peer.primary_path = transport; sctp_ulpevent_notify_peer_addr_change(transport, SCTP_ADDR_MADE_PRIM, 0); /* Set a default msg_name for events. */ memcpy(&asoc->peer.primary_addr, &transport->ipaddr, sizeof(union sctp_addr)); /* If the primary path is changing, assume that the * user wants to use this new path. */ if ((transport->state == SCTP_ACTIVE) || (transport->state == SCTP_UNKNOWN)) asoc->peer.active_path = transport; /* * SFR-CACC algorithm: * Upon the receipt of a request to change the primary * destination address, on the data structure for the new * primary destination, the sender MUST do the following: * * 1) If CHANGEOVER_ACTIVE is set, then there was a switch * to this destination address earlier. 
The sender MUST set * CYCLING_CHANGEOVER to indicate that this switch is a * double switch to the same destination address. * * Really, only bother is we have data queued or outstanding on * the association. */ if (!asoc->outqueue.outstanding_bytes && !asoc->outqueue.out_qlen) return; if (transport->cacc.changeover_active) transport->cacc.cycling_changeover = changeover; /* 2) The sender MUST set CHANGEOVER_ACTIVE to indicate that * a changeover has occurred. */ transport->cacc.changeover_active = changeover; /* 3) The sender MUST store the next TSN to be sent in * next_tsn_at_change. */ transport->cacc.next_tsn_at_change = asoc->next_tsn; } /* Remove a transport from an association. */ void sctp_assoc_rm_peer(struct sctp_association *asoc, struct sctp_transport *peer) { struct sctp_transport *transport; struct list_head *pos; struct sctp_chunk *ch; pr_debug("%s: association:%p addr:%pISpc\n", __func__, asoc, &peer->ipaddr.sa); /* If we are to remove the current retran_path, update it * to the next peer before removing this peer from the list. */ if (asoc->peer.retran_path == peer) sctp_assoc_update_retran_path(asoc); /* Remove this peer from the list. */ list_del_rcu(&peer->transports); /* Remove this peer from the transport hashtable */ sctp_unhash_transport(peer); /* Get the first transport of asoc. */ pos = asoc->peer.transport_addr_list.next; transport = list_entry(pos, struct sctp_transport, transports); /* Update any entries that match the peer to be deleted. */ if (asoc->peer.primary_path == peer) sctp_assoc_set_primary(asoc, transport); if (asoc->peer.active_path == peer) asoc->peer.active_path = transport; if (asoc->peer.retran_path == peer) asoc->peer.retran_path = transport; if (asoc->peer.last_data_from == peer) asoc->peer.last_data_from = transport; if (asoc->strreset_chunk && asoc->strreset_chunk->transport == peer) { asoc->strreset_chunk->transport = transport; sctp_transport_reset_reconf_timer(transport); } /* If we remove the transport an INIT was last sent to, set it to * NULL. Combined with the update of the retran path above, this * will cause the next INIT to be sent to the next available * transport, maintaining the cycle. */ if (asoc->init_last_sent_to == peer) asoc->init_last_sent_to = NULL; /* If we remove the transport an SHUTDOWN was last sent to, set it * to NULL. Combined with the update of the retran path above, this * will cause the next SHUTDOWN to be sent to the next available * transport, maintaining the cycle. */ if (asoc->shutdown_last_sent_to == peer) asoc->shutdown_last_sent_to = NULL; /* If we remove the transport an ASCONF was last sent to, set it to * NULL. */ if (asoc->addip_last_asconf && asoc->addip_last_asconf->transport == peer) asoc->addip_last_asconf->transport = NULL; /* If we have something on the transmitted list, we have to * save it off. The best place is the active path. */ if (!list_empty(&peer->transmitted)) { struct sctp_transport *active = asoc->peer.active_path; /* Reset the transport of each chunk on this list */ list_for_each_entry(ch, &peer->transmitted, transmitted_list) { ch->transport = NULL; ch->rtt_in_progress = 0; } list_splice_tail_init(&peer->transmitted, &active->transmitted); /* Start a T3 timer here in case it wasn't running so * that these migrated packets have a chance to get * retransmitted. 
*/ if (!timer_pending(&active->T3_rtx_timer)) if (!mod_timer(&active->T3_rtx_timer, jiffies + active->rto)) sctp_transport_hold(active); } list_for_each_entry(ch, &asoc->outqueue.out_chunk_list, list) if (ch->transport == peer) ch->transport = NULL; asoc->peer.transport_count--; sctp_ulpevent_notify_peer_addr_change(peer, SCTP_ADDR_REMOVED, 0); sctp_transport_free(peer); } /* Add a transport address to an association. */ struct sctp_transport *sctp_assoc_add_peer(struct sctp_association *asoc, const union sctp_addr *addr, const gfp_t gfp, const int peer_state) { struct sctp_transport *peer; struct sctp_sock *sp; unsigned short port; sp = sctp_sk(asoc->base.sk); /* AF_INET and AF_INET6 share common port field. */ port = ntohs(addr->v4.sin_port); pr_debug("%s: association:%p addr:%pISpc state:%d\n", __func__, asoc, &addr->sa, peer_state); /* Set the port if it has not been set yet. */ if (0 == asoc->peer.port) asoc->peer.port = port; /* Check to see if this is a duplicate. */ peer = sctp_assoc_lookup_paddr(asoc, addr); if (peer) { /* An UNKNOWN state is only set on transports added by * user in sctp_connectx() call. Such transports should be * considered CONFIRMED per RFC 4960, Section 5.4. */ if (peer->state == SCTP_UNKNOWN) { peer->state = SCTP_ACTIVE; } return peer; } peer = sctp_transport_new(asoc->base.net, addr, gfp); if (!peer) return NULL; sctp_transport_set_owner(peer, asoc); /* Initialize the peer's heartbeat interval based on the * association configured value. */ peer->hbinterval = asoc->hbinterval; peer->probe_interval = asoc->probe_interval; peer->encap_port = asoc->encap_port; /* Set the path max_retrans. */ peer->pathmaxrxt = asoc->pathmaxrxt; /* And the partial failure retrans threshold */ peer->pf_retrans = asoc->pf_retrans; /* And the primary path switchover retrans threshold */ peer->ps_retrans = asoc->ps_retrans; /* Initialize the peer's SACK delay timeout based on the * association configured value. */ peer->sackdelay = asoc->sackdelay; peer->sackfreq = asoc->sackfreq; if (addr->sa.sa_family == AF_INET6) { __be32 info = addr->v6.sin6_flowinfo; if (info) { peer->flowlabel = ntohl(info & IPV6_FLOWLABEL_MASK); peer->flowlabel |= SCTP_FLOWLABEL_SET_MASK; } else { peer->flowlabel = asoc->flowlabel; } } peer->dscp = asoc->dscp; /* Enable/disable heartbeat, SACK delay, and path MTU discovery * based on association setting. */ peer->param_flags = asoc->param_flags; /* Initialize the pmtu of the transport. */ sctp_transport_route(peer, NULL, sp); /* If this is the first transport addr on this association, * initialize the association PMTU to the peer's PMTU. * If not and the current association PMTU is higher than the new * peer's PMTU, reset the association PMTU to the new peer's PMTU. */ sctp_assoc_set_pmtu(asoc, asoc->pathmtu ? min_t(int, peer->pathmtu, asoc->pathmtu) : peer->pathmtu); peer->pmtu_pending = 0; /* The asoc->peer.port might not be meaningful yet, but * initialize the packet structure anyway. */ sctp_packet_init(&peer->packet, peer, asoc->base.bind_addr.port, asoc->peer.port); /* 7.2.1 Slow-Start * * o The initial cwnd before DATA transmission or after a sufficiently * long idle period MUST be set to * min(4*MTU, max(2*MTU, 4380 bytes)) * * o The initial value of ssthresh MAY be arbitrarily high * (for example, implementations MAY use the size of the * receiver advertised window). 
*/ peer->cwnd = min(4*asoc->pathmtu, max_t(__u32, 2*asoc->pathmtu, 4380)); /* At this point, we may not have the receiver's advertised window, * so initialize ssthresh to the default value and it will be set * later when we process the INIT. */ peer->ssthresh = SCTP_DEFAULT_MAXWINDOW; peer->partial_bytes_acked = 0; peer->flight_size = 0; peer->burst_limited = 0; /* Set the transport's RTO.initial value */ peer->rto = asoc->rto_initial; sctp_max_rto(asoc, peer); /* Set the peer's active state. */ peer->state = peer_state; /* Add this peer into the transport hashtable */ if (sctp_hash_transport(peer)) { sctp_transport_free(peer); return NULL; } sctp_transport_pl_reset(peer); /* Attach the remote transport to our asoc. */ list_add_tail_rcu(&peer->transports, &asoc->peer.transport_addr_list); asoc->peer.transport_count++; sctp_ulpevent_notify_peer_addr_change(peer, SCTP_ADDR_ADDED, 0); /* If we do not yet have a primary path, set one. */ if (!asoc->peer.primary_path) { sctp_assoc_set_primary(asoc, peer); asoc->peer.retran_path = peer; } if (asoc->peer.active_path == asoc->peer.retran_path && peer->state != SCTP_UNCONFIRMED) { asoc->peer.retran_path = peer; } return peer; } /* Lookup a transport by address. */ struct sctp_transport *sctp_assoc_lookup_paddr( const struct sctp_association *asoc, const union sctp_addr *address) { struct sctp_transport *t; /* Cycle through all transports searching for a peer address. */ list_for_each_entry(t, &asoc->peer.transport_addr_list, transports) { if (sctp_cmp_addr_exact(address, &t->ipaddr)) return t; } return NULL; } /* Remove all transports except a give one */ void sctp_assoc_del_nonprimary_peers(struct sctp_association *asoc, struct sctp_transport *primary) { struct sctp_transport *temp; struct sctp_transport *t; list_for_each_entry_safe(t, temp, &asoc->peer.transport_addr_list, transports) { /* if the current transport is not the primary one, delete it */ if (t != primary) sctp_assoc_rm_peer(asoc, t); } } /* Engage in transport control operations. * Mark the transport up or down and send a notification to the user. * Select and update the new active and retran paths. */ void sctp_assoc_control_transport(struct sctp_association *asoc, struct sctp_transport *transport, enum sctp_transport_cmd command, sctp_sn_error_t error) { int spc_state = SCTP_ADDR_AVAILABLE; bool ulp_notify = true; /* Record the transition on the transport. */ switch (command) { case SCTP_TRANSPORT_UP: /* If we are moving from UNCONFIRMED state due * to heartbeat success, report the SCTP_ADDR_CONFIRMED * state to the user, otherwise report SCTP_ADDR_AVAILABLE. */ if (transport->state == SCTP_PF && asoc->pf_expose != SCTP_PF_EXPOSE_ENABLE) ulp_notify = false; else if (transport->state == SCTP_UNCONFIRMED && error == SCTP_HEARTBEAT_SUCCESS) spc_state = SCTP_ADDR_CONFIRMED; transport->state = SCTP_ACTIVE; sctp_transport_pl_reset(transport); break; case SCTP_TRANSPORT_DOWN: /* If the transport was never confirmed, do not transition it * to inactive state. Also, release the cached route since * there may be a better route next time. 
*/ if (transport->state != SCTP_UNCONFIRMED) { transport->state = SCTP_INACTIVE; sctp_transport_pl_reset(transport); spc_state = SCTP_ADDR_UNREACHABLE; } else { sctp_transport_dst_release(transport); ulp_notify = false; } break; case SCTP_TRANSPORT_PF: transport->state = SCTP_PF; if (asoc->pf_expose != SCTP_PF_EXPOSE_ENABLE) ulp_notify = false; else spc_state = SCTP_ADDR_POTENTIALLY_FAILED; break; default: return; } /* Generate and send a SCTP_PEER_ADDR_CHANGE notification * to the user. */ if (ulp_notify) sctp_ulpevent_notify_peer_addr_change(transport, spc_state, error); /* Select new active and retran paths. */ sctp_select_active_and_retran_path(asoc); } /* Hold a reference to an association. */ void sctp_association_hold(struct sctp_association *asoc) { refcount_inc(&asoc->base.refcnt); } /* Release a reference to an association and cleanup * if there are no more references. */ void sctp_association_put(struct sctp_association *asoc) { if (refcount_dec_and_test(&asoc->base.refcnt)) sctp_association_destroy(asoc); } /* Allocate the next TSN, Transmission Sequence Number, for the given * association. */ __u32 sctp_association_get_next_tsn(struct sctp_association *asoc) { /* From Section 1.6 Serial Number Arithmetic: * Transmission Sequence Numbers wrap around when they reach * 2**32 - 1. That is, the next TSN a DATA chunk MUST use * after transmitting TSN = 2*32 - 1 is TSN = 0. */ __u32 retval = asoc->next_tsn; asoc->next_tsn++; asoc->unack_data++; return retval; } /* Compare two addresses to see if they match. Wildcard addresses * only match themselves. */ int sctp_cmp_addr_exact(const union sctp_addr *ss1, const union sctp_addr *ss2) { struct sctp_af *af; af = sctp_get_af_specific(ss1->sa.sa_family); if (unlikely(!af)) return 0; return af->cmp_addr(ss1, ss2); } /* Return an ecne chunk to get prepended to a packet. * Note: We are sly and return a shared, prealloced chunk. FIXME: * No we don't, but we could/should. */ struct sctp_chunk *sctp_get_ecne_prepend(struct sctp_association *asoc) { if (!asoc->need_ecne) return NULL; /* Send ECNE if needed. * Not being able to allocate a chunk here is not deadly. */ return sctp_make_ecne(asoc, asoc->last_ecne_tsn); } /* * Find which transport this TSN was sent on. */ struct sctp_transport *sctp_assoc_lookup_tsn(struct sctp_association *asoc, __u32 tsn) { struct sctp_transport *active; struct sctp_transport *match; struct sctp_transport *transport; struct sctp_chunk *chunk; __be32 key = htonl(tsn); match = NULL; /* * FIXME: In general, find a more efficient data structure for * searching. */ /* * The general strategy is to search each transport's transmitted * list. Return which transport this TSN lives on. * * Let's be hopeful and check the active_path first. * Another optimization would be to know if there is only one * outbound path and not have to look for the TSN at all. * */ active = asoc->peer.active_path; list_for_each_entry(chunk, &active->transmitted, transmitted_list) { if (key == chunk->subh.data_hdr->tsn) { match = active; goto out; } } /* If not found, go search all the other transports. */ list_for_each_entry(transport, &asoc->peer.transport_addr_list, transports) { if (transport == active) continue; list_for_each_entry(chunk, &transport->transmitted, transmitted_list) { if (key == chunk->subh.data_hdr->tsn) { match = transport; goto out; } } } out: return match; } /* Do delayed input processing. This is scheduled by sctp_rcv(). 
*/ static void sctp_assoc_bh_rcv(struct work_struct *work) { struct sctp_association *asoc = container_of(work, struct sctp_association, base.inqueue.immediate); struct net *net = asoc->base.net; union sctp_subtype subtype; struct sctp_endpoint *ep; struct sctp_chunk *chunk; struct sctp_inq *inqueue; int first_time = 1; /* is this the first time through the loop */ int error = 0; int state; /* The association should be held so we should be safe. */ ep = asoc->ep; inqueue = &asoc->base.inqueue; sctp_association_hold(asoc); while (NULL != (chunk = sctp_inq_pop(inqueue))) { state = asoc->state; subtype = SCTP_ST_CHUNK(chunk->chunk_hdr->type); /* If the first chunk in the packet is AUTH, do special * processing specified in Section 6.3 of SCTP-AUTH spec */ if (first_time && subtype.chunk == SCTP_CID_AUTH) { struct sctp_chunkhdr *next_hdr; next_hdr = sctp_inq_peek(inqueue); if (!next_hdr) goto normal; /* If the next chunk is COOKIE-ECHO, skip the AUTH * chunk while saving a pointer to it so we can do * Authentication later (during cookie-echo * processing). */ if (next_hdr->type == SCTP_CID_COOKIE_ECHO) { chunk->auth_chunk = skb_clone(chunk->skb, GFP_ATOMIC); chunk->auth = 1; continue; } } normal: /* SCTP-AUTH, Section 6.3: * The receiver has a list of chunk types which it expects * to be received only after an AUTH-chunk. This list has * been sent to the peer during the association setup. It * MUST silently discard these chunks if they are not placed * after an AUTH chunk in the packet. */ if (sctp_auth_recv_cid(subtype.chunk, asoc) && !chunk->auth) continue; /* Remember where the last DATA chunk came from so we * know where to send the SACK. */ if (sctp_chunk_is_data(chunk)) asoc->peer.last_data_from = chunk->transport; else { SCTP_INC_STATS(net, SCTP_MIB_INCTRLCHUNKS); asoc->stats.ictrlchunks++; if (chunk->chunk_hdr->type == SCTP_CID_SACK) asoc->stats.isacks++; } if (chunk->transport) chunk->transport->last_time_heard = ktime_get(); /* Run through the state machine. */ error = sctp_do_sm(net, SCTP_EVENT_T_CHUNK, subtype, state, ep, asoc, chunk, GFP_ATOMIC); /* Check to see if the association is freed in response to * the incoming chunk. If so, get out of the while loop. */ if (asoc->base.dead) break; /* If there is an error on chunk, discard this packet. */ if (error && chunk) chunk->pdiscard = 1; if (first_time) first_time = 0; } sctp_association_put(asoc); } /* This routine moves an association from its old sk to a new sk. */ void sctp_assoc_migrate(struct sctp_association *assoc, struct sock *newsk) { struct sctp_sock *newsp = sctp_sk(newsk); struct sock *oldsk = assoc->base.sk; /* Delete the association from the old endpoint's list of * associations. */ list_del_init(&assoc->asocs); /* Decrement the backlog value for a TCP-style socket. */ if (sctp_style(oldsk, TCP)) sk_acceptq_removed(oldsk); /* Release references to the old endpoint and the sock. */ sctp_endpoint_put(assoc->ep); sock_put(assoc->base.sk); /* Get a reference to the new endpoint. */ assoc->ep = newsp->ep; sctp_endpoint_hold(assoc->ep); /* Get a reference to the new sock. */ assoc->base.sk = newsk; sock_hold(assoc->base.sk); /* Add the association to the new endpoint's list of associations. */ sctp_endpoint_add_asoc(newsp->ep, assoc); } /* Update an association (possibly from unexpected COOKIE-ECHO processing). */ int sctp_assoc_update(struct sctp_association *asoc, struct sctp_association *new) { struct sctp_transport *trans; struct list_head *pos, *temp; /* Copy in new parameters of peer. 
*/ asoc->c = new->c; asoc->peer.rwnd = new->peer.rwnd; asoc->peer.sack_needed = new->peer.sack_needed; asoc->peer.auth_capable = new->peer.auth_capable; asoc->peer.i = new->peer.i; if (!sctp_tsnmap_init(&asoc->peer.tsn_map, SCTP_TSN_MAP_INITIAL, asoc->peer.i.initial_tsn, GFP_ATOMIC)) return -ENOMEM; /* Remove any peer addresses not present in the new association. */ list_for_each_safe(pos, temp, &asoc->peer.transport_addr_list) { trans = list_entry(pos, struct sctp_transport, transports); if (!sctp_assoc_lookup_paddr(new, &trans->ipaddr)) { sctp_assoc_rm_peer(asoc, trans); continue; } if (asoc->state >= SCTP_STATE_ESTABLISHED) sctp_transport_reset(trans); } /* If the case is A (association restart), use * initial_tsn as next_tsn. If the case is B, use * current next_tsn in case data sent to peer * has been discarded and needs retransmission. */ if (asoc->state >= SCTP_STATE_ESTABLISHED) { asoc->next_tsn = new->next_tsn; asoc->ctsn_ack_point = new->ctsn_ack_point; asoc->adv_peer_ack_point = new->adv_peer_ack_point; /* Reinitialize SSN for both local streams * and peer's streams. */ sctp_stream_clear(&asoc->stream); /* Flush the ULP reassembly and ordered queue. * Any data there will now be stale and will * cause problems. */ sctp_ulpq_flush(&asoc->ulpq); /* reset the overall association error count so * that the restarted association doesn't get torn * down on the next retransmission timer. */ asoc->overall_error_count = 0; } else { /* Add any peer addresses from the new association. */ list_for_each_entry(trans, &new->peer.transport_addr_list, transports) if (!sctp_assoc_add_peer(asoc, &trans->ipaddr, GFP_ATOMIC, trans->state)) return -ENOMEM; asoc->ctsn_ack_point = asoc->next_tsn - 1; asoc->adv_peer_ack_point = asoc->ctsn_ack_point; if (sctp_state(asoc, COOKIE_WAIT)) sctp_stream_update(&asoc->stream, &new->stream); /* get a new assoc id if we don't have one yet. */ if (sctp_assoc_set_id(asoc, GFP_ATOMIC)) return -ENOMEM; } /* SCTP-AUTH: Save the peer parameters from the new associations * and also move the association shared keys over */ kfree(asoc->peer.peer_random); asoc->peer.peer_random = new->peer.peer_random; new->peer.peer_random = NULL; kfree(asoc->peer.peer_chunks); asoc->peer.peer_chunks = new->peer.peer_chunks; new->peer.peer_chunks = NULL; kfree(asoc->peer.peer_hmacs); asoc->peer.peer_hmacs = new->peer.peer_hmacs; new->peer.peer_hmacs = NULL; return sctp_auth_asoc_init_active_key(asoc, GFP_ATOMIC); } /* Update the retran path for sending a retransmitted packet. * See also RFC4960, 6.4. Multi-Homed SCTP Endpoints: * * When there is outbound data to send and the primary path * becomes inactive (e.g., due to failures), or where the * SCTP user explicitly requests to send data to an * inactive destination transport address, before reporting * an error to its ULP, the SCTP endpoint should try to send * the data to an alternate active destination transport * address if one exists. * * When retransmitting data that timed out, if the endpoint * is multihomed, it should consider each source-destination * address pair in its retransmission selection policy. * When retransmitting timed-out data, the endpoint should * attempt to pick the most divergent source-destination * pair from the original source-destination pair to which * the packet was transmitted. * * Note: Rules for picking the most divergent source-destination * pair are an implementation decision and are not specified * within this document. 
* * Our basic strategy is to round-robin transports in priorities * according to sctp_trans_score() e.g., if no such * transport with state SCTP_ACTIVE exists, round-robin through * SCTP_UNKNOWN, etc. You get the picture. */ static u8 sctp_trans_score(const struct sctp_transport *trans) { switch (trans->state) { case SCTP_ACTIVE: return 3; /* best case */ case SCTP_UNKNOWN: return 2; case SCTP_PF: return 1; default: /* case SCTP_INACTIVE */ return 0; /* worst case */ } } static struct sctp_transport *sctp_trans_elect_tie(struct sctp_transport *trans1, struct sctp_transport *trans2) { if (trans1->error_count > trans2->error_count) { return trans2; } else if (trans1->error_count == trans2->error_count && ktime_after(trans2->last_time_heard, trans1->last_time_heard)) { return trans2; } else { return trans1; } } static struct sctp_transport *sctp_trans_elect_best(struct sctp_transport *curr, struct sctp_transport *best) { u8 score_curr, score_best; if (best == NULL || curr == best) return curr; score_curr = sctp_trans_score(curr); score_best = sctp_trans_score(best); /* First, try a score-based selection if both transport states * differ. If we're in a tie, lets try to make a more clever * decision here based on error counts and last time heard. */ if (score_curr > score_best) return curr; else if (score_curr == score_best) return sctp_trans_elect_tie(best, curr); else return best; } void sctp_assoc_update_retran_path(struct sctp_association *asoc) { struct sctp_transport *trans = asoc->peer.retran_path; struct sctp_transport *trans_next = NULL; /* We're done as we only have the one and only path. */ if (asoc->peer.transport_count == 1) return; /* If active_path and retran_path are the same and active, * then this is the only active path. Use it. */ if (asoc->peer.active_path == asoc->peer.retran_path && asoc->peer.active_path->state == SCTP_ACTIVE) return; /* Iterate from retran_path's successor back to retran_path. */ for (trans = list_next_entry(trans, transports); 1; trans = list_next_entry(trans, transports)) { /* Manually skip the head element. */ if (&trans->transports == &asoc->peer.transport_addr_list) continue; if (trans->state == SCTP_UNCONFIRMED) continue; trans_next = sctp_trans_elect_best(trans, trans_next); /* Active is good enough for immediate return. */ if (trans_next->state == SCTP_ACTIVE) break; /* We've reached the end, time to update path. */ if (trans == asoc->peer.retran_path) break; } asoc->peer.retran_path = trans_next; pr_debug("%s: association:%p updated new path to addr:%pISpc\n", __func__, asoc, &asoc->peer.retran_path->ipaddr.sa); } static void sctp_select_active_and_retran_path(struct sctp_association *asoc) { struct sctp_transport *trans, *trans_pri = NULL, *trans_sec = NULL; struct sctp_transport *trans_pf = NULL; /* Look for the two most recently used active transports. */ list_for_each_entry(trans, &asoc->peer.transport_addr_list, transports) { /* Skip uninteresting transports. */ if (trans->state == SCTP_INACTIVE || trans->state == SCTP_UNCONFIRMED) continue; /* Keep track of the best PF transport from our * list in case we don't find an active one. */ if (trans->state == SCTP_PF) { trans_pf = sctp_trans_elect_best(trans, trans_pf); continue; } /* For active transports, pick the most recent ones. 
*/ if (trans_pri == NULL || ktime_after(trans->last_time_heard, trans_pri->last_time_heard)) { trans_sec = trans_pri; trans_pri = trans; } else if (trans_sec == NULL || ktime_after(trans->last_time_heard, trans_sec->last_time_heard)) { trans_sec = trans; } } /* RFC 2960 6.4 Multi-Homed SCTP Endpoints * * By default, an endpoint should always transmit to the primary * path, unless the SCTP user explicitly specifies the * destination transport address (and possibly source transport * address) to use. [If the primary is active but not most recent, * bump the most recently used transport.] */ if ((asoc->peer.primary_path->state == SCTP_ACTIVE || asoc->peer.primary_path->state == SCTP_UNKNOWN) && asoc->peer.primary_path != trans_pri) { trans_sec = trans_pri; trans_pri = asoc->peer.primary_path; } /* We did not find anything useful for a possible retransmission * path; either primary path that we found is the same as * the current one, or we didn't generally find an active one. */ if (trans_sec == NULL) trans_sec = trans_pri; /* If we failed to find a usable transport, just camp on the * active or pick a PF iff it's the better choice. */ if (trans_pri == NULL) { trans_pri = sctp_trans_elect_best(asoc->peer.active_path, trans_pf); trans_sec = trans_pri; } /* Set the active and retran transports. */ asoc->peer.active_path = trans_pri; asoc->peer.retran_path = trans_sec; } struct sctp_transport * sctp_assoc_choose_alter_transport(struct sctp_association *asoc, struct sctp_transport *last_sent_to) { /* If this is the first time packet is sent, use the active path, * else use the retran path. If the last packet was sent over the * retran path, update the retran path and use it. */ if (last_sent_to == NULL) { return asoc->peer.active_path; } else { if (last_sent_to == asoc->peer.retran_path) sctp_assoc_update_retran_path(asoc); return asoc->peer.retran_path; } } void sctp_assoc_update_frag_point(struct sctp_association *asoc) { int frag = sctp_mtu_payload(sctp_sk(asoc->base.sk), asoc->pathmtu, sctp_datachk_len(&asoc->stream)); if (asoc->user_frag) frag = min_t(int, frag, asoc->user_frag); frag = min_t(int, frag, SCTP_MAX_CHUNK_LEN - sctp_datachk_len(&asoc->stream)); asoc->frag_point = SCTP_TRUNC4(frag); } void sctp_assoc_set_pmtu(struct sctp_association *asoc, __u32 pmtu) { if (asoc->pathmtu != pmtu) { asoc->pathmtu = pmtu; sctp_assoc_update_frag_point(asoc); } pr_debug("%s: asoc:%p, pmtu:%d, frag_point:%d\n", __func__, asoc, asoc->pathmtu, asoc->frag_point); } /* Update the association's pmtu and frag_point by going through all the * transports. This routine is called when a transport's PMTU has changed. */ void sctp_assoc_sync_pmtu(struct sctp_association *asoc) { struct sctp_transport *t; __u32 pmtu = 0; if (!asoc) return; /* Get the lowest pmtu of all the transports. */ list_for_each_entry(t, &asoc->peer.transport_addr_list, transports) { if (t->pmtu_pending && t->dst) { sctp_transport_update_pmtu(t, atomic_read(&t->mtu_info)); t->pmtu_pending = 0; } if (!pmtu || (t->pathmtu < pmtu)) pmtu = t->pathmtu; } sctp_assoc_set_pmtu(asoc, pmtu); } /* Should we send a SACK to update our peer? 
*/ static inline bool sctp_peer_needs_update(struct sctp_association *asoc) { struct net *net = asoc->base.net; switch (asoc->state) { case SCTP_STATE_ESTABLISHED: case SCTP_STATE_SHUTDOWN_PENDING: case SCTP_STATE_SHUTDOWN_RECEIVED: case SCTP_STATE_SHUTDOWN_SENT: if ((asoc->rwnd > asoc->a_rwnd) && ((asoc->rwnd - asoc->a_rwnd) >= max_t(__u32, (asoc->base.sk->sk_rcvbuf >> net->sctp.rwnd_upd_shift), asoc->pathmtu))) return true; break; default: break; } return false; } /* Increase asoc's rwnd by len and send any window update SACK if needed. */ void sctp_assoc_rwnd_increase(struct sctp_association *asoc, unsigned int len) { struct sctp_chunk *sack; struct timer_list *timer; if (asoc->rwnd_over) { if (asoc->rwnd_over >= len) { asoc->rwnd_over -= len; } else { asoc->rwnd += (len - asoc->rwnd_over); asoc->rwnd_over = 0; } } else { asoc->rwnd += len; } /* If we had window pressure, start recovering it * once our rwnd had reached the accumulated pressure * threshold. The idea is to recover slowly, but up * to the initial advertised window. */ if (asoc->rwnd_press) { int change = min(asoc->pathmtu, asoc->rwnd_press); asoc->rwnd += change; asoc->rwnd_press -= change; } pr_debug("%s: asoc:%p rwnd increased by %d to (%u, %u) - %u\n", __func__, asoc, len, asoc->rwnd, asoc->rwnd_over, asoc->a_rwnd); /* Send a window update SACK if the rwnd has increased by at least the * minimum of the association's PMTU and half of the receive buffer. * The algorithm used is similar to the one described in * Section 4.2.3.3 of RFC 1122. */ if (sctp_peer_needs_update(asoc)) { asoc->a_rwnd = asoc->rwnd; pr_debug("%s: sending window update SACK- asoc:%p rwnd:%u " "a_rwnd:%u\n", __func__, asoc, asoc->rwnd, asoc->a_rwnd); sack = sctp_make_sack(asoc); if (!sack) return; asoc->peer.sack_needed = 0; sctp_outq_tail(&asoc->outqueue, sack, GFP_ATOMIC); /* Stop the SACK timer. */ timer = &asoc->timers[SCTP_EVENT_TIMEOUT_SACK]; if (timer_delete(timer)) sctp_association_put(asoc); } } /* Decrease asoc's rwnd by len. */ void sctp_assoc_rwnd_decrease(struct sctp_association *asoc, unsigned int len) { int rx_count; int over = 0; if (unlikely(!asoc->rwnd || asoc->rwnd_over)) pr_debug("%s: association:%p has asoc->rwnd:%u, " "asoc->rwnd_over:%u!\n", __func__, asoc, asoc->rwnd, asoc->rwnd_over); if (asoc->ep->rcvbuf_policy) rx_count = atomic_read(&asoc->rmem_alloc); else rx_count = atomic_read(&asoc->base.sk->sk_rmem_alloc); /* If we've reached or overflowed our receive buffer, announce * a 0 rwnd if rwnd would still be positive. Store the * potential pressure overflow so that the window can be restored * back to original value. */ if (rx_count >= asoc->base.sk->sk_rcvbuf) over = 1; if (asoc->rwnd >= len) { asoc->rwnd -= len; if (over) { asoc->rwnd_press += asoc->rwnd; asoc->rwnd = 0; } } else { asoc->rwnd_over += len - asoc->rwnd; asoc->rwnd = 0; } pr_debug("%s: asoc:%p rwnd decreased by %d to (%u, %u, %u)\n", __func__, asoc, len, asoc->rwnd, asoc->rwnd_over, asoc->rwnd_press); } /* Build the bind address list for the association based on info from the * local endpoint and the remote peer. */ int sctp_assoc_set_bind_addr_from_ep(struct sctp_association *asoc, enum sctp_scope scope, gfp_t gfp) { struct sock *sk = asoc->base.sk; int flags; /* Use scoping rules to determine the subset of addresses from * the endpoint. */ flags = (PF_INET6 == sk->sk_family) ? 
SCTP_ADDR6_ALLOWED : 0; if (!inet_v6_ipv6only(sk)) flags |= SCTP_ADDR4_ALLOWED; if (asoc->peer.ipv4_address) flags |= SCTP_ADDR4_PEERSUPP; if (asoc->peer.ipv6_address) flags |= SCTP_ADDR6_PEERSUPP; return sctp_bind_addr_copy(asoc->base.net, &asoc->base.bind_addr, &asoc->ep->base.bind_addr, scope, gfp, flags); } /* Build the association's bind address list from the cookie. */ int sctp_assoc_set_bind_addr_from_cookie(struct sctp_association *asoc, struct sctp_cookie *cookie, gfp_t gfp) { struct sctp_init_chunk *peer_init = (struct sctp_init_chunk *)(cookie + 1); int var_size2 = ntohs(peer_init->chunk_hdr.length); int var_size3 = cookie->raw_addr_list_len; __u8 *raw = (__u8 *)peer_init + var_size2; return sctp_raw_to_bind_addrs(&asoc->base.bind_addr, raw, var_size3, asoc->ep->base.bind_addr.port, gfp); } /* Lookup laddr in the bind address list of an association. */ int sctp_assoc_lookup_laddr(struct sctp_association *asoc, const union sctp_addr *laddr) { int found = 0; if ((asoc->base.bind_addr.port == ntohs(laddr->v4.sin_port)) && sctp_bind_addr_match(&asoc->base.bind_addr, laddr, sctp_sk(asoc->base.sk))) found = 1; return found; } /* Set an association id for a given association */ int sctp_assoc_set_id(struct sctp_association *asoc, gfp_t gfp) { bool preload = gfpflags_allow_blocking(gfp); int ret; /* If the id is already assigned, keep it. */ if (asoc->assoc_id) return 0; if (preload) idr_preload(gfp); spin_lock_bh(&sctp_assocs_id_lock); /* 0, 1, 2 are used as SCTP_FUTURE_ASSOC, SCTP_CURRENT_ASSOC and * SCTP_ALL_ASSOC, so an available id must be > SCTP_ALL_ASSOC. */ ret = idr_alloc_cyclic(&sctp_assocs_id, asoc, SCTP_ALL_ASSOC + 1, 0, GFP_NOWAIT); spin_unlock_bh(&sctp_assocs_id_lock); if (preload) idr_preload_end(); if (ret < 0) return ret; asoc->assoc_id = (sctp_assoc_t)ret; return 0; } /* Free the ASCONF queue */ static void sctp_assoc_free_asconf_queue(struct sctp_association *asoc) { struct sctp_chunk *asconf; struct sctp_chunk *tmp; list_for_each_entry_safe(asconf, tmp, &asoc->addip_chunk_list, list) { list_del_init(&asconf->list); sctp_chunk_free(asconf); } } /* Free asconf_ack cache */ static void sctp_assoc_free_asconf_acks(struct sctp_association *asoc) { struct sctp_chunk *ack; struct sctp_chunk *tmp; list_for_each_entry_safe(ack, tmp, &asoc->asconf_ack_list, transmitted_list) { list_del_init(&ack->transmitted_list); sctp_chunk_free(ack); } } /* Clean up the ASCONF_ACK queue */ void sctp_assoc_clean_asconf_ack_cache(const struct sctp_association *asoc) { struct sctp_chunk *ack; struct sctp_chunk *tmp; /* We can remove all the entries from the queue up to * the "Peer-Sequence-Number". */ list_for_each_entry_safe(ack, tmp, &asoc->asconf_ack_list, transmitted_list) { if (ack->subh.addip_hdr->serial == htonl(asoc->peer.addip_serial)) break; list_del_init(&ack->transmitted_list); sctp_chunk_free(ack); } } /* Find the ASCONF_ACK whose serial number matches ASCONF */ struct sctp_chunk *sctp_assoc_lookup_asconf_ack( const struct sctp_association *asoc, __be32 serial) { struct sctp_chunk *ack; /* Walk through the list of cached ASCONF-ACKs and find the * ack chunk whose serial number matches that of the request. */ list_for_each_entry(ack, &asoc->asconf_ack_list, transmitted_list) { if (sctp_chunk_pending(ack)) continue; if (ack->subh.addip_hdr->serial == serial) { sctp_chunk_hold(ack); return ack; } } return NULL; } void sctp_asconf_queue_teardown(struct sctp_association *asoc) { /* Free any cached ASCONF_ACK chunk. */ sctp_assoc_free_asconf_acks(asoc); /* Free the ASCONF queue. 
*/ sctp_assoc_free_asconf_queue(asoc); /* Free any cached ASCONF chunk. */ if (asoc->addip_last_asconf) sctp_chunk_free(asoc->addip_last_asconf); }
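sctp_assoc_add_peer() earlier in this file seeds each new transport's congestion window with min(4*MTU, max(2*MTU, 4380)) per RFC 4960, Section 7.2.1. A standalone sketch, assuming nothing beyond that formula, showing how the bound behaves for a few common path MTUs:

#include <stdio.h>

/* Same rule as peer->cwnd = min(4*pathmtu, max(2*pathmtu, 4380)) above. */
static unsigned int initial_cwnd(unsigned int pathmtu)
{
	unsigned int lower = 2 * pathmtu > 4380 ? 2 * pathmtu : 4380;
	unsigned int upper = 4 * pathmtu;

	return upper < lower ? upper : lower;
}

int main(void)
{
	unsigned int mtus[] = { 512, 1500, 9000 };

	for (int i = 0; i < 3; i++)
		printf("pathmtu %u -> initial cwnd %u\n",
		       mtus[i], initial_cwnd(mtus[i]));
	/* 512  -> 2048  (capped by 4*MTU)
	 * 1500 -> 4380  (the 4380-byte floor dominates)
	 * 9000 -> 18000 (2*MTU already exceeds 4380)
	 */
	return 0;
}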
4 4 4 4 4 4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 // SPDX-License-Identifier: GPL-2.0-only /* Copyright (C) 2016 Tomasz Chilinski <tomasz.chilinski@chilan.com> */ /* Kernel module implementing an IP set type: the hash:ip,mac type */ #include <linux/jhash.h> #include <linux/module.h> #include <linux/ip.h> #include <linux/etherdevice.h> #include <linux/skbuff.h> #include <linux/errno.h> #include <linux/random.h> #include <linux/if_ether.h> #include <net/ip.h> #include <net/ipv6.h> #include <net/netlink.h> #include <net/tcp.h> #include <linux/netfilter.h> #include <linux/netfilter/ipset/pfxlen.h> #include <linux/netfilter/ipset/ip_set.h> #include <linux/netfilter/ipset/ip_set_hash.h> #define IPSET_TYPE_REV_MIN 0 #define IPSET_TYPE_REV_MAX 1 /* bucketsize, initval support */ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Tomasz Chilinski <tomasz.chilinski@chilan.com>"); IP_SET_MODULE_DESC("hash:ip,mac", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX); MODULE_ALIAS("ip_set_hash:ip,mac"); /* Type specific function prefix */ #define HTYPE hash_ipmac /* IPv4 variant */ /* Member elements */ struct hash_ipmac4_elem { /* Zero valued IP addresses cannot be stored */ __be32 ip; union { unsigned char ether[ETH_ALEN]; __be32 foo[2]; }; }; /* Common functions */ static bool hash_ipmac4_data_equal(const struct hash_ipmac4_elem *e1, const struct hash_ipmac4_elem *e2, u32 *multi) { return e1->ip == e2->ip && ether_addr_equal(e1->ether, e2->ether); } static bool hash_ipmac4_data_list(struct sk_buff *skb, const struct hash_ipmac4_elem *e) { if (nla_put_ipaddr4(skb, IPSET_ATTR_IP, e->ip) || nla_put(skb, IPSET_ATTR_ETHER, ETH_ALEN, e->ether)) goto nla_put_failure; return false; nla_put_failure: return true; } static void hash_ipmac4_data_next(struct hash_ipmac4_elem *next, const struct hash_ipmac4_elem *e) { next->ip = e->ip; } #define MTYPE hash_ipmac4 #define PF 4 #define HOST_MASK 32 #define HKEY_DATALEN sizeof(struct hash_ipmac4_elem) #include "ip_set_hash_gen.h" static int hash_ipmac4_kadt(struct ip_set *set, const struct sk_buff *skb, const struct xt_action_param *par, enum ipset_adt adt, struct ip_set_adt_opt *opt) { ipset_adtfn adtfn = set->variant->adt[adt]; struct hash_ipmac4_elem e = { .ip = 0, { .foo[0] = 0, .foo[1] = 0 } }; struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set); if (skb_mac_header(skb) < skb->head || (skb_mac_header(skb) + ETH_HLEN) > skb->data) return -EINVAL; if (opt->flags & IPSET_DIM_TWO_SRC) ether_addr_copy(e.ether, 
eth_hdr(skb)->h_source); else ether_addr_copy(e.ether, eth_hdr(skb)->h_dest); if (is_zero_ether_addr(e.ether)) return -EINVAL; ip4addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip); return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags); } static int hash_ipmac4_uadt(struct ip_set *set, struct nlattr *tb[], enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) { ipset_adtfn adtfn = set->variant->adt[adt]; struct hash_ipmac4_elem e = { .ip = 0, { .foo[0] = 0, .foo[1] = 0 } }; struct ip_set_ext ext = IP_SET_INIT_UEXT(set); int ret; if (unlikely(!tb[IPSET_ATTR_IP] || !tb[IPSET_ATTR_ETHER] || nla_len(tb[IPSET_ATTR_ETHER]) != ETH_ALEN || !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) || !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) || !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) || !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE))) return -IPSET_ERR_PROTOCOL; if (tb[IPSET_ATTR_LINENO]) *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); ret = ip_set_get_ipaddr4(tb[IPSET_ATTR_IP], &e.ip) || ip_set_get_extensions(set, tb, &ext); if (ret) return ret; memcpy(e.ether, nla_data(tb[IPSET_ATTR_ETHER]), ETH_ALEN); if (is_zero_ether_addr(e.ether)) return -IPSET_ERR_HASH_ELEM; return adtfn(set, &e, &ext, &ext, flags); } /* IPv6 variant */ /* Member elements */ struct hash_ipmac6_elem { /* Zero valued IP addresses cannot be stored */ union nf_inet_addr ip; union { unsigned char ether[ETH_ALEN]; __be32 foo[2]; }; }; /* Common functions */ static bool hash_ipmac6_data_equal(const struct hash_ipmac6_elem *e1, const struct hash_ipmac6_elem *e2, u32 *multi) { return ipv6_addr_equal(&e1->ip.in6, &e2->ip.in6) && ether_addr_equal(e1->ether, e2->ether); } static bool hash_ipmac6_data_list(struct sk_buff *skb, const struct hash_ipmac6_elem *e) { if (nla_put_ipaddr6(skb, IPSET_ATTR_IP, &e->ip.in6) || nla_put(skb, IPSET_ATTR_ETHER, ETH_ALEN, e->ether)) goto nla_put_failure; return false; nla_put_failure: return true; } static void hash_ipmac6_data_next(struct hash_ipmac6_elem *next, const struct hash_ipmac6_elem *e) { } #undef MTYPE #undef PF #undef HOST_MASK #undef HKEY_DATALEN #define MTYPE hash_ipmac6 #define PF 6 #define HOST_MASK 128 #define HKEY_DATALEN sizeof(struct hash_ipmac6_elem) #define IP_SET_EMIT_CREATE #include "ip_set_hash_gen.h" static int hash_ipmac6_kadt(struct ip_set *set, const struct sk_buff *skb, const struct xt_action_param *par, enum ipset_adt adt, struct ip_set_adt_opt *opt) { ipset_adtfn adtfn = set->variant->adt[adt]; struct hash_ipmac6_elem e = { { .all = { 0 } }, { .foo[0] = 0, .foo[1] = 0 } }; struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set); if (skb_mac_header(skb) < skb->head || (skb_mac_header(skb) + ETH_HLEN) > skb->data) return -EINVAL; if (opt->flags & IPSET_DIM_TWO_SRC) ether_addr_copy(e.ether, eth_hdr(skb)->h_source); else ether_addr_copy(e.ether, eth_hdr(skb)->h_dest); if (is_zero_ether_addr(e.ether)) return -EINVAL; ip6addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip.in6); return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags); } static int hash_ipmac6_uadt(struct ip_set *set, struct nlattr *tb[], enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) { ipset_adtfn adtfn = set->variant->adt[adt]; struct hash_ipmac6_elem e = { { .all = { 0 } }, { .foo[0] = 0, .foo[1] = 0 } }; struct ip_set_ext ext = IP_SET_INIT_UEXT(set); int ret; if (unlikely(!tb[IPSET_ATTR_IP] || !tb[IPSET_ATTR_ETHER] || nla_len(tb[IPSET_ATTR_ETHER]) != ETH_ALEN || 
!ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) || !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) || !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) || !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE))) return -IPSET_ERR_PROTOCOL; if (tb[IPSET_ATTR_LINENO]) *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &e.ip) || ip_set_get_extensions(set, tb, &ext); if (ret) return ret; memcpy(e.ether, nla_data(tb[IPSET_ATTR_ETHER]), ETH_ALEN); if (is_zero_ether_addr(e.ether)) return -IPSET_ERR_HASH_ELEM; return adtfn(set, &e, &ext, &ext, flags); } static struct ip_set_type hash_ipmac_type __read_mostly = { .name = "hash:ip,mac", .protocol = IPSET_PROTOCOL, .features = IPSET_TYPE_IP | IPSET_TYPE_MAC, .dimension = IPSET_DIM_TWO, .family = NFPROTO_UNSPEC, .revision_min = IPSET_TYPE_REV_MIN, .revision_max = IPSET_TYPE_REV_MAX, .create_flags[IPSET_TYPE_REV_MAX] = IPSET_CREATE_FLAG_BUCKETSIZE, .create = hash_ipmac_create, .create_policy = { [IPSET_ATTR_HASHSIZE] = { .type = NLA_U32 }, [IPSET_ATTR_MAXELEM] = { .type = NLA_U32 }, [IPSET_ATTR_INITVAL] = { .type = NLA_U32 }, [IPSET_ATTR_BUCKETSIZE] = { .type = NLA_U8 }, [IPSET_ATTR_RESIZE] = { .type = NLA_U8 }, [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 }, }, .adt_policy = { [IPSET_ATTR_IP] = { .type = NLA_NESTED }, [IPSET_ATTR_ETHER] = { .type = NLA_BINARY, .len = ETH_ALEN }, [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, [IPSET_ATTR_LINENO] = { .type = NLA_U32 }, [IPSET_ATTR_BYTES] = { .type = NLA_U64 }, [IPSET_ATTR_PACKETS] = { .type = NLA_U64 }, [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING }, [IPSET_ATTR_SKBMARK] = { .type = NLA_U64 }, [IPSET_ATTR_SKBPRIO] = { .type = NLA_U32 }, [IPSET_ATTR_SKBQUEUE] = { .type = NLA_U16 }, }, .me = THIS_MODULE, }; static int __init hash_ipmac_init(void) { return ip_set_type_register(&hash_ipmac_type); } static void __exit hash_ipmac_fini(void) { ip_set_type_unregister(&hash_ipmac_type); } module_init(hash_ipmac_init); module_exit(hash_ipmac_fini);
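The anonymous union in the element structs above pads the 6-byte MAC up to two 32-bit words, so each key is a whole number of u32 words for the word-based hashing in ip_set_hash_gen.h. A compile-time sketch of that size assumption (the static_asserts below are illustrative, not part of the original file):

#include <linux/build_bug.h>

/* Illustrative only: the element layouts assumed by HKEY_DATALEN above. */
static_assert(sizeof(struct hash_ipmac4_elem) == 12);
static_assert(sizeof(struct hash_ipmac4_elem) % sizeof(u32) == 0);
static_assert(sizeof(struct hash_ipmac6_elem) % sizeof(u32) == 0);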
648 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 /* SPDX-License-Identifier: GPL-2.0 */ /* * Because linux/module.h has tracepoints in the header, and ftrace.h * used to include this file, define_trace.h includes linux/module.h * But we do not want the module.h to override the TRACE_SYSTEM macro * variable that define_trace.h is processing, so we only set it * when module events are being processed, which would happen when * CREATE_TRACE_POINTS is defined. */ #ifdef CREATE_TRACE_POINTS #undef TRACE_SYSTEM #define TRACE_SYSTEM module #endif #if !defined(_TRACE_MODULE_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_MODULE_H #include <linux/tracepoint.h> #ifdef CONFIG_MODULES struct module; #define show_module_flags(flags) __print_flags(flags, "", \ { (1UL << TAINT_PROPRIETARY_MODULE), "P" }, \ { (1UL << TAINT_OOT_MODULE), "O" }, \ { (1UL << TAINT_FORCED_MODULE), "F" }, \ { (1UL << TAINT_CRAP), "C" }, \ { (1UL << TAINT_UNSIGNED_MODULE), "E" }) TRACE_EVENT(module_load, TP_PROTO(struct module *mod), TP_ARGS(mod), TP_STRUCT__entry( __field( unsigned int, taints ) __string( name, mod->name ) ), TP_fast_assign( __entry->taints = mod->taints; __assign_str(name); ), TP_printk("%s %s", __get_str(name), show_module_flags(__entry->taints)) ); TRACE_EVENT(module_free, TP_PROTO(struct module *mod), TP_ARGS(mod), TP_STRUCT__entry( __string( name, mod->name ) ), TP_fast_assign( __assign_str(name); ), TP_printk("%s", __get_str(name)) ); #ifdef CONFIG_MODULE_UNLOAD /* trace_module_get/put are only used if CONFIG_MODULE_UNLOAD is defined */ DECLARE_EVENT_CLASS(module_refcnt, TP_PROTO(struct module *mod, unsigned long ip), TP_ARGS(mod, ip), TP_STRUCT__entry( __field( unsigned long, ip ) __field( int, refcnt ) __string( name, mod->name ) ), TP_fast_assign( __entry->ip = ip; __entry->refcnt = atomic_read(&mod->refcnt); __assign_str(name); ), TP_printk("%s call_site=%ps refcnt=%d", __get_str(name), (void *)__entry->ip, __entry->refcnt) ); DEFINE_EVENT(module_refcnt, module_get, TP_PROTO(struct module *mod, unsigned long ip), TP_ARGS(mod, ip) ); DEFINE_EVENT(module_refcnt, module_put, TP_PROTO(struct module *mod, unsigned long ip), TP_ARGS(mod, ip) ); #endif /* CONFIG_MODULE_UNLOAD */ TRACE_EVENT(module_request, TP_PROTO(char *name, bool wait, unsigned long ip), TP_ARGS(name, wait, ip), TP_STRUCT__entry( __field( unsigned long, ip ) __field( bool, wait ) __string( name, name ) ), TP_fast_assign( __entry->ip = ip; __entry->wait = wait; __assign_str(name); ), TP_printk("%s wait=%d call_site=%ps", __get_str(name), (int)__entry->wait, (void *)__entry->ip) ); #endif /* CONFIG_MODULES */ #endif /* _TRACE_MODULE_H */ /* This part must be outside protection */ #include <trace/define_trace.h>
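As the header comment above explains, TRACE_SYSTEM is only set while CREATE_TRACE_POINTS is defined. A minimal sketch of the usual consumer pattern (the wrapper function name is illustrative, not taken from this report):

#include <linux/module.h>

/* In exactly one .c file, define CREATE_TRACE_POINTS before including the
 * header so define_trace.h emits the tracepoint bodies; every other include
 * of the same header only declares them. */
#define CREATE_TRACE_POINTS
#include <trace/events/module.h>

static void example_load_notify(struct module *mod)
{
        trace_module_load(mod); /* fires the module_load event defined above */
}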
/* SPDX-License-Identifier: GPL-2.0 */
#undef TRACE_SYSTEM
#define TRACE_SYSTEM bpf_trace

#if !defined(_TRACE_BPF_TRACE_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_BPF_TRACE_H

#include <linux/tracepoint.h>

TRACE_EVENT(bpf_trace_printk,

        TP_PROTO(const char *bpf_string),

        TP_ARGS(bpf_string),

        TP_STRUCT__entry(
                __string(bpf_string, bpf_string)
        ),

        TP_fast_assign(
                __assign_str(bpf_string);
        ),

        TP_printk("%s", __get_str(bpf_string))
);

#endif /* _TRACE_BPF_TRACE_H */

#undef TRACE_INCLUDE_PATH
#define TRACE_INCLUDE_PATH .
#define TRACE_INCLUDE_FILE bpf_trace

#include <trace/define_trace.h>
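Because this header lives outside include/trace/events/, it points TRACE_INCLUDE_PATH at the current directory. A sketch of how such a self-contained trace header is typically instantiated and used (the hook function is illustrative; the directory must be on the include path for define_trace.h to re-read the file):

#define CREATE_TRACE_POINTS
#include "bpf_trace.h"

static void example_printk_hook(const char *buf)
{
        trace_bpf_trace_printk(buf); /* emits the event declared above */
}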

899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (c) 2008-2009 Patrick McHardy <kaber@trash.net> * Copyright (c) 2014 Intel Corporation * Author: Tomasz Bursztyka <tomasz.bursztyka@linux.intel.com> * * Development of this code funded by Astaro AG (http://www.astaro.com/) */ #include <linux/kernel.h> #include <linux/netlink.h> #include <linux/netfilter.h> #include <linux/netfilter/nf_tables.h> #include <linux/in.h> #include <linux/ip.h> #include <linux/ipv6.h> #include <linux/random.h> #include <linux/smp.h> #include <linux/static_key.h> #include <net/dst.h> #include <net/ip.h> #include <net/sock.h> #include <net/tcp_states.h> /* for TCP_TIME_WAIT */ #include <net/netfilter/nf_tables.h> #include <net/netfilter/nf_tables_core.h> #include <net/netfilter/nft_meta.h> #include <net/netfilter/nf_tables_offload.h> #include <uapi/linux/netfilter_bridge.h> /* NF_BR_PRE_ROUTING */ #define NFT_META_SECS_PER_MINUTE 60 #define NFT_META_SECS_PER_HOUR 3600 #define NFT_META_SECS_PER_DAY 86400 #define NFT_META_DAYS_PER_WEEK 7 static u8 nft_meta_weekday(void) { time64_t secs = ktime_get_real_seconds(); unsigned int dse; u8 wday; secs -= NFT_META_SECS_PER_MINUTE * sys_tz.tz_minuteswest; dse = div_u64(secs, NFT_META_SECS_PER_DAY); wday = (4 + dse) % NFT_META_DAYS_PER_WEEK; return wday; } static u32 nft_meta_hour(time64_t secs) { struct tm tm; time64_to_tm(secs, 0, &tm); return tm.tm_hour * NFT_META_SECS_PER_HOUR + tm.tm_min * NFT_META_SECS_PER_MINUTE + tm.tm_sec; } static noinline_for_stack void nft_meta_get_eval_time(enum nft_meta_keys key, u32 *dest) { switch (key) { case NFT_META_TIME_NS: nft_reg_store64((u64 *)dest, ktime_get_real_ns()); break; case NFT_META_TIME_DAY: nft_reg_store8(dest, nft_meta_weekday()); break; case NFT_META_TIME_HOUR: *dest = nft_meta_hour(ktime_get_real_seconds()); break; default: break; } } static noinline bool nft_meta_get_eval_pkttype_lo(const struct nft_pktinfo *pkt, u32 *dest) { const struct sk_buff *skb = pkt->skb; switch (nft_pf(pkt)) { case NFPROTO_IPV4: if (ipv4_is_multicast(ip_hdr(skb)->daddr)) nft_reg_store8(dest, PACKET_MULTICAST); else nft_reg_store8(dest, PACKET_BROADCAST); break; case NFPROTO_IPV6: nft_reg_store8(dest, PACKET_MULTICAST); break; case NFPROTO_NETDEV: switch (skb->protocol) { case htons(ETH_P_IP): { int noff = skb_network_offset(skb); struct iphdr *iph, _iph; iph = skb_header_pointer(skb, noff, sizeof(_iph), &_iph); if (!iph) return false; if (ipv4_is_multicast(iph->daddr)) nft_reg_store8(dest, PACKET_MULTICAST); else nft_reg_store8(dest, PACKET_BROADCAST); break; } case htons(ETH_P_IPV6): nft_reg_store8(dest, PACKET_MULTICAST); break; default: WARN_ON_ONCE(1); return false; } break; default: WARN_ON_ONCE(1); return false; } return true; } static noinline bool nft_meta_get_eval_skugid(enum nft_meta_keys key, u32 *dest, const struct nft_pktinfo *pkt) { struct sock *sk = skb_to_full_sk(pkt->skb); struct socket *sock; if (!sk || !sk_fullsock(sk) || !net_eq(nft_net(pkt), sock_net(sk))) return false; read_lock_bh(&sk->sk_callback_lock); sock = sk->sk_socket; if (!sock || !sock->file) { 
read_unlock_bh(&sk->sk_callback_lock); return false; } switch (key) { case NFT_META_SKUID: *dest = from_kuid_munged(sock_net(sk)->user_ns, sock->file->f_cred->fsuid); break; case NFT_META_SKGID: *dest = from_kgid_munged(sock_net(sk)->user_ns, sock->file->f_cred->fsgid); break; default: break; } read_unlock_bh(&sk->sk_callback_lock); return true; } #ifdef CONFIG_CGROUP_NET_CLASSID static noinline bool nft_meta_get_eval_cgroup(u32 *dest, const struct nft_pktinfo *pkt) { struct sock *sk = skb_to_full_sk(pkt->skb); if (!sk || !sk_fullsock(sk) || !net_eq(nft_net(pkt), sock_net(sk))) return false; *dest = sock_cgroup_classid(&sk->sk_cgrp_data); return true; } #endif static noinline bool nft_meta_get_eval_kind(enum nft_meta_keys key, u32 *dest, const struct nft_pktinfo *pkt) { const struct net_device *in = nft_in(pkt), *out = nft_out(pkt); switch (key) { case NFT_META_IIFKIND: if (!in || !in->rtnl_link_ops) return false; strscpy_pad((char *)dest, in->rtnl_link_ops->kind, IFNAMSIZ); break; case NFT_META_OIFKIND: if (!out || !out->rtnl_link_ops) return false; strscpy_pad((char *)dest, out->rtnl_link_ops->kind, IFNAMSIZ); break; default: return false; } return true; } static void nft_meta_store_ifindex(u32 *dest, const struct net_device *dev) { *dest = dev ? dev->ifindex : 0; } static void nft_meta_store_ifname(u32 *dest, const struct net_device *dev) { strscpy_pad((char *)dest, dev ? dev->name : "", IFNAMSIZ); } static bool nft_meta_store_iftype(u32 *dest, const struct net_device *dev) { if (!dev) return false; nft_reg_store16(dest, dev->type); return true; } static bool nft_meta_store_ifgroup(u32 *dest, const struct net_device *dev) { if (!dev) return false; *dest = dev->group; return true; } static bool nft_meta_get_eval_ifname(enum nft_meta_keys key, u32 *dest, const struct nft_pktinfo *pkt) { switch (key) { case NFT_META_IIFNAME: nft_meta_store_ifname(dest, nft_in(pkt)); break; case NFT_META_OIFNAME: nft_meta_store_ifname(dest, nft_out(pkt)); break; case NFT_META_IIF: nft_meta_store_ifindex(dest, nft_in(pkt)); break; case NFT_META_OIF: nft_meta_store_ifindex(dest, nft_out(pkt)); break; case NFT_META_IFTYPE: if (!nft_meta_store_iftype(dest, pkt->skb->dev)) return false; break; case __NFT_META_IIFTYPE: if (!nft_meta_store_iftype(dest, nft_in(pkt))) return false; break; case NFT_META_OIFTYPE: if (!nft_meta_store_iftype(dest, nft_out(pkt))) return false; break; case NFT_META_IIFGROUP: if (!nft_meta_store_ifgroup(dest, nft_in(pkt))) return false; break; case NFT_META_OIFGROUP: if (!nft_meta_store_ifgroup(dest, nft_out(pkt))) return false; break; default: return false; } return true; } #ifdef CONFIG_IP_ROUTE_CLASSID static noinline bool nft_meta_get_eval_rtclassid(const struct sk_buff *skb, u32 *dest) { const struct dst_entry *dst = skb_dst(skb); if (!dst) return false; *dest = dst->tclassid; return true; } #endif static noinline u32 nft_meta_get_eval_sdif(const struct nft_pktinfo *pkt) { switch (nft_pf(pkt)) { case NFPROTO_IPV4: return inet_sdif(pkt->skb); case NFPROTO_IPV6: return inet6_sdif(pkt->skb); } return 0; } static noinline void nft_meta_get_eval_sdifname(u32 *dest, const struct nft_pktinfo *pkt) { u32 sdif = nft_meta_get_eval_sdif(pkt); const struct net_device *dev; dev = sdif ? 
dev_get_by_index_rcu(nft_net(pkt), sdif) : NULL; nft_meta_store_ifname(dest, dev); } void nft_meta_get_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) { const struct nft_meta *priv = nft_expr_priv(expr); const struct sk_buff *skb = pkt->skb; u32 *dest = &regs->data[priv->dreg]; switch (priv->key) { case NFT_META_LEN: *dest = skb->len; break; case NFT_META_PROTOCOL: nft_reg_store16(dest, (__force u16)skb->protocol); break; case NFT_META_NFPROTO: nft_reg_store8(dest, nft_pf(pkt)); break; case NFT_META_L4PROTO: if (!(pkt->flags & NFT_PKTINFO_L4PROTO)) goto err; nft_reg_store8(dest, pkt->tprot); break; case NFT_META_PRIORITY: *dest = skb->priority; break; case NFT_META_MARK: *dest = skb->mark; break; case NFT_META_IIF: case NFT_META_OIF: case NFT_META_IIFNAME: case NFT_META_OIFNAME: case NFT_META_IIFTYPE: case NFT_META_OIFTYPE: case NFT_META_IIFGROUP: case NFT_META_OIFGROUP: if (!nft_meta_get_eval_ifname(priv->key, dest, pkt)) goto err; break; case NFT_META_SKUID: case NFT_META_SKGID: if (!nft_meta_get_eval_skugid(priv->key, dest, pkt)) goto err; break; #ifdef CONFIG_IP_ROUTE_CLASSID case NFT_META_RTCLASSID: if (!nft_meta_get_eval_rtclassid(skb, dest)) goto err; break; #endif #ifdef CONFIG_NETWORK_SECMARK case NFT_META_SECMARK: *dest = skb->secmark; break; #endif case NFT_META_PKTTYPE: if (skb->pkt_type != PACKET_LOOPBACK) { nft_reg_store8(dest, skb->pkt_type); break; } if (!nft_meta_get_eval_pkttype_lo(pkt, dest)) goto err; break; case NFT_META_CPU: *dest = raw_smp_processor_id(); break; #ifdef CONFIG_CGROUP_NET_CLASSID case NFT_META_CGROUP: if (!nft_meta_get_eval_cgroup(dest, pkt)) goto err; break; #endif case NFT_META_PRANDOM: *dest = get_random_u32(); break; #ifdef CONFIG_XFRM case NFT_META_SECPATH: nft_reg_store8(dest, secpath_exists(skb)); break; #endif case NFT_META_IIFKIND: case NFT_META_OIFKIND: if (!nft_meta_get_eval_kind(priv->key, dest, pkt)) goto err; break; case NFT_META_TIME_NS: case NFT_META_TIME_DAY: case NFT_META_TIME_HOUR: nft_meta_get_eval_time(priv->key, dest); break; case NFT_META_SDIF: *dest = nft_meta_get_eval_sdif(pkt); break; case NFT_META_SDIFNAME: nft_meta_get_eval_sdifname(dest, pkt); break; default: WARN_ON(1); goto err; } return; err: regs->verdict.code = NFT_BREAK; } EXPORT_SYMBOL_GPL(nft_meta_get_eval); void nft_meta_set_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) { const struct nft_meta *meta = nft_expr_priv(expr); struct sk_buff *skb = pkt->skb; u32 *sreg = &regs->data[meta->sreg]; u32 value = *sreg; u8 value8; switch (meta->key) { case NFT_META_MARK: skb->mark = value; break; case NFT_META_PRIORITY: skb->priority = value; break; case NFT_META_PKTTYPE: value8 = nft_reg_load8(sreg); if (skb->pkt_type != value8 && skb_pkt_type_ok(value8) && skb_pkt_type_ok(skb->pkt_type)) skb->pkt_type = value8; break; case NFT_META_NFTRACE: value8 = nft_reg_load8(sreg); skb->nf_trace = !!value8; break; #ifdef CONFIG_NETWORK_SECMARK case NFT_META_SECMARK: skb->secmark = value; break; #endif default: WARN_ON(1); } } EXPORT_SYMBOL_GPL(nft_meta_set_eval); const struct nla_policy nft_meta_policy[NFTA_META_MAX + 1] = { [NFTA_META_DREG] = { .type = NLA_U32 }, [NFTA_META_KEY] = NLA_POLICY_MAX(NLA_BE32, 255), [NFTA_META_SREG] = { .type = NLA_U32 }, }; EXPORT_SYMBOL_GPL(nft_meta_policy); int nft_meta_get_init(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nlattr * const tb[]) { struct nft_meta *priv = nft_expr_priv(expr); unsigned int len; priv->key = 
ntohl(nla_get_be32(tb[NFTA_META_KEY])); switch (priv->key) { case NFT_META_PROTOCOL: case NFT_META_IIFTYPE: case NFT_META_OIFTYPE: len = sizeof(u16); break; case NFT_META_NFPROTO: case NFT_META_L4PROTO: case NFT_META_LEN: case NFT_META_PRIORITY: case NFT_META_MARK: case NFT_META_IIF: case NFT_META_OIF: case NFT_META_SDIF: case NFT_META_SKUID: case NFT_META_SKGID: #ifdef CONFIG_IP_ROUTE_CLASSID case NFT_META_RTCLASSID: #endif #ifdef CONFIG_NETWORK_SECMARK case NFT_META_SECMARK: #endif case NFT_META_PKTTYPE: case NFT_META_CPU: case NFT_META_IIFGROUP: case NFT_META_OIFGROUP: #ifdef CONFIG_CGROUP_NET_CLASSID case NFT_META_CGROUP: #endif len = sizeof(u32); break; case NFT_META_IIFNAME: case NFT_META_OIFNAME: case NFT_META_IIFKIND: case NFT_META_OIFKIND: case NFT_META_SDIFNAME: len = IFNAMSIZ; break; case NFT_META_PRANDOM: len = sizeof(u32); break; #ifdef CONFIG_XFRM case NFT_META_SECPATH: len = sizeof(u8); break; #endif case NFT_META_TIME_NS: len = sizeof(u64); break; case NFT_META_TIME_DAY: len = sizeof(u8); break; case NFT_META_TIME_HOUR: len = sizeof(u32); break; default: return -EOPNOTSUPP; } priv->len = len; return nft_parse_register_store(ctx, tb[NFTA_META_DREG], &priv->dreg, NULL, NFT_DATA_VALUE, len); } EXPORT_SYMBOL_GPL(nft_meta_get_init); static int nft_meta_get_validate_sdif(const struct nft_ctx *ctx) { unsigned int hooks; switch (ctx->family) { case NFPROTO_IPV4: case NFPROTO_IPV6: case NFPROTO_INET: hooks = (1 << NF_INET_LOCAL_IN) | (1 << NF_INET_FORWARD); break; default: return -EOPNOTSUPP; } return nft_chain_validate_hooks(ctx->chain, hooks); } static int nft_meta_get_validate_xfrm(const struct nft_ctx *ctx) { #ifdef CONFIG_XFRM unsigned int hooks; switch (ctx->family) { case NFPROTO_NETDEV: hooks = 1 << NF_NETDEV_INGRESS; break; case NFPROTO_IPV4: case NFPROTO_IPV6: case NFPROTO_INET: hooks = (1 << NF_INET_PRE_ROUTING) | (1 << NF_INET_LOCAL_IN) | (1 << NF_INET_FORWARD); break; default: return -EOPNOTSUPP; } return nft_chain_validate_hooks(ctx->chain, hooks); #else return 0; #endif } static int nft_meta_get_validate(const struct nft_ctx *ctx, const struct nft_expr *expr) { const struct nft_meta *priv = nft_expr_priv(expr); switch (priv->key) { case NFT_META_SECPATH: return nft_meta_get_validate_xfrm(ctx); case NFT_META_SDIF: case NFT_META_SDIFNAME: return nft_meta_get_validate_sdif(ctx); default: break; } return 0; } int nft_meta_set_validate(const struct nft_ctx *ctx, const struct nft_expr *expr) { struct nft_meta *priv = nft_expr_priv(expr); unsigned int hooks; if (priv->key != NFT_META_PKTTYPE) return 0; switch (ctx->family) { case NFPROTO_BRIDGE: hooks = 1 << NF_BR_PRE_ROUTING; break; case NFPROTO_NETDEV: hooks = 1 << NF_NETDEV_INGRESS; break; case NFPROTO_IPV4: case NFPROTO_IPV6: case NFPROTO_INET: hooks = 1 << NF_INET_PRE_ROUTING; break; default: return -EOPNOTSUPP; } return nft_chain_validate_hooks(ctx->chain, hooks); } EXPORT_SYMBOL_GPL(nft_meta_set_validate); int nft_meta_set_init(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nlattr * const tb[]) { struct nft_meta *priv = nft_expr_priv(expr); unsigned int len; int err; priv->key = ntohl(nla_get_be32(tb[NFTA_META_KEY])); switch (priv->key) { case NFT_META_MARK: case NFT_META_PRIORITY: #ifdef CONFIG_NETWORK_SECMARK case NFT_META_SECMARK: #endif len = sizeof(u32); break; case NFT_META_NFTRACE: len = sizeof(u8); break; case NFT_META_PKTTYPE: len = sizeof(u8); break; default: return -EOPNOTSUPP; } priv->len = len; err = nft_parse_register_load(ctx, tb[NFTA_META_SREG], &priv->sreg, len); if (err < 0) 
return err; if (priv->key == NFT_META_NFTRACE) static_branch_inc(&nft_trace_enabled); return 0; } EXPORT_SYMBOL_GPL(nft_meta_set_init); int nft_meta_get_dump(struct sk_buff *skb, const struct nft_expr *expr, bool reset) { const struct nft_meta *priv = nft_expr_priv(expr); if (nla_put_be32(skb, NFTA_META_KEY, htonl(priv->key))) goto nla_put_failure; if (nft_dump_register(skb, NFTA_META_DREG, priv->dreg)) goto nla_put_failure; return 0; nla_put_failure: return -1; } EXPORT_SYMBOL_GPL(nft_meta_get_dump); int nft_meta_set_dump(struct sk_buff *skb, const struct nft_expr *expr, bool reset) { const struct nft_meta *priv = nft_expr_priv(expr); if (nla_put_be32(skb, NFTA_META_KEY, htonl(priv->key))) goto nla_put_failure; if (nft_dump_register(skb, NFTA_META_SREG, priv->sreg)) goto nla_put_failure; return 0; nla_put_failure: return -1; } EXPORT_SYMBOL_GPL(nft_meta_set_dump); void nft_meta_set_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr) { const struct nft_meta *priv = nft_expr_priv(expr); if (priv->key == NFT_META_NFTRACE) static_branch_dec(&nft_trace_enabled); } EXPORT_SYMBOL_GPL(nft_meta_set_destroy); static int nft_meta_get_offload(struct nft_offload_ctx *ctx, struct nft_flow_rule *flow, const struct nft_expr *expr) { const struct nft_meta *priv = nft_expr_priv(expr); struct nft_offload_reg *reg = &ctx->regs[priv->dreg]; switch (priv->key) { case NFT_META_PROTOCOL: NFT_OFFLOAD_MATCH_EXACT(FLOW_DISSECTOR_KEY_BASIC, basic, n_proto, sizeof(__u16), reg); nft_offload_set_dependency(ctx, NFT_OFFLOAD_DEP_NETWORK); break; case NFT_META_L4PROTO: NFT_OFFLOAD_MATCH_EXACT(FLOW_DISSECTOR_KEY_BASIC, basic, ip_proto, sizeof(__u8), reg); nft_offload_set_dependency(ctx, NFT_OFFLOAD_DEP_TRANSPORT); break; case NFT_META_IIF: NFT_OFFLOAD_MATCH_EXACT(FLOW_DISSECTOR_KEY_META, meta, ingress_ifindex, sizeof(__u32), reg); break; case NFT_META_IIFTYPE: NFT_OFFLOAD_MATCH_EXACT(FLOW_DISSECTOR_KEY_META, meta, ingress_iftype, sizeof(__u16), reg); break; default: return -EOPNOTSUPP; } return 0; } bool nft_meta_get_reduce(struct nft_regs_track *track, const struct nft_expr *expr) { const struct nft_meta *priv = nft_expr_priv(expr); const struct nft_meta *meta; if (!nft_reg_track_cmp(track, expr, priv->dreg)) { nft_reg_track_update(track, expr, priv->dreg, priv->len); return false; } meta = nft_expr_priv(track->regs[priv->dreg].selector); if (priv->key != meta->key || priv->dreg != meta->dreg) { nft_reg_track_update(track, expr, priv->dreg, priv->len); return false; } if (!track->regs[priv->dreg].bitwise) return true; return nft_expr_reduce_bitwise(track, expr); } EXPORT_SYMBOL_GPL(nft_meta_get_reduce); static const struct nft_expr_ops nft_meta_get_ops = { .type = &nft_meta_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_meta)), .eval = nft_meta_get_eval, .init = nft_meta_get_init, .dump = nft_meta_get_dump, .reduce = nft_meta_get_reduce, .validate = nft_meta_get_validate, .offload = nft_meta_get_offload, }; static bool nft_meta_set_reduce(struct nft_regs_track *track, const struct nft_expr *expr) { int i; for (i = 0; i < NFT_REG32_NUM; i++) { if (!track->regs[i].selector) continue; if (track->regs[i].selector->ops != &nft_meta_get_ops) continue; __nft_reg_track_cancel(track, i); } return false; } static const struct nft_expr_ops nft_meta_set_ops = { .type = &nft_meta_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_meta)), .eval = nft_meta_set_eval, .init = nft_meta_set_init, .destroy = nft_meta_set_destroy, .dump = nft_meta_set_dump, .reduce = nft_meta_set_reduce, .validate = nft_meta_set_validate, }; static 
const struct nft_expr_ops * nft_meta_select_ops(const struct nft_ctx *ctx, const struct nlattr * const tb[]) { if (tb[NFTA_META_KEY] == NULL) return ERR_PTR(-EINVAL); if (tb[NFTA_META_DREG] && tb[NFTA_META_SREG]) return ERR_PTR(-EINVAL); #if IS_ENABLED(CONFIG_NF_TABLES_BRIDGE) && IS_MODULE(CONFIG_NFT_BRIDGE_META) if (ctx->family == NFPROTO_BRIDGE) return ERR_PTR(-EAGAIN); #endif if (tb[NFTA_META_DREG]) return &nft_meta_get_ops; if (tb[NFTA_META_SREG]) return &nft_meta_set_ops; return ERR_PTR(-EINVAL); } static int nft_meta_inner_init(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nlattr * const tb[]) { struct nft_meta *priv = nft_expr_priv(expr); unsigned int len; if (!tb[NFTA_META_KEY] || !tb[NFTA_META_DREG]) return -EINVAL; priv->key = ntohl(nla_get_be32(tb[NFTA_META_KEY])); switch (priv->key) { case NFT_META_PROTOCOL: len = sizeof(u16); break; case NFT_META_L4PROTO: len = sizeof(u32); break; default: return -EOPNOTSUPP; } priv->len = len; return nft_parse_register_store(ctx, tb[NFTA_META_DREG], &priv->dreg, NULL, NFT_DATA_VALUE, len); } void nft_meta_inner_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt, struct nft_inner_tun_ctx *tun_ctx) { const struct nft_meta *priv = nft_expr_priv(expr); u32 *dest = &regs->data[priv->dreg]; switch (priv->key) { case NFT_META_PROTOCOL: nft_reg_store16(dest, (__force u16)tun_ctx->llproto); break; case NFT_META_L4PROTO: if (!(tun_ctx->flags & NFT_PAYLOAD_CTX_INNER_TH)) goto err; nft_reg_store8(dest, tun_ctx->l4proto); break; default: WARN_ON_ONCE(1); goto err; } return; err: regs->verdict.code = NFT_BREAK; } EXPORT_SYMBOL_GPL(nft_meta_inner_eval); static const struct nft_expr_ops nft_meta_inner_ops = { .type = &nft_meta_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_meta)), .init = nft_meta_inner_init, .dump = nft_meta_get_dump, /* direct call to nft_meta_inner_eval(). 
*/ }; struct nft_expr_type nft_meta_type __read_mostly = { .name = "meta", .select_ops = nft_meta_select_ops, .inner_ops = &nft_meta_inner_ops, .policy = nft_meta_policy, .maxattr = NFTA_META_MAX, .owner = THIS_MODULE, }; #ifdef CONFIG_NETWORK_SECMARK struct nft_secmark { u32 secid; char *ctx; }; static const struct nla_policy nft_secmark_policy[NFTA_SECMARK_MAX + 1] = { [NFTA_SECMARK_CTX] = { .type = NLA_STRING, .len = NFT_SECMARK_CTX_MAXLEN }, }; static int nft_secmark_compute_secid(struct nft_secmark *priv) { u32 tmp_secid = 0; int err; err = security_secctx_to_secid(priv->ctx, strlen(priv->ctx), &tmp_secid); if (err) return err; if (!tmp_secid) return -ENOENT; err = security_secmark_relabel_packet(tmp_secid); if (err) return err; priv->secid = tmp_secid; return 0; } static void nft_secmark_obj_eval(struct nft_object *obj, struct nft_regs *regs, const struct nft_pktinfo *pkt) { const struct nft_secmark *priv = nft_obj_data(obj); struct sk_buff *skb = pkt->skb; skb->secmark = priv->secid; } static int nft_secmark_obj_init(const struct nft_ctx *ctx, const struct nlattr * const tb[], struct nft_object *obj) { struct nft_secmark *priv = nft_obj_data(obj); int err; if (tb[NFTA_SECMARK_CTX] == NULL) return -EINVAL; priv->ctx = nla_strdup(tb[NFTA_SECMARK_CTX], GFP_KERNEL_ACCOUNT); if (!priv->ctx) return -ENOMEM; err = nft_secmark_compute_secid(priv); if (err) { kfree(priv->ctx); return err; } security_secmark_refcount_inc(); return 0; } static int nft_secmark_obj_dump(struct sk_buff *skb, struct nft_object *obj, bool reset) { struct nft_secmark *priv = nft_obj_data(obj); int err; if (nla_put_string(skb, NFTA_SECMARK_CTX, priv->ctx)) return -1; if (reset) { err = nft_secmark_compute_secid(priv); if (err) return err; } return 0; } static void nft_secmark_obj_destroy(const struct nft_ctx *ctx, struct nft_object *obj) { struct nft_secmark *priv = nft_obj_data(obj); security_secmark_refcount_dec(); kfree(priv->ctx); } static const struct nft_object_ops nft_secmark_obj_ops = { .type = &nft_secmark_obj_type, .size = sizeof(struct nft_secmark), .init = nft_secmark_obj_init, .eval = nft_secmark_obj_eval, .dump = nft_secmark_obj_dump, .destroy = nft_secmark_obj_destroy, }; struct nft_object_type nft_secmark_obj_type __read_mostly = { .type = NFT_OBJECT_SECMARK, .ops = &nft_secmark_obj_ops, .maxattr = NFTA_SECMARK_MAX, .policy = nft_secmark_policy, .owner = THIS_MODULE, }; #endif /* CONFIG_NETWORK_SECMARK */
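The "+ 4" in nft_meta_weekday() works because day 0 of the Unix epoch, 1970-01-01, was a Thursday (wday 4 in the 0 = Sunday convention used here). A quick userspace-style check of that arithmetic (illustrative only, not part of the kernel file):

#include <assert.h>

int main(void)
{
        unsigned int dse = 19723; /* days from 1970-01-01 to 2024-01-01 */

        /* 2024-01-01 was a Monday, i.e. wday 1 */
        assert((4 + dse) % 7 == 1);
        return 0;
}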
7 7 7 7 7 4 4 2 4 21 4 1 3 12 4 4 1 7 10 1 17 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 /* * net/tipc/net.c: TIPC network routing code * * Copyright (c) 1995-2006, 2014, Ericsson AB * Copyright (c) 2005, 2010-2011, Wind River Systems * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the names of the copyright holders nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * Alternatively, this software may be distributed under the terms of the * GNU General Public License ("GPL") version 2 as published by the Free * Software Foundation. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ #include "core.h" #include "net.h" #include "name_distr.h" #include "subscr.h" #include "socket.h" #include "node.h" #include "bcast.h" #include "link.h" #include "netlink.h" #include "monitor.h" /* * The TIPC locking policy is designed to ensure a very fine locking * granularity, permitting complete parallel access to individual * port and node/link instances. 
 * The code consists of four major locking domains, each protected by its own
 * disjoint set of locks.
 *
 * 1: The bearer level.
 *    The RTNL lock is used to serialize bearer configuration on the update
 *    side, and the RCU lock is applied on the read side to keep the bearer
 *    instance valid on both the transmission and reception paths.
 *
 * 2: The node and link level.
 *    All node instances are saved into the two lists tipc_node_list and
 *    node_htable. The two lists are protected by node_list_lock on the write
 *    side and guarded by the RCU lock on the read side. In particular, a node
 *    instance is destroyed only when the TIPC module is removed, at which
 *    point no user can still be accessing it. Except when iterating the two
 *    lists under RCU protection, there is therefore no need to hold the RCU
 *    lock while accessing a node instance elsewhere.
 *
 *    In addition, all members of the node structure, including the link
 *    instances, are protected by the node spin lock.
 *
 * 3: The transport level of the protocol.
 *    This consists of the structure port (and its user-level representations,
 *    such as user_port and tipc_sock), reference and tipc_user
 *    (port.c, reg.c, socket.c).
 *
 *    This layer has four different locks:
 *    - The tipc_port spin_lock. This protects each port instance from
 *      parallel data access and removal. Since we cannot place this lock in
 *      the port itself, it has been placed in the corresponding reference
 *      table entry, which has the same life cycle as the module. This entry
 *      is difficult to access from outside the TIPC core, however, so a
 *      pointer to the lock has been added to the port instance, to be used
 *      for unlocking only.
 *    - A read/write lock to protect the reference table itself (reg.c).
 *      (Nobody is using read-only access to this, so it could just as well
 *      be changed to a spin_lock.)
 *    - A spin lock to protect the registry of kernel/driver users (reg.c).
 *    - A global spin_lock (tipc_port_lock), whose only task is to ensure
 *      consistency where more than one port is involved in an operation,
 *      i.e., when a port is part of a linked list of ports. There are two
 *      such lists: 'port_list', which is used for management, and
 *      'wait_list', which is used to queue ports during congestion.
 *
 * 4: The name table (name_table.c, name_distr.c, subscription.c)
 *    - There is one big read/write lock (tipc_nametbl_lock) protecting the
 *      overall name table structure. Nothing may be added to or removed from
 *      this structure without holding write access to it.
 *    - There is one local spin_lock per sub_sequence, which can be seen as a
 *      sub-domain of the tipc_nametbl_lock domain. It is used only for
 *      translation operations and is needed because a translation steps the
 *      root of the 'publication' linked list between each lookup. It is
 *      always used within the scope of a tipc_nametbl_lock(read).
 *    - A local spin_lock protecting the queue of subscriber events.
*/ static void tipc_net_finalize(struct net *net, u32 addr); int tipc_net_init(struct net *net, u8 *node_id, u32 addr) { if (tipc_own_id(net)) { pr_info("Cannot configure node identity twice\n"); return -1; } pr_info("Started in network mode\n"); if (node_id) tipc_set_node_id(net, node_id); if (addr) tipc_net_finalize(net, addr); return 0; } static void tipc_net_finalize(struct net *net, u32 addr) { struct tipc_net *tn = tipc_net(net); struct tipc_socket_addr sk = {0, addr}; struct tipc_uaddr ua; tipc_uaddr(&ua, TIPC_SERVICE_RANGE, TIPC_CLUSTER_SCOPE, TIPC_NODE_STATE, addr, addr); if (cmpxchg(&tn->node_addr, 0, addr)) return; tipc_set_node_addr(net, addr); tipc_named_reinit(net); tipc_sk_reinit(net); tipc_mon_reinit_self(net); tipc_nametbl_publish(net, &ua, &sk, addr); } void tipc_net_finalize_work(struct work_struct *work) { struct tipc_net *tn = container_of(work, struct tipc_net, work); rtnl_lock(); tipc_net_finalize(tipc_link_net(tn->bcl), tn->trial_addr); rtnl_unlock(); } void tipc_net_stop(struct net *net) { if (!tipc_own_id(net)) return; rtnl_lock(); tipc_bearer_stop(net); tipc_node_stop(net); rtnl_unlock(); pr_info("Left network mode\n"); } static int __tipc_nl_add_net(struct net *net, struct tipc_nl_msg *msg) { struct tipc_net *tn = net_generic(net, tipc_net_id); u64 *w0 = (u64 *)&tn->node_id[0]; u64 *w1 = (u64 *)&tn->node_id[8]; struct nlattr *attrs; void *hdr; hdr = genlmsg_put(msg->skb, msg->portid, msg->seq, &tipc_genl_family, NLM_F_MULTI, TIPC_NL_NET_GET); if (!hdr) return -EMSGSIZE; attrs = nla_nest_start_noflag(msg->skb, TIPC_NLA_NET); if (!attrs) goto msg_full; if (nla_put_u32(msg->skb, TIPC_NLA_NET_ID, tn->net_id)) goto attr_msg_full; if (nla_put_u64_64bit(msg->skb, TIPC_NLA_NET_NODEID, *w0, 0)) goto attr_msg_full; if (nla_put_u64_64bit(msg->skb, TIPC_NLA_NET_NODEID_W1, *w1, 0)) goto attr_msg_full; nla_nest_end(msg->skb, attrs); genlmsg_end(msg->skb, hdr); return 0; attr_msg_full: nla_nest_cancel(msg->skb, attrs); msg_full: genlmsg_cancel(msg->skb, hdr); return -EMSGSIZE; } int tipc_nl_net_dump(struct sk_buff *skb, struct netlink_callback *cb) { struct net *net = sock_net(skb->sk); int err; int done = cb->args[0]; struct tipc_nl_msg msg; if (done) return 0; msg.skb = skb; msg.portid = NETLINK_CB(cb->skb).portid; msg.seq = cb->nlh->nlmsg_seq; err = __tipc_nl_add_net(net, &msg); if (err) goto out; done = 1; out: cb->args[0] = done; return skb->len; } int __tipc_nl_net_set(struct sk_buff *skb, struct genl_info *info) { struct nlattr *attrs[TIPC_NLA_NET_MAX + 1]; struct net *net = sock_net(skb->sk); struct tipc_net *tn = tipc_net(net); int err; if (!info->attrs[TIPC_NLA_NET]) return -EINVAL; err = nla_parse_nested_deprecated(attrs, TIPC_NLA_NET_MAX, info->attrs[TIPC_NLA_NET], tipc_nl_net_policy, info->extack); if (err) return err; /* Can't change net id once TIPC has joined a network */ if (tipc_own_addr(net)) return -EPERM; if (attrs[TIPC_NLA_NET_ID]) { u32 val; val = nla_get_u32(attrs[TIPC_NLA_NET_ID]); if (val < 1 || val > 9999) return -EINVAL; tn->net_id = val; } if (attrs[TIPC_NLA_NET_ADDR]) { u32 addr; addr = nla_get_u32(attrs[TIPC_NLA_NET_ADDR]); if (!addr) return -EINVAL; tn->legacy_addr_format = true; tipc_net_init(net, NULL, addr); } if (attrs[TIPC_NLA_NET_NODEID]) { u8 node_id[NODE_ID_LEN]; u64 *w0 = (u64 *)&node_id[0]; u64 *w1 = (u64 *)&node_id[8]; if (!attrs[TIPC_NLA_NET_NODEID_W1]) return -EINVAL; *w0 = nla_get_u64(attrs[TIPC_NLA_NET_NODEID]); *w1 = nla_get_u64(attrs[TIPC_NLA_NET_NODEID_W1]); tipc_net_init(net, node_id, 0); } return 0; } int 
tipc_nl_net_set(struct sk_buff *skb, struct genl_info *info) { int err; rtnl_lock(); err = __tipc_nl_net_set(skb, info); rtnl_unlock(); return err; } static int __tipc_nl_addr_legacy_get(struct net *net, struct tipc_nl_msg *msg) { struct tipc_net *tn = tipc_net(net); struct nlattr *attrs; void *hdr; hdr = genlmsg_put(msg->skb, msg->portid, msg->seq, &tipc_genl_family, 0, TIPC_NL_ADDR_LEGACY_GET); if (!hdr) return -EMSGSIZE; attrs = nla_nest_start(msg->skb, TIPC_NLA_NET); if (!attrs) goto msg_full; if (tn->legacy_addr_format) if (nla_put_flag(msg->skb, TIPC_NLA_NET_ADDR_LEGACY)) goto attr_msg_full; nla_nest_end(msg->skb, attrs); genlmsg_end(msg->skb, hdr); return 0; attr_msg_full: nla_nest_cancel(msg->skb, attrs); msg_full: genlmsg_cancel(msg->skb, hdr); return -EMSGSIZE; } int tipc_nl_net_addr_legacy_get(struct sk_buff *skb, struct genl_info *info) { struct net *net = sock_net(skb->sk); struct tipc_nl_msg msg; struct sk_buff *rep; int err; rep = nlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL); if (!rep) return -ENOMEM; msg.skb = rep; msg.portid = info->snd_portid; msg.seq = info->snd_seq; err = __tipc_nl_addr_legacy_get(net, &msg); if (err) { nlmsg_free(msg.skb); return err; } return genlmsg_reply(msg.skb, info); }
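tipc_net_finalize() above relies on cmpxchg() so that finalization runs exactly once even if several callers race on it. A minimal sketch of that publish-once idiom (names are illustrative, not from the original file):

/* Only the caller that swaps 0 -> addr proceeds; cmpxchg() returns the old
 * value, which is non-zero for every later (or concurrently losing) caller. */
static u32 installed_addr;

static void install_addr_once(u32 addr)
{
        if (cmpxchg(&installed_addr, 0, addr))
                return;         /* already finalized by someone else */

        /* ...one-time setup using addr goes here... */
}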
26237 523 26166 26179 17746 675 673 674 22387 1214 1213 1212 4769 5707 20691 4979 4774 5710 20682 20603 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 /* SPDX-License-Identifier: GPL-2.0-only */ /* * AppArmor security module * * This file contains AppArmor contexts used to associate "labels" to objects. * * Copyright (C) 1998-2008 Novell/SUSE * Copyright 2009-2010 Canonical Ltd. */ #ifndef __AA_CONTEXT_H #define __AA_CONTEXT_H #include <linux/cred.h> #include <linux/slab.h> #include <linux/sched.h> #include "label.h" #include "policy_ns.h" #include "task.h" static inline struct aa_label *cred_label(const struct cred *cred) { struct aa_label **blob = cred->security + apparmor_blob_sizes.lbs_cred; AA_BUG(!blob); return *blob; } static inline void set_cred_label(const struct cred *cred, struct aa_label *label) { struct aa_label **blob = cred->security + apparmor_blob_sizes.lbs_cred; AA_BUG(!blob); *blob = label; } /** * aa_cred_raw_label - obtain cred's label * @cred: cred to obtain label from (NOT NULL) * * Returns: confining label * * does NOT increment reference count */ static inline struct aa_label *aa_cred_raw_label(const struct cred *cred) { struct aa_label *label = cred_label(cred); AA_BUG(!label); return label; } /** * aa_get_newest_cred_label - obtain the newest label on a cred * @cred: cred to obtain label from (NOT NULL) * * Returns: newest version of confining label */ static inline struct aa_label *aa_get_newest_cred_label(const struct cred *cred) { return aa_get_newest_label(aa_cred_raw_label(cred)); } static inline struct aa_label *aa_get_newest_cred_label_condref(const struct cred *cred, bool *needput) { struct aa_label *l = aa_cred_raw_label(cred); if (unlikely(label_is_stale(l))) { *needput = true; return aa_get_newest_label(l); } *needput = false; return l; } static inline void aa_put_label_condref(struct aa_label *l, bool needput) { if (unlikely(needput)) aa_put_label(l); } /** * aa_current_raw_label - find the current tasks confining label * * Returns: up to date confining label or the ns unconfined label (NOT NULL) * * This fn will not update the tasks cred to the most up to date version * of the label so it is safe to call when inside of locks. */ static inline struct aa_label *aa_current_raw_label(void) { return aa_cred_raw_label(current_cred()); } /** * aa_get_current_label - get the newest version of the current tasks label * * Returns: newest version of confining label (NOT NULL) * * This fn will not update the tasks cred, so it is safe inside of locks * * The returned reference must be put with aa_put_label() */ static inline struct aa_label *aa_get_current_label(void) { struct aa_label *l = aa_current_raw_label(); if (label_is_stale(l)) return aa_get_newest_label(l); return aa_get_label(l); } /** * __end_current_label_crit_section - end crit section begun with __begin_... 
* @label: label obtained from __begin_current_label_crit_section * @needput: output: bool set by __begin_current_label_crit_section * * Returns: label to use for this crit section */ static inline void __end_current_label_crit_section(struct aa_label *label, bool needput) { if (unlikely(needput)) aa_put_label(label); } /** * end_current_label_crit_section - put a reference found with begin_current_label.. * @label: label reference to put * * Should only be used with a reference obtained with * begin_current_label_crit_section and never used in situations where the * task cred may be updated */ static inline void end_current_label_crit_section(struct aa_label *label) { if (label != aa_current_raw_label()) aa_put_label(label); } /** * __begin_current_label_crit_section - current's confining label * @needput: store whether the label needs to be put when ending crit section * * Returns: up to date confining label or the ns unconfined label (NOT NULL) * * safe to call inside locks * * The returned reference must be put with __end_current_label_crit_section() * This must NOT be used if the task cred could be updated within the * critical section between __begin_current_label_crit_section() .. * __end_current_label_crit_section() */ static inline struct aa_label *__begin_current_label_crit_section(bool *needput) { struct aa_label *label = aa_current_raw_label(); if (label_is_stale(label)) { *needput = true; return aa_get_newest_label(label); } *needput = false; return label; } /** * begin_current_label_crit_section - current's confining label and update it * * Returns: up to date confining label or the ns unconfined label (NOT NULL) * * Not safe to call inside locks * * The returned reference must be put with end_current_label_crit_section() * This must NOT be used if the task cred could be updated within the * critical section between begin_current_label_crit_section() .. * end_current_label_crit_section() */ static inline struct aa_label *begin_current_label_crit_section(void) { struct aa_label *label = aa_current_raw_label(); might_sleep(); if (label_is_stale(label)) { label = aa_get_newest_label(label); if (aa_replace_current_label(label) == 0) /* task cred will keep the reference */ aa_put_label(label); } return label; } static inline struct aa_ns *aa_get_current_ns(void) { struct aa_label *label; struct aa_ns *ns; bool needput; label = __begin_current_label_crit_section(&needput); ns = aa_get_ns(labels_ns(label)); __end_current_label_crit_section(label, needput); return ns; } #endif /* __AA_CONTEXT_H */
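A sketch of how the critical-section helpers above are meant to be paired by callers (both wrapper functions are hypothetical; the pairing rules follow the kerneldoc comments in the header):

static void example_use_label(void)
{
        struct aa_label *label;

        /* May update the task cred to the newest label; not safe under locks. */
        label = begin_current_label_crit_section();
        /* ...use @label; the reference stays valid until the matching end... */
        end_current_label_crit_section(label);
}

static void example_use_label_locked(void)
{
        struct aa_label *label;
        bool needput;

        /* Never touches the task cred, so this pairing is safe inside locks. */
        label = __begin_current_label_crit_section(&needput);
        /* ...use @label... */
        __end_current_label_crit_section(label, needput);
}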
6 6 6 20 22 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 // SPDX-License-Identifier: GPL-2.0-or-later /* * Crypto API support for SHA-224, SHA-256, HMAC-SHA224, and HMAC-SHA256 * * Copyright (c) Jean-Luc Cooke <jlcooke@certainkey.com> * Copyright (c) Andrew McDonald <andrew@mcdonald.org.uk> * Copyright (c) 2002 James Morris <jmorris@intercode.com.au> * SHA224 Support Copyright 2007 Intel Corporation <jonathan.lynch@intel.com> * Copyright 2025 Google LLC */ #include <crypto/internal/hash.h> #include <crypto/sha2.h> #include <linux/kernel.h> #include <linux/module.h> /* * Export and import functions. crypto_shash wants a particular format that * matches that used by some legacy drivers. It currently is the same as the * library SHA context, except the value in bytecount must be block-aligned and * the remainder must be stored in an extra u8 appended to the struct. 
*/ #define SHA256_SHASH_STATE_SIZE 105 static_assert(offsetof(struct __sha256_ctx, state) == 0); static_assert(offsetof(struct __sha256_ctx, bytecount) == 32); static_assert(offsetof(struct __sha256_ctx, buf) == 40); static_assert(sizeof(struct __sha256_ctx) + 1 == SHA256_SHASH_STATE_SIZE); static int __crypto_sha256_export(const struct __sha256_ctx *ctx0, void *out) { struct __sha256_ctx ctx = *ctx0; unsigned int partial; u8 *p = out; partial = ctx.bytecount % SHA256_BLOCK_SIZE; ctx.bytecount -= partial; memcpy(p, &ctx, sizeof(ctx)); p += sizeof(ctx); *p = partial; return 0; } static int __crypto_sha256_import(struct __sha256_ctx *ctx, const void *in) { const u8 *p = in; memcpy(ctx, p, sizeof(*ctx)); p += sizeof(*ctx); ctx->bytecount += *p; return 0; } static int __crypto_sha256_export_core(const struct __sha256_ctx *ctx, void *out) { memcpy(out, ctx, offsetof(struct __sha256_ctx, buf)); return 0; } static int __crypto_sha256_import_core(struct __sha256_ctx *ctx, const void *in) { memcpy(ctx, in, offsetof(struct __sha256_ctx, buf)); return 0; } /* SHA-224 */ const u8 sha224_zero_message_hash[SHA224_DIGEST_SIZE] = { 0xd1, 0x4a, 0x02, 0x8c, 0x2a, 0x3a, 0x2b, 0xc9, 0x47, 0x61, 0x02, 0xbb, 0x28, 0x82, 0x34, 0xc4, 0x15, 0xa2, 0xb0, 0x1f, 0x82, 0x8e, 0xa6, 0x2a, 0xc5, 0xb3, 0xe4, 0x2f }; EXPORT_SYMBOL_GPL(sha224_zero_message_hash); #define SHA224_CTX(desc) ((struct sha224_ctx *)shash_desc_ctx(desc)) static int crypto_sha224_init(struct shash_desc *desc) { sha224_init(SHA224_CTX(desc)); return 0; } static int crypto_sha224_update(struct shash_desc *desc, const u8 *data, unsigned int len) { sha224_update(SHA224_CTX(desc), data, len); return 0; } static int crypto_sha224_final(struct shash_desc *desc, u8 *out) { sha224_final(SHA224_CTX(desc), out); return 0; } static int crypto_sha224_digest(struct shash_desc *desc, const u8 *data, unsigned int len, u8 *out) { sha224(data, len, out); return 0; } static int crypto_sha224_export(struct shash_desc *desc, void *out) { return __crypto_sha256_export(&SHA224_CTX(desc)->ctx, out); } static int crypto_sha224_import(struct shash_desc *desc, const void *in) { return __crypto_sha256_import(&SHA224_CTX(desc)->ctx, in); } static int crypto_sha224_export_core(struct shash_desc *desc, void *out) { return __crypto_sha256_export_core(&SHA224_CTX(desc)->ctx, out); } static int crypto_sha224_import_core(struct shash_desc *desc, const void *in) { return __crypto_sha256_import_core(&SHA224_CTX(desc)->ctx, in); } /* SHA-256 */ const u8 sha256_zero_message_hash[SHA256_DIGEST_SIZE] = { 0xe3, 0xb0, 0xc4, 0x42, 0x98, 0xfc, 0x1c, 0x14, 0x9a, 0xfb, 0xf4, 0xc8, 0x99, 0x6f, 0xb9, 0x24, 0x27, 0xae, 0x41, 0xe4, 0x64, 0x9b, 0x93, 0x4c, 0xa4, 0x95, 0x99, 0x1b, 0x78, 0x52, 0xb8, 0x55 }; EXPORT_SYMBOL_GPL(sha256_zero_message_hash); #define SHA256_CTX(desc) ((struct sha256_ctx *)shash_desc_ctx(desc)) static int crypto_sha256_init(struct shash_desc *desc) { sha256_init(SHA256_CTX(desc)); return 0; } static int crypto_sha256_update(struct shash_desc *desc, const u8 *data, unsigned int len) { sha256_update(SHA256_CTX(desc), data, len); return 0; } static int crypto_sha256_final(struct shash_desc *desc, u8 *out) { sha256_final(SHA256_CTX(desc), out); return 0; } static int crypto_sha256_digest(struct shash_desc *desc, const u8 *data, unsigned int len, u8 *out) { sha256(data, len, out); return 0; } static int crypto_sha256_export(struct shash_desc *desc, void *out) { return __crypto_sha256_export(&SHA256_CTX(desc)->ctx, out); } static int crypto_sha256_import(struct shash_desc *desc, const 
void *in) { return __crypto_sha256_import(&SHA256_CTX(desc)->ctx, in); } static int crypto_sha256_export_core(struct shash_desc *desc, void *out) { return __crypto_sha256_export_core(&SHA256_CTX(desc)->ctx, out); } static int crypto_sha256_import_core(struct shash_desc *desc, const void *in) { return __crypto_sha256_import_core(&SHA256_CTX(desc)->ctx, in); } /* HMAC-SHA224 */ #define HMAC_SHA224_KEY(tfm) ((struct hmac_sha224_key *)crypto_shash_ctx(tfm)) #define HMAC_SHA224_CTX(desc) ((struct hmac_sha224_ctx *)shash_desc_ctx(desc)) static int crypto_hmac_sha224_setkey(struct crypto_shash *tfm, const u8 *raw_key, unsigned int keylen) { hmac_sha224_preparekey(HMAC_SHA224_KEY(tfm), raw_key, keylen); return 0; } static int crypto_hmac_sha224_init(struct shash_desc *desc) { hmac_sha224_init(HMAC_SHA224_CTX(desc), HMAC_SHA224_KEY(desc->tfm)); return 0; } static int crypto_hmac_sha224_update(struct shash_desc *desc, const u8 *data, unsigned int len) { hmac_sha224_update(HMAC_SHA224_CTX(desc), data, len); return 0; } static int crypto_hmac_sha224_final(struct shash_desc *desc, u8 *out) { hmac_sha224_final(HMAC_SHA224_CTX(desc), out); return 0; } static int crypto_hmac_sha224_digest(struct shash_desc *desc, const u8 *data, unsigned int len, u8 *out) { hmac_sha224(HMAC_SHA224_KEY(desc->tfm), data, len, out); return 0; } static int crypto_hmac_sha224_export(struct shash_desc *desc, void *out) { return __crypto_sha256_export(&HMAC_SHA224_CTX(desc)->ctx.sha_ctx, out); } static int crypto_hmac_sha224_import(struct shash_desc *desc, const void *in) { struct hmac_sha224_ctx *ctx = HMAC_SHA224_CTX(desc); ctx->ctx.ostate = HMAC_SHA224_KEY(desc->tfm)->key.ostate; return __crypto_sha256_import(&ctx->ctx.sha_ctx, in); } static int crypto_hmac_sha224_export_core(struct shash_desc *desc, void *out) { return __crypto_sha256_export_core(&HMAC_SHA224_CTX(desc)->ctx.sha_ctx, out); } static int crypto_hmac_sha224_import_core(struct shash_desc *desc, const void *in) { struct hmac_sha224_ctx *ctx = HMAC_SHA224_CTX(desc); ctx->ctx.ostate = HMAC_SHA224_KEY(desc->tfm)->key.ostate; return __crypto_sha256_import_core(&ctx->ctx.sha_ctx, in); } /* HMAC-SHA256 */ #define HMAC_SHA256_KEY(tfm) ((struct hmac_sha256_key *)crypto_shash_ctx(tfm)) #define HMAC_SHA256_CTX(desc) ((struct hmac_sha256_ctx *)shash_desc_ctx(desc)) static int crypto_hmac_sha256_setkey(struct crypto_shash *tfm, const u8 *raw_key, unsigned int keylen) { hmac_sha256_preparekey(HMAC_SHA256_KEY(tfm), raw_key, keylen); return 0; } static int crypto_hmac_sha256_init(struct shash_desc *desc) { hmac_sha256_init(HMAC_SHA256_CTX(desc), HMAC_SHA256_KEY(desc->tfm)); return 0; } static int crypto_hmac_sha256_update(struct shash_desc *desc, const u8 *data, unsigned int len) { hmac_sha256_update(HMAC_SHA256_CTX(desc), data, len); return 0; } static int crypto_hmac_sha256_final(struct shash_desc *desc, u8 *out) { hmac_sha256_final(HMAC_SHA256_CTX(desc), out); return 0; } static int crypto_hmac_sha256_digest(struct shash_desc *desc, const u8 *data, unsigned int len, u8 *out) { hmac_sha256(HMAC_SHA256_KEY(desc->tfm), data, len, out); return 0; } static int crypto_hmac_sha256_export(struct shash_desc *desc, void *out) { return __crypto_sha256_export(&HMAC_SHA256_CTX(desc)->ctx.sha_ctx, out); } static int crypto_hmac_sha256_import(struct shash_desc *desc, const void *in) { struct hmac_sha256_ctx *ctx = HMAC_SHA256_CTX(desc); ctx->ctx.ostate = HMAC_SHA256_KEY(desc->tfm)->key.ostate; return __crypto_sha256_import(&ctx->ctx.sha_ctx, in); } static int 
crypto_hmac_sha256_export_core(struct shash_desc *desc, void *out) { return __crypto_sha256_export_core(&HMAC_SHA256_CTX(desc)->ctx.sha_ctx, out); } static int crypto_hmac_sha256_import_core(struct shash_desc *desc, const void *in) { struct hmac_sha256_ctx *ctx = HMAC_SHA256_CTX(desc); ctx->ctx.ostate = HMAC_SHA256_KEY(desc->tfm)->key.ostate; return __crypto_sha256_import_core(&ctx->ctx.sha_ctx, in); } /* Algorithm definitions */ static struct shash_alg algs[] = { { .base.cra_name = "sha224", .base.cra_driver_name = "sha224-lib", .base.cra_priority = 300, .base.cra_blocksize = SHA224_BLOCK_SIZE, .base.cra_module = THIS_MODULE, .digestsize = SHA224_DIGEST_SIZE, .init = crypto_sha224_init, .update = crypto_sha224_update, .final = crypto_sha224_final, .digest = crypto_sha224_digest, .export = crypto_sha224_export, .import = crypto_sha224_import, .export_core = crypto_sha224_export_core, .import_core = crypto_sha224_import_core, .descsize = sizeof(struct sha224_ctx), .statesize = SHA256_SHASH_STATE_SIZE, }, { .base.cra_name = "sha256", .base.cra_driver_name = "sha256-lib", .base.cra_priority = 300, .base.cra_blocksize = SHA256_BLOCK_SIZE, .base.cra_module = THIS_MODULE, .digestsize = SHA256_DIGEST_SIZE, .init = crypto_sha256_init, .update = crypto_sha256_update, .final = crypto_sha256_final, .digest = crypto_sha256_digest, .export = crypto_sha256_export, .import = crypto_sha256_import, .export_core = crypto_sha256_export_core, .import_core = crypto_sha256_import_core, .descsize = sizeof(struct sha256_ctx), .statesize = SHA256_SHASH_STATE_SIZE, }, { .base.cra_name = "hmac(sha224)", .base.cra_driver_name = "hmac-sha224-lib", .base.cra_priority = 300, .base.cra_blocksize = SHA224_BLOCK_SIZE, .base.cra_ctxsize = sizeof(struct hmac_sha224_key), .base.cra_module = THIS_MODULE, .digestsize = SHA224_DIGEST_SIZE, .setkey = crypto_hmac_sha224_setkey, .init = crypto_hmac_sha224_init, .update = crypto_hmac_sha224_update, .final = crypto_hmac_sha224_final, .digest = crypto_hmac_sha224_digest, .export = crypto_hmac_sha224_export, .import = crypto_hmac_sha224_import, .export_core = crypto_hmac_sha224_export_core, .import_core = crypto_hmac_sha224_import_core, .descsize = sizeof(struct hmac_sha224_ctx), .statesize = SHA256_SHASH_STATE_SIZE, }, { .base.cra_name = "hmac(sha256)", .base.cra_driver_name = "hmac-sha256-lib", .base.cra_priority = 300, .base.cra_blocksize = SHA256_BLOCK_SIZE, .base.cra_ctxsize = sizeof(struct hmac_sha256_key), .base.cra_module = THIS_MODULE, .digestsize = SHA256_DIGEST_SIZE, .setkey = crypto_hmac_sha256_setkey, .init = crypto_hmac_sha256_init, .update = crypto_hmac_sha256_update, .final = crypto_hmac_sha256_final, .digest = crypto_hmac_sha256_digest, .export = crypto_hmac_sha256_export, .import = crypto_hmac_sha256_import, .export_core = crypto_hmac_sha256_export_core, .import_core = crypto_hmac_sha256_import_core, .descsize = sizeof(struct hmac_sha256_ctx), .statesize = SHA256_SHASH_STATE_SIZE, }, }; static int __init crypto_sha256_mod_init(void) { return crypto_register_shashes(algs, ARRAY_SIZE(algs)); } module_init(crypto_sha256_mod_init); static void __exit crypto_sha256_mod_exit(void) { crypto_unregister_shashes(algs, ARRAY_SIZE(algs)); } module_exit(crypto_sha256_mod_exit); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("Crypto API support for SHA-224, SHA-256, HMAC-SHA224, and HMAC-SHA256"); MODULE_ALIAS_CRYPTO("sha224"); MODULE_ALIAS_CRYPTO("sha224-lib"); MODULE_ALIAS_CRYPTO("sha256"); MODULE_ALIAS_CRYPTO("sha256-lib"); MODULE_ALIAS_CRYPTO("hmac(sha224)"); 
MODULE_ALIAS_CRYPTO("hmac-sha224-lib"); MODULE_ALIAS_CRYPTO("hmac(sha256)"); MODULE_ALIAS_CRYPTO("hmac-sha256-lib");
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (c) 2007-2014 Nicira, Inc.
 */

#include <linux/etherdevice.h>
#include <linux/if.h>
#include <linux/if_vlan.h>
#include <linux/jhash.h>
#include <linux/kernel.h>
#include <linux/list.h>
#include <linux/mutex.h>
#include <linux/percpu.h>
#include <linux/rcupdate.h>
#include <linux/rtnetlink.h>
#include <linux/compat.h>
#include <net/net_namespace.h>
#include <linux/module.h>

#include "datapath.h"
#include "vport.h"
#include "vport-internal_dev.h"

static LIST_HEAD(vport_ops_list);

/* Protected by RCU read lock for reading, ovs_mutex for writing. */
static struct hlist_head *dev_table;
#define VPORT_HASH_BUCKETS 1024

/**
 * ovs_vport_init - initialize vport subsystem
 *
 * Called at module load time to initialize the vport subsystem.
 */
int ovs_vport_init(void)
{
	dev_table = kcalloc(VPORT_HASH_BUCKETS, sizeof(struct hlist_head),
			    GFP_KERNEL);
	if (!dev_table)
		return -ENOMEM;

	return 0;
}

/**
 * ovs_vport_exit - shutdown vport subsystem
 *
 * Called at module exit time to shutdown the vport subsystem.
*/ void ovs_vport_exit(void) { kfree(dev_table); } static struct hlist_head *hash_bucket(const struct net *net, const char *name) { unsigned int hash = jhash(name, strlen(name), (unsigned long) net); return &dev_table[hash & (VPORT_HASH_BUCKETS - 1)]; } int __ovs_vport_ops_register(struct vport_ops *ops) { int err = -EEXIST; struct vport_ops *o; ovs_lock(); list_for_each_entry(o, &vport_ops_list, list) if (ops->type == o->type) goto errout; list_add_tail(&ops->list, &vport_ops_list); err = 0; errout: ovs_unlock(); return err; } EXPORT_SYMBOL_GPL(__ovs_vport_ops_register); void ovs_vport_ops_unregister(struct vport_ops *ops) { ovs_lock(); list_del(&ops->list); ovs_unlock(); } EXPORT_SYMBOL_GPL(ovs_vport_ops_unregister); /** * ovs_vport_locate - find a port that has already been created * * @net: network namespace * @name: name of port to find * * Must be called with ovs or RCU read lock. */ struct vport *ovs_vport_locate(const struct net *net, const char *name) { struct hlist_head *bucket = hash_bucket(net, name); struct vport *vport; hlist_for_each_entry_rcu(vport, bucket, hash_node, lockdep_ovsl_is_held()) if (!strcmp(name, ovs_vport_name(vport)) && net_eq(ovs_dp_get_net(vport->dp), net)) return vport; return NULL; } /** * ovs_vport_alloc - allocate and initialize new vport * * @priv_size: Size of private data area to allocate. * @ops: vport device ops * @parms: information about new vport. * * Allocate and initialize a new vport defined by @ops. The vport will contain * a private data area of size @priv_size that can be accessed using * vport_priv(). Some parameters of the vport will be initialized from @parms. * @vports that are no longer needed should be released with * vport_free(). */ struct vport *ovs_vport_alloc(int priv_size, const struct vport_ops *ops, const struct vport_parms *parms) { struct vport *vport; size_t alloc_size; int err; alloc_size = sizeof(struct vport); if (priv_size) { alloc_size = ALIGN(alloc_size, VPORT_ALIGN); alloc_size += priv_size; } vport = kzalloc(alloc_size, GFP_KERNEL); if (!vport) return ERR_PTR(-ENOMEM); vport->upcall_stats = netdev_alloc_pcpu_stats(struct vport_upcall_stats_percpu); if (!vport->upcall_stats) { err = -ENOMEM; goto err_kfree_vport; } vport->dp = parms->dp; vport->port_no = parms->port_no; vport->ops = ops; INIT_HLIST_NODE(&vport->dp_hash_node); if (ovs_vport_set_upcall_portids(vport, parms->upcall_portids)) { err = -EINVAL; goto err_free_percpu; } return vport; err_free_percpu: free_percpu(vport->upcall_stats); err_kfree_vport: kfree(vport); return ERR_PTR(err); } EXPORT_SYMBOL_GPL(ovs_vport_alloc); /** * ovs_vport_free - uninitialize and free vport * * @vport: vport to free * * Frees a vport allocated with vport_alloc() when it is no longer needed. * * The caller must ensure that an RCU grace period has passed since the last * time @vport was in a datapath. */ void ovs_vport_free(struct vport *vport) { /* vport is freed from RCU callback or error path, Therefore * it is safe to use raw dereference. */ kfree(rcu_dereference_raw(vport->upcall_portids)); free_percpu(vport->upcall_stats); kfree(vport); } EXPORT_SYMBOL_GPL(ovs_vport_free); static struct vport_ops *ovs_vport_lookup(const struct vport_parms *parms) { struct vport_ops *ops; list_for_each_entry(ops, &vport_ops_list, list) if (ops->type == parms->type) return ops; return NULL; } /** * ovs_vport_add - add vport device (for kernel callers) * * @parms: Information about new vport. * * Creates a new vport with the specified configuration (which is dependent on * device type). 
ovs_mutex must be held. */ struct vport *ovs_vport_add(const struct vport_parms *parms) { struct vport_ops *ops; struct vport *vport; ops = ovs_vport_lookup(parms); if (ops) { struct hlist_head *bucket; if (!try_module_get(ops->owner)) return ERR_PTR(-EAFNOSUPPORT); vport = ops->create(parms); if (IS_ERR(vport)) { module_put(ops->owner); return vport; } bucket = hash_bucket(ovs_dp_get_net(vport->dp), ovs_vport_name(vport)); hlist_add_head_rcu(&vport->hash_node, bucket); return vport; } /* Unlock to attempt module load and return -EAGAIN if load * was successful as we need to restart the port addition * workflow. */ ovs_unlock(); request_module("vport-type-%d", parms->type); ovs_lock(); if (!ovs_vport_lookup(parms)) return ERR_PTR(-EAFNOSUPPORT); else return ERR_PTR(-EAGAIN); } /** * ovs_vport_set_options - modify existing vport device (for kernel callers) * * @vport: vport to modify. * @options: New configuration. * * Modifies an existing device with the specified configuration (which is * dependent on device type). ovs_mutex must be held. */ int ovs_vport_set_options(struct vport *vport, struct nlattr *options) { if (!vport->ops->set_options) return -EOPNOTSUPP; return vport->ops->set_options(vport, options); } /** * ovs_vport_del - delete existing vport device * * @vport: vport to delete. * * Detaches @vport from its datapath and destroys it. ovs_mutex must * be held. */ void ovs_vport_del(struct vport *vport) { hlist_del_rcu(&vport->hash_node); module_put(vport->ops->owner); vport->ops->destroy(vport); } /** * ovs_vport_get_stats - retrieve device stats * * @vport: vport from which to retrieve the stats * @stats: location to store stats * * Retrieves transmit, receive, and error stats for the given device. * * Must be called with ovs_mutex or rcu_read_lock. */ void ovs_vport_get_stats(struct vport *vport, struct ovs_vport_stats *stats) { const struct rtnl_link_stats64 *dev_stats; struct rtnl_link_stats64 temp; dev_stats = dev_get_stats(vport->dev, &temp); stats->rx_errors = dev_stats->rx_errors; stats->tx_errors = dev_stats->tx_errors; stats->tx_dropped = dev_stats->tx_dropped; stats->rx_dropped = dev_stats->rx_dropped; stats->rx_bytes = dev_stats->rx_bytes; stats->rx_packets = dev_stats->rx_packets; stats->tx_bytes = dev_stats->tx_bytes; stats->tx_packets = dev_stats->tx_packets; } /** * ovs_vport_get_upcall_stats - retrieve upcall stats * * @vport: vport from which to retrieve the stats. * @skb: sk_buff where upcall stats should be appended. * * Retrieves upcall stats for the given device. * * Must be called with ovs_mutex or rcu_read_lock. 
*/ int ovs_vport_get_upcall_stats(struct vport *vport, struct sk_buff *skb) { struct nlattr *nla; int i; __u64 tx_success = 0; __u64 tx_fail = 0; for_each_possible_cpu(i) { const struct vport_upcall_stats_percpu *stats; unsigned int start; stats = per_cpu_ptr(vport->upcall_stats, i); do { start = u64_stats_fetch_begin(&stats->syncp); tx_success += u64_stats_read(&stats->n_success); tx_fail += u64_stats_read(&stats->n_fail); } while (u64_stats_fetch_retry(&stats->syncp, start)); } nla = nla_nest_start_noflag(skb, OVS_VPORT_ATTR_UPCALL_STATS); if (!nla) return -EMSGSIZE; if (nla_put_u64_64bit(skb, OVS_VPORT_UPCALL_ATTR_SUCCESS, tx_success, OVS_VPORT_ATTR_PAD)) { nla_nest_cancel(skb, nla); return -EMSGSIZE; } if (nla_put_u64_64bit(skb, OVS_VPORT_UPCALL_ATTR_FAIL, tx_fail, OVS_VPORT_ATTR_PAD)) { nla_nest_cancel(skb, nla); return -EMSGSIZE; } nla_nest_end(skb, nla); return 0; } /** * ovs_vport_get_options - retrieve device options * * @vport: vport from which to retrieve the options. * @skb: sk_buff where options should be appended. * * Retrieves the configuration of the given device, appending an * %OVS_VPORT_ATTR_OPTIONS attribute that in turn contains nested * vport-specific attributes to @skb. * * Returns 0 if successful, -EMSGSIZE if @skb has insufficient room, or another * negative error code if a real error occurred. If an error occurs, @skb is * left unmodified. * * Must be called with ovs_mutex or rcu_read_lock. */ int ovs_vport_get_options(const struct vport *vport, struct sk_buff *skb) { struct nlattr *nla; int err; if (!vport->ops->get_options) return 0; nla = nla_nest_start_noflag(skb, OVS_VPORT_ATTR_OPTIONS); if (!nla) return -EMSGSIZE; err = vport->ops->get_options(vport, skb); if (err) { nla_nest_cancel(skb, nla); return err; } nla_nest_end(skb, nla); return 0; } /** * ovs_vport_set_upcall_portids - set upcall portids of @vport. * * @vport: vport to modify. * @ids: new configuration, an array of port ids. * * Sets the vport's upcall_portids to @ids. * * Returns 0 if successful, -EINVAL if @ids is zero length or cannot be parsed * as an array of U32. * * Must be called with ovs_mutex. */ int ovs_vport_set_upcall_portids(struct vport *vport, const struct nlattr *ids) { struct vport_portids *old, *vport_portids; if (!nla_len(ids) || nla_len(ids) % sizeof(u32)) return -EINVAL; old = ovsl_dereference(vport->upcall_portids); vport_portids = kmalloc(sizeof(*vport_portids) + nla_len(ids), GFP_KERNEL); if (!vport_portids) return -ENOMEM; vport_portids->n_ids = nla_len(ids) / sizeof(u32); vport_portids->rn_ids = reciprocal_value(vport_portids->n_ids); nla_memcpy(vport_portids->ids, ids, nla_len(ids)); rcu_assign_pointer(vport->upcall_portids, vport_portids); if (old) kfree_rcu(old, rcu); return 0; } /** * ovs_vport_get_upcall_portids - get the upcall_portids of @vport. * * @vport: vport from which to retrieve the portids. * @skb: sk_buff where portids should be appended. * * Retrieves the configuration of the given vport, appending the * %OVS_VPORT_ATTR_UPCALL_PID attribute which is the array of upcall * portids to @skb. * * Returns 0 if successful, -EMSGSIZE if @skb has insufficient room. * If an error occurs, @skb is left unmodified. Must be called with * ovs_mutex or rcu_read_lock. 
*/ int ovs_vport_get_upcall_portids(const struct vport *vport, struct sk_buff *skb) { struct vport_portids *ids; ids = rcu_dereference_ovsl(vport->upcall_portids); if (vport->dp->user_features & OVS_DP_F_VPORT_PIDS) return nla_put(skb, OVS_VPORT_ATTR_UPCALL_PID, ids->n_ids * sizeof(u32), (void *)ids->ids); else return nla_put_u32(skb, OVS_VPORT_ATTR_UPCALL_PID, ids->ids[0]); } /** * ovs_vport_find_upcall_portid - find the upcall portid to send upcall. * * @vport: vport from which the missed packet is received. * @skb: skb that the missed packet was received. * * Uses the skb_get_hash() to select the upcall portid to send the * upcall. * * Returns the portid of the target socket. Must be called with rcu_read_lock. */ u32 ovs_vport_find_upcall_portid(const struct vport *vport, struct sk_buff *skb) { struct vport_portids *ids; u32 ids_index; u32 hash; ids = rcu_dereference(vport->upcall_portids); /* If there is only one portid, select it in the fast-path. */ if (ids->n_ids == 1) return ids->ids[0]; hash = skb_get_hash(skb); ids_index = hash - ids->n_ids * reciprocal_divide(hash, ids->rn_ids); return ids->ids[ids_index]; } /** * ovs_vport_receive - pass up received packet to the datapath for processing * * @vport: vport that received the packet * @skb: skb that was received * @tun_info: tunnel (if any) that carried packet * * Must be called with rcu_read_lock. The packet cannot be shared and * skb->data should point to the Ethernet header. */ int ovs_vport_receive(struct vport *vport, struct sk_buff *skb, const struct ip_tunnel_info *tun_info) { struct sw_flow_key key; int error; OVS_CB(skb)->input_vport = vport; OVS_CB(skb)->mru = 0; OVS_CB(skb)->cutlen = 0; OVS_CB(skb)->probability = 0; OVS_CB(skb)->upcall_pid = 0; if (unlikely(dev_net(skb->dev) != ovs_dp_get_net(vport->dp))) { u32 mark; mark = skb->mark; skb_scrub_packet(skb, true); skb->mark = mark; tun_info = NULL; } /* Extract flow from 'skb' into 'key'. */ error = ovs_flow_key_extract(tun_info, skb, &key); if (unlikely(error)) { kfree_skb(skb); return error; } ovs_dp_process_packet(skb, &key); return 0; } static int packet_length(const struct sk_buff *skb, struct net_device *dev) { int length = skb->len - dev->hard_header_len; if (!skb_vlan_tag_present(skb) && eth_type_vlan(skb->protocol)) length -= VLAN_HLEN; /* Don't subtract for multiple VLAN tags. Most (all?) drivers allow * (ETH_LEN + VLAN_HLEN) in addition to the mtu value, but almost none * account for 802.1ad. e.g. is_skb_forwardable(). */ return length > 0 ? length : 0; } void ovs_vport_send(struct vport *vport, struct sk_buff *skb, u8 mac_proto) { int mtu = vport->dev->mtu; switch (vport->dev->type) { case ARPHRD_NONE: if (mac_proto == MAC_PROTO_ETHERNET) { skb_reset_network_header(skb); skb_reset_mac_len(skb); skb->protocol = htons(ETH_P_TEB); } else if (mac_proto != MAC_PROTO_NONE) { WARN_ON_ONCE(1); goto drop; } break; case ARPHRD_ETHER: if (mac_proto != MAC_PROTO_ETHERNET) goto drop; break; default: goto drop; } if (unlikely(packet_length(skb, vport->dev) > mtu && !skb_is_gso(skb))) { vport->dev->stats.tx_errors++; if (vport->dev->flags & IFF_UP) net_warn_ratelimited("%s: dropped over-mtu packet: " "%d > %d\n", vport->dev->name, packet_length(skb, vport->dev), mtu); goto drop; } skb->dev = vport->dev; skb_clear_tstamp(skb); vport->ops->send(skb); return; drop: kfree_skb(skb); }
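/*
 * Illustrative sketch (not part of vport.c): ovs_vport_find_upcall_portid()
 * above picks a Netlink portid by reducing skb_get_hash() modulo the number
 * of configured portids; reciprocal_divide() is only a divide-free way of
 * computing that modulo.  The standalone mock below uses a plain '%' to show
 * the same selection logic; mock_* names are assumptions for the example,
 * not kernel symbols.
 */
#include <assert.h>
#include <stdint.h>

static uint32_t mock_select_portid(const uint32_t *ids, uint32_t n_ids,
				   uint32_t skb_hash)
{
	/* Fast path: a single portid needs no hashing at all. */
	if (n_ids == 1)
		return ids[0];

	/* Equivalent to: hash - n_ids * reciprocal_divide(hash, rn_ids). */
	return ids[skb_hash % n_ids];
}

int main(void)
{
	uint32_t ids[] = { 1001, 1002, 1003 };

	/* The same flow hash always maps to the same upcall socket. */
	assert(mock_select_portid(ids, 3, 0xdeadbeef) ==
	       mock_select_portid(ids, 3, 0xdeadbeef));
	assert(mock_select_portid(ids, 1, 0x12345678) == 1001);
	return 0;
}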
// SPDX-License-Identifier: GPL-2.0-or-later
/* X.509 certificate parser
 *
 * Copyright (C) 2012 Red Hat, Inc. All Rights Reserved.
* Written by David Howells (dhowells@redhat.com) */ #define pr_fmt(fmt) "X.509: "fmt #include <linux/kernel.h> #include <linux/export.h> #include <linux/slab.h> #include <linux/err.h> #include <linux/oid_registry.h> #include <crypto/public_key.h> #include "x509_parser.h" #include "x509.asn1.h" #include "x509_akid.asn1.h" struct x509_parse_context { struct x509_certificate *cert; /* Certificate being constructed */ unsigned long data; /* Start of data */ const void *key; /* Key data */ size_t key_size; /* Size of key data */ const void *params; /* Key parameters */ size_t params_size; /* Size of key parameters */ enum OID key_algo; /* Algorithm used by the cert's key */ enum OID last_oid; /* Last OID encountered */ enum OID sig_algo; /* Algorithm used to sign the cert */ u8 o_size; /* Size of organizationName (O) */ u8 cn_size; /* Size of commonName (CN) */ u8 email_size; /* Size of emailAddress */ u16 o_offset; /* Offset of organizationName (O) */ u16 cn_offset; /* Offset of commonName (CN) */ u16 email_offset; /* Offset of emailAddress */ unsigned raw_akid_size; const void *raw_akid; /* Raw authorityKeyId in ASN.1 */ const void *akid_raw_issuer; /* Raw directoryName in authorityKeyId */ unsigned akid_raw_issuer_size; }; /* * Free an X.509 certificate */ void x509_free_certificate(struct x509_certificate *cert) { if (cert) { public_key_free(cert->pub); public_key_signature_free(cert->sig); kfree(cert->issuer); kfree(cert->subject); kfree(cert->id); kfree(cert->skid); kfree(cert); } } EXPORT_SYMBOL_GPL(x509_free_certificate); /* * Parse an X.509 certificate */ struct x509_certificate *x509_cert_parse(const void *data, size_t datalen) { struct x509_certificate *cert __free(x509_free_certificate) = NULL; struct x509_parse_context *ctx __free(kfree) = NULL; struct asymmetric_key_id *kid; long ret; cert = kzalloc(sizeof(struct x509_certificate), GFP_KERNEL); if (!cert) return ERR_PTR(-ENOMEM); cert->pub = kzalloc(sizeof(struct public_key), GFP_KERNEL); if (!cert->pub) return ERR_PTR(-ENOMEM); cert->sig = kzalloc(sizeof(struct public_key_signature), GFP_KERNEL); if (!cert->sig) return ERR_PTR(-ENOMEM); ctx = kzalloc(sizeof(struct x509_parse_context), GFP_KERNEL); if (!ctx) return ERR_PTR(-ENOMEM); ctx->cert = cert; ctx->data = (unsigned long)data; /* Attempt to decode the certificate */ ret = asn1_ber_decoder(&x509_decoder, ctx, data, datalen); if (ret < 0) return ERR_PTR(ret); /* Decode the AuthorityKeyIdentifier */ if (ctx->raw_akid) { pr_devel("AKID: %u %*phN\n", ctx->raw_akid_size, ctx->raw_akid_size, ctx->raw_akid); ret = asn1_ber_decoder(&x509_akid_decoder, ctx, ctx->raw_akid, ctx->raw_akid_size); if (ret < 0) { pr_warn("Couldn't decode AuthKeyIdentifier\n"); return ERR_PTR(ret); } } cert->pub->key = kmemdup(ctx->key, ctx->key_size, GFP_KERNEL); if (!cert->pub->key) return ERR_PTR(-ENOMEM); cert->pub->keylen = ctx->key_size; cert->pub->params = kmemdup(ctx->params, ctx->params_size, GFP_KERNEL); if (!cert->pub->params) return ERR_PTR(-ENOMEM); cert->pub->paramlen = ctx->params_size; cert->pub->algo = ctx->key_algo; /* Grab the signature bits */ ret = x509_get_sig_params(cert); if (ret < 0) return ERR_PTR(ret); /* Generate cert issuer + serial number key ID */ kid = asymmetric_key_generate_id(cert->raw_serial, cert->raw_serial_size, cert->raw_issuer, cert->raw_issuer_size); if (IS_ERR(kid)) return ERR_CAST(kid); cert->id = kid; /* Detect self-signed certificates */ ret = x509_check_for_self_signed(cert); if (ret < 0) return ERR_PTR(ret); return_ptr(cert); } 
EXPORT_SYMBOL_GPL(x509_cert_parse); /* * Note an OID when we find one for later processing when we know how * to interpret it. */ int x509_note_OID(void *context, size_t hdrlen, unsigned char tag, const void *value, size_t vlen) { struct x509_parse_context *ctx = context; ctx->last_oid = look_up_OID(value, vlen); if (ctx->last_oid == OID__NR) { char buffer[50]; sprint_oid(value, vlen, buffer, sizeof(buffer)); pr_debug("Unknown OID: [%lu] %s\n", (unsigned long)value - ctx->data, buffer); } return 0; } /* * Save the position of the TBS data so that we can check the signature over it * later. */ int x509_note_tbs_certificate(void *context, size_t hdrlen, unsigned char tag, const void *value, size_t vlen) { struct x509_parse_context *ctx = context; pr_debug("x509_note_tbs_certificate(,%zu,%02x,%ld,%zu)!\n", hdrlen, tag, (unsigned long)value - ctx->data, vlen); ctx->cert->tbs = value - hdrlen; ctx->cert->tbs_size = vlen + hdrlen; return 0; } /* * Record the algorithm that was used to sign this certificate. */ int x509_note_sig_algo(void *context, size_t hdrlen, unsigned char tag, const void *value, size_t vlen) { struct x509_parse_context *ctx = context; pr_debug("PubKey Algo: %u\n", ctx->last_oid); switch (ctx->last_oid) { default: return -ENOPKG; /* Unsupported combination */ case OID_sha1WithRSAEncryption: ctx->cert->sig->hash_algo = "sha1"; goto rsa_pkcs1; case OID_sha256WithRSAEncryption: ctx->cert->sig->hash_algo = "sha256"; goto rsa_pkcs1; case OID_sha384WithRSAEncryption: ctx->cert->sig->hash_algo = "sha384"; goto rsa_pkcs1; case OID_sha512WithRSAEncryption: ctx->cert->sig->hash_algo = "sha512"; goto rsa_pkcs1; case OID_sha224WithRSAEncryption: ctx->cert->sig->hash_algo = "sha224"; goto rsa_pkcs1; case OID_id_ecdsa_with_sha1: ctx->cert->sig->hash_algo = "sha1"; goto ecdsa; case OID_id_rsassa_pkcs1_v1_5_with_sha3_256: ctx->cert->sig->hash_algo = "sha3-256"; goto rsa_pkcs1; case OID_id_rsassa_pkcs1_v1_5_with_sha3_384: ctx->cert->sig->hash_algo = "sha3-384"; goto rsa_pkcs1; case OID_id_rsassa_pkcs1_v1_5_with_sha3_512: ctx->cert->sig->hash_algo = "sha3-512"; goto rsa_pkcs1; case OID_id_ecdsa_with_sha224: ctx->cert->sig->hash_algo = "sha224"; goto ecdsa; case OID_id_ecdsa_with_sha256: ctx->cert->sig->hash_algo = "sha256"; goto ecdsa; case OID_id_ecdsa_with_sha384: ctx->cert->sig->hash_algo = "sha384"; goto ecdsa; case OID_id_ecdsa_with_sha512: ctx->cert->sig->hash_algo = "sha512"; goto ecdsa; case OID_id_ecdsa_with_sha3_256: ctx->cert->sig->hash_algo = "sha3-256"; goto ecdsa; case OID_id_ecdsa_with_sha3_384: ctx->cert->sig->hash_algo = "sha3-384"; goto ecdsa; case OID_id_ecdsa_with_sha3_512: ctx->cert->sig->hash_algo = "sha3-512"; goto ecdsa; case OID_gost2012Signature256: ctx->cert->sig->hash_algo = "streebog256"; goto ecrdsa; case OID_gost2012Signature512: ctx->cert->sig->hash_algo = "streebog512"; goto ecrdsa; } rsa_pkcs1: ctx->cert->sig->pkey_algo = "rsa"; ctx->cert->sig->encoding = "pkcs1"; ctx->sig_algo = ctx->last_oid; return 0; ecrdsa: ctx->cert->sig->pkey_algo = "ecrdsa"; ctx->cert->sig->encoding = "raw"; ctx->sig_algo = ctx->last_oid; return 0; ecdsa: ctx->cert->sig->pkey_algo = "ecdsa"; ctx->cert->sig->encoding = "x962"; ctx->sig_algo = ctx->last_oid; return 0; } /* * Note the whereabouts and type of the signature. 
*/ int x509_note_signature(void *context, size_t hdrlen, unsigned char tag, const void *value, size_t vlen) { struct x509_parse_context *ctx = context; pr_debug("Signature: alg=%u, size=%zu\n", ctx->last_oid, vlen); /* * In X.509 certificates, the signature's algorithm is stored in two * places: inside the TBSCertificate (the data that is signed), and * alongside the signature. These *must* match. */ if (ctx->last_oid != ctx->sig_algo) { pr_warn("signatureAlgorithm (%u) differs from tbsCertificate.signature (%u)\n", ctx->last_oid, ctx->sig_algo); return -EINVAL; } if (strcmp(ctx->cert->sig->pkey_algo, "rsa") == 0 || strcmp(ctx->cert->sig->pkey_algo, "ecrdsa") == 0 || strcmp(ctx->cert->sig->pkey_algo, "ecdsa") == 0) { /* Discard the BIT STRING metadata */ if (vlen < 1 || *(const u8 *)value != 0) return -EBADMSG; value++; vlen--; } ctx->cert->raw_sig = value; ctx->cert->raw_sig_size = vlen; return 0; } /* * Note the certificate serial number */ int x509_note_serial(void *context, size_t hdrlen, unsigned char tag, const void *value, size_t vlen) { struct x509_parse_context *ctx = context; ctx->cert->raw_serial = value; ctx->cert->raw_serial_size = vlen; return 0; } /* * Note some of the name segments from which we'll fabricate a name. */ int x509_extract_name_segment(void *context, size_t hdrlen, unsigned char tag, const void *value, size_t vlen) { struct x509_parse_context *ctx = context; switch (ctx->last_oid) { case OID_commonName: ctx->cn_size = vlen; ctx->cn_offset = (unsigned long)value - ctx->data; break; case OID_organizationName: ctx->o_size = vlen; ctx->o_offset = (unsigned long)value - ctx->data; break; case OID_email_address: ctx->email_size = vlen; ctx->email_offset = (unsigned long)value - ctx->data; break; default: break; } return 0; } /* * Fabricate and save the issuer and subject names */ static int x509_fabricate_name(struct x509_parse_context *ctx, size_t hdrlen, unsigned char tag, char **_name, size_t vlen) { const void *name, *data = (const void *)ctx->data; size_t namesize; char *buffer; if (*_name) return -EINVAL; /* Empty name string if no material */ if (!ctx->cn_size && !ctx->o_size && !ctx->email_size) { buffer = kzalloc(1, GFP_KERNEL); if (!buffer) return -ENOMEM; goto done; } if (ctx->cn_size && ctx->o_size) { /* Consider combining O and CN, but use only the CN if it is * prefixed by the O, or a significant portion thereof. 
*/ namesize = ctx->cn_size; name = data + ctx->cn_offset; if (ctx->cn_size >= ctx->o_size && memcmp(data + ctx->cn_offset, data + ctx->o_offset, ctx->o_size) == 0) goto single_component; if (ctx->cn_size >= 7 && ctx->o_size >= 7 && memcmp(data + ctx->cn_offset, data + ctx->o_offset, 7) == 0) goto single_component; buffer = kmalloc(ctx->o_size + 2 + ctx->cn_size + 1, GFP_KERNEL); if (!buffer) return -ENOMEM; memcpy(buffer, data + ctx->o_offset, ctx->o_size); buffer[ctx->o_size + 0] = ':'; buffer[ctx->o_size + 1] = ' '; memcpy(buffer + ctx->o_size + 2, data + ctx->cn_offset, ctx->cn_size); buffer[ctx->o_size + 2 + ctx->cn_size] = 0; goto done; } else if (ctx->cn_size) { namesize = ctx->cn_size; name = data + ctx->cn_offset; } else if (ctx->o_size) { namesize = ctx->o_size; name = data + ctx->o_offset; } else { namesize = ctx->email_size; name = data + ctx->email_offset; } single_component: buffer = kmalloc(namesize + 1, GFP_KERNEL); if (!buffer) return -ENOMEM; memcpy(buffer, name, namesize); buffer[namesize] = 0; done: *_name = buffer; ctx->cn_size = 0; ctx->o_size = 0; ctx->email_size = 0; return 0; } int x509_note_issuer(void *context, size_t hdrlen, unsigned char tag, const void *value, size_t vlen) { struct x509_parse_context *ctx = context; struct asymmetric_key_id *kid; ctx->cert->raw_issuer = value; ctx->cert->raw_issuer_size = vlen; if (!ctx->cert->sig->auth_ids[2]) { kid = asymmetric_key_generate_id(value, vlen, "", 0); if (IS_ERR(kid)) return PTR_ERR(kid); ctx->cert->sig->auth_ids[2] = kid; } return x509_fabricate_name(ctx, hdrlen, tag, &ctx->cert->issuer, vlen); } int x509_note_subject(void *context, size_t hdrlen, unsigned char tag, const void *value, size_t vlen) { struct x509_parse_context *ctx = context; ctx->cert->raw_subject = value; ctx->cert->raw_subject_size = vlen; return x509_fabricate_name(ctx, hdrlen, tag, &ctx->cert->subject, vlen); } /* * Extract the parameters for the public key */ int x509_note_params(void *context, size_t hdrlen, unsigned char tag, const void *value, size_t vlen) { struct x509_parse_context *ctx = context; /* * AlgorithmIdentifier is used three times in the x509, we should skip * first and ignore third, using second one which is after subject and * before subjectPublicKey. 
*/ if (!ctx->cert->raw_subject || ctx->key) return 0; ctx->params = value - hdrlen; ctx->params_size = vlen + hdrlen; return 0; } /* * Extract the data for the public key algorithm */ int x509_extract_key_data(void *context, size_t hdrlen, unsigned char tag, const void *value, size_t vlen) { struct x509_parse_context *ctx = context; enum OID oid; ctx->key_algo = ctx->last_oid; switch (ctx->last_oid) { case OID_rsaEncryption: ctx->cert->pub->pkey_algo = "rsa"; break; case OID_gost2012PKey256: case OID_gost2012PKey512: ctx->cert->pub->pkey_algo = "ecrdsa"; break; case OID_id_ecPublicKey: if (parse_OID(ctx->params, ctx->params_size, &oid) != 0) return -EBADMSG; switch (oid) { case OID_id_prime192v1: ctx->cert->pub->pkey_algo = "ecdsa-nist-p192"; break; case OID_id_prime256v1: ctx->cert->pub->pkey_algo = "ecdsa-nist-p256"; break; case OID_id_ansip384r1: ctx->cert->pub->pkey_algo = "ecdsa-nist-p384"; break; case OID_id_ansip521r1: ctx->cert->pub->pkey_algo = "ecdsa-nist-p521"; break; default: return -ENOPKG; } break; default: return -ENOPKG; } /* Discard the BIT STRING metadata */ if (vlen < 1 || *(const u8 *)value != 0) return -EBADMSG; ctx->key = value + 1; ctx->key_size = vlen - 1; return 0; } /* The keyIdentifier in AuthorityKeyIdentifier SEQUENCE is tag(CONT,PRIM,0) */ #define SEQ_TAG_KEYID (ASN1_CONT << 6) /* * Process certificate extensions that are used to qualify the certificate. */ int x509_process_extension(void *context, size_t hdrlen, unsigned char tag, const void *value, size_t vlen) { struct x509_parse_context *ctx = context; struct asymmetric_key_id *kid; const unsigned char *v = value; pr_debug("Extension: %u\n", ctx->last_oid); if (ctx->last_oid == OID_subjectKeyIdentifier) { /* Get hold of the key fingerprint */ if (ctx->cert->skid || vlen < 3) return -EBADMSG; if (v[0] != ASN1_OTS || v[1] != vlen - 2) return -EBADMSG; v += 2; vlen -= 2; ctx->cert->raw_skid_size = vlen; ctx->cert->raw_skid = v; kid = asymmetric_key_generate_id(v, vlen, "", 0); if (IS_ERR(kid)) return PTR_ERR(kid); ctx->cert->skid = kid; pr_debug("subjkeyid %*phN\n", kid->len, kid->data); return 0; } if (ctx->last_oid == OID_keyUsage) { /* * Get hold of the keyUsage bit string * v[1] is the encoding size * (Expect either 0x02 or 0x03, making it 1 or 2 bytes) * v[2] is the number of unused bits in the bit string * (If >= 3 keyCertSign is missing when v[1] = 0x02) * v[3] and possibly v[4] contain the bit string * * From RFC 5280 4.2.1.3: * 0x04 is where keyCertSign lands in this bit string * 0x80 is where digitalSignature lands in this bit string */ if (v[0] != ASN1_BTS) return -EBADMSG; if (vlen < 4) return -EBADMSG; if (v[2] >= 8) return -EBADMSG; if (v[3] & 0x80) ctx->cert->pub->key_eflags |= 1 << KEY_EFLAG_DIGITALSIG; if (v[1] == 0x02 && v[2] <= 2 && (v[3] & 0x04)) ctx->cert->pub->key_eflags |= 1 << KEY_EFLAG_KEYCERTSIGN; else if (vlen > 4 && v[1] == 0x03 && (v[3] & 0x04)) ctx->cert->pub->key_eflags |= 1 << KEY_EFLAG_KEYCERTSIGN; return 0; } if (ctx->last_oid == OID_authorityKeyIdentifier) { /* Get hold of the CA key fingerprint */ ctx->raw_akid = v; ctx->raw_akid_size = vlen; return 0; } if (ctx->last_oid == OID_basicConstraints) { /* * Get hold of the basicConstraints * v[1] is the encoding size * (Expect 0x00 for empty SEQUENCE with CA:FALSE, or * 0x03 or greater for non-empty SEQUENCE) * v[2] is the encoding type * (Expect an ASN1_BOOL for the CA) * v[3] is the length of the ASN1_BOOL * (Expect 1 for a single byte boolean) * v[4] is the contents of the ASN1_BOOL * (Expect 0xFF if the CA is TRUE) * vlen 
should match the entire extension size */ if (v[0] != (ASN1_CONS_BIT | ASN1_SEQ)) return -EBADMSG; if (vlen < 2) return -EBADMSG; if (v[1] != vlen - 2) return -EBADMSG; /* Empty SEQUENCE means CA:FALSE (default value omitted per DER) */ if (v[1] == 0) return 0; if (vlen >= 5 && v[2] == ASN1_BOOL && v[3] == 1 && v[4] == 0xFF) ctx->cert->pub->key_eflags |= 1 << KEY_EFLAG_CA; else return -EBADMSG; return 0; } return 0; } /** * x509_decode_time - Decode an X.509 time ASN.1 object * @_t: The time to fill in * @hdrlen: The length of the object header * @tag: The object tag * @value: The object value * @vlen: The size of the object value * * Decode an ASN.1 universal time or generalised time field into a struct the * kernel can handle and check it for validity. The time is decoded thus: * * [RFC5280 §4.1.2.5] * CAs conforming to this profile MUST always encode certificate validity * dates through the year 2049 as UTCTime; certificate validity dates in * 2050 or later MUST be encoded as GeneralizedTime. Conforming * applications MUST be able to process validity dates that are encoded in * either UTCTime or GeneralizedTime. */ int x509_decode_time(time64_t *_t, size_t hdrlen, unsigned char tag, const unsigned char *value, size_t vlen) { static const unsigned char month_lengths[] = { 31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31 }; const unsigned char *p = value; unsigned year, mon, day, hour, min, sec, mon_len; #define dec2bin(X) ({ unsigned char x = (X) - '0'; if (x > 9) goto invalid_time; x; }) #define DD2bin(P) ({ unsigned x = dec2bin(P[0]) * 10 + dec2bin(P[1]); P += 2; x; }) if (tag == ASN1_UNITIM) { /* UTCTime: YYMMDDHHMMSSZ */ if (vlen != 13) goto unsupported_time; year = DD2bin(p); if (year >= 50) year += 1900; else year += 2000; } else if (tag == ASN1_GENTIM) { /* GenTime: YYYYMMDDHHMMSSZ */ if (vlen != 15) goto unsupported_time; year = DD2bin(p) * 100 + DD2bin(p); if (year >= 1950 && year <= 2049) goto invalid_time; } else { goto unsupported_time; } mon = DD2bin(p); day = DD2bin(p); hour = DD2bin(p); min = DD2bin(p); sec = DD2bin(p); if (*p != 'Z') goto unsupported_time; if (year < 1970 || mon < 1 || mon > 12) goto invalid_time; mon_len = month_lengths[mon - 1]; if (mon == 2) { if (year % 4 == 0) { mon_len = 29; if (year % 100 == 0) { mon_len = 28; if (year % 400 == 0) mon_len = 29; } } } if (day < 1 || day > mon_len || hour > 24 || /* ISO 8601 permits 24:00:00 as midnight tomorrow */ min > 59 || sec > 60) /* ISO 8601 permits leap seconds [X.680 46.3] */ goto invalid_time; *_t = mktime64(year, mon, day, hour, min, sec); return 0; unsupported_time: pr_debug("Got unsupported time [tag %02x]: '%*phN'\n", tag, (int)vlen, value); return -EBADMSG; invalid_time: pr_debug("Got invalid time [tag %02x]: '%*phN'\n", tag, (int)vlen, value); return -EBADMSG; } EXPORT_SYMBOL_GPL(x509_decode_time); int x509_note_not_before(void *context, size_t hdrlen, unsigned char tag, const void *value, size_t vlen) { struct x509_parse_context *ctx = context; return x509_decode_time(&ctx->cert->valid_from, hdrlen, tag, value, vlen); } int x509_note_not_after(void *context, size_t hdrlen, unsigned char tag, const void *value, size_t vlen) { struct x509_parse_context *ctx = context; return x509_decode_time(&ctx->cert->valid_to, hdrlen, tag, value, vlen); } /* * Note a key identifier-based AuthorityKeyIdentifier */ int x509_akid_note_kid(void *context, size_t hdrlen, unsigned char tag, const void *value, size_t vlen) { struct x509_parse_context *ctx = context; struct asymmetric_key_id *kid; pr_debug("AKID: keyid: 
%*phN\n", (int)vlen, value); if (ctx->cert->sig->auth_ids[1]) return 0; kid = asymmetric_key_generate_id(value, vlen, "", 0); if (IS_ERR(kid)) return PTR_ERR(kid); pr_debug("authkeyid %*phN\n", kid->len, kid->data); ctx->cert->sig->auth_ids[1] = kid; return 0; } /* * Note a directoryName in an AuthorityKeyIdentifier */ int x509_akid_note_name(void *context, size_t hdrlen, unsigned char tag, const void *value, size_t vlen) { struct x509_parse_context *ctx = context; pr_debug("AKID: name: %*phN\n", (int)vlen, value); ctx->akid_raw_issuer = value; ctx->akid_raw_issuer_size = vlen; return 0; } /* * Note a serial number in an AuthorityKeyIdentifier */ int x509_akid_note_serial(void *context, size_t hdrlen, unsigned char tag, const void *value, size_t vlen) { struct x509_parse_context *ctx = context; struct asymmetric_key_id *kid; pr_debug("AKID: serial: %*phN\n", (int)vlen, value); if (!ctx->akid_raw_issuer || ctx->cert->sig->auth_ids[0]) return 0; kid = asymmetric_key_generate_id(value, vlen, ctx->akid_raw_issuer, ctx->akid_raw_issuer_size); if (IS_ERR(kid)) return PTR_ERR(kid); pr_debug("authkeyid %*phN\n", kid->len, kid->data); ctx->cert->sig->auth_ids[0] = kid; return 0; }
// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (c) 2022 Meta Platforms, Inc. and affiliates.
 */

#include <linux/types.h>
#include <linux/bpf.h>
#include <linux/bpf_local_storage.h>
#include <uapi/linux/btf.h>
#include <linux/btf_ids.h>

DEFINE_BPF_STORAGE_CACHE(cgroup_cache);

static DEFINE_PER_CPU(int, bpf_cgrp_storage_busy);

static void bpf_cgrp_storage_lock(void)
{
	cant_migrate();
	this_cpu_inc(bpf_cgrp_storage_busy);
}

static void bpf_cgrp_storage_unlock(void)
{
	this_cpu_dec(bpf_cgrp_storage_busy);
}

static bool bpf_cgrp_storage_trylock(void)
{
	cant_migrate();
	if (unlikely(this_cpu_inc_return(bpf_cgrp_storage_busy) != 1)) {
		this_cpu_dec(bpf_cgrp_storage_busy);
		return false;
	}
	return true;
}

static struct bpf_local_storage __rcu **cgroup_storage_ptr(void *owner)
{
	struct cgroup *cg = owner;

	return &cg->bpf_cgrp_storage;
}

void bpf_cgrp_storage_free(struct cgroup *cgroup)
{
	struct bpf_local_storage *local_storage;

	rcu_read_lock_dont_migrate();
	local_storage = rcu_dereference(cgroup->bpf_cgrp_storage);
	if (!local_storage)
		goto out;

	bpf_cgrp_storage_lock();
	bpf_local_storage_destroy(local_storage);
	bpf_cgrp_storage_unlock();
out:
	rcu_read_unlock_migrate();
}

static struct bpf_local_storage_data *
cgroup_storage_lookup(struct cgroup *cgroup, struct bpf_map *map,
		      bool cacheit_lockit)
{
	struct bpf_local_storage *cgroup_storage;
	struct bpf_local_storage_map *smap;

	cgroup_storage = rcu_dereference_check(cgroup->bpf_cgrp_storage,
					       bpf_rcu_lock_held());
	if (!cgroup_storage)
		return NULL;

	smap = (struct bpf_local_storage_map *)map;
	return bpf_local_storage_lookup(cgroup_storage, smap, cacheit_lockit);
}

static void *bpf_cgrp_storage_lookup_elem(struct bpf_map *map, void *key)
{
	struct bpf_local_storage_data *sdata;
	struct cgroup *cgroup;
	int fd;

	fd = *(int *)key;
	cgroup = cgroup_v1v2_get_from_fd(fd);
	if (IS_ERR(cgroup))
		return ERR_CAST(cgroup);

	bpf_cgrp_storage_lock();
	sdata = cgroup_storage_lookup(cgroup, map, true);
	bpf_cgrp_storage_unlock();
	cgroup_put(cgroup);
	return sdata ?
sdata->data : NULL; } static long bpf_cgrp_storage_update_elem(struct bpf_map *map, void *key, void *value, u64 map_flags) { struct bpf_local_storage_data *sdata; struct cgroup *cgroup; int fd; fd = *(int *)key; cgroup = cgroup_v1v2_get_from_fd(fd); if (IS_ERR(cgroup)) return PTR_ERR(cgroup); bpf_cgrp_storage_lock(); sdata = bpf_local_storage_update(cgroup, (struct bpf_local_storage_map *)map, value, map_flags, false, GFP_ATOMIC); bpf_cgrp_storage_unlock(); cgroup_put(cgroup); return PTR_ERR_OR_ZERO(sdata); } static int cgroup_storage_delete(struct cgroup *cgroup, struct bpf_map *map) { struct bpf_local_storage_data *sdata; sdata = cgroup_storage_lookup(cgroup, map, false); if (!sdata) return -ENOENT; bpf_selem_unlink(SELEM(sdata), false); return 0; } static long bpf_cgrp_storage_delete_elem(struct bpf_map *map, void *key) { struct cgroup *cgroup; int err, fd; fd = *(int *)key; cgroup = cgroup_v1v2_get_from_fd(fd); if (IS_ERR(cgroup)) return PTR_ERR(cgroup); bpf_cgrp_storage_lock(); err = cgroup_storage_delete(cgroup, map); bpf_cgrp_storage_unlock(); cgroup_put(cgroup); return err; } static int notsupp_get_next_key(struct bpf_map *map, void *key, void *next_key) { return -ENOTSUPP; } static struct bpf_map *cgroup_storage_map_alloc(union bpf_attr *attr) { return bpf_local_storage_map_alloc(attr, &cgroup_cache, true); } static void cgroup_storage_map_free(struct bpf_map *map) { bpf_local_storage_map_free(map, &cgroup_cache, &bpf_cgrp_storage_busy); } /* *gfp_flags* is a hidden argument provided by the verifier */ BPF_CALL_5(bpf_cgrp_storage_get, struct bpf_map *, map, struct cgroup *, cgroup, void *, value, u64, flags, gfp_t, gfp_flags) { struct bpf_local_storage_data *sdata; bool nobusy; WARN_ON_ONCE(!bpf_rcu_lock_held()); if (flags & ~(BPF_LOCAL_STORAGE_GET_F_CREATE)) return (unsigned long)NULL; if (!cgroup) return (unsigned long)NULL; nobusy = bpf_cgrp_storage_trylock(); sdata = cgroup_storage_lookup(cgroup, map, nobusy); if (sdata) goto unlock; /* only allocate new storage, when the cgroup is refcounted */ if (!percpu_ref_is_dying(&cgroup->self.refcnt) && (flags & BPF_LOCAL_STORAGE_GET_F_CREATE) && nobusy) sdata = bpf_local_storage_update(cgroup, (struct bpf_local_storage_map *)map, value, BPF_NOEXIST, false, gfp_flags); unlock: if (nobusy) bpf_cgrp_storage_unlock(); return IS_ERR_OR_NULL(sdata) ? 
(unsigned long)NULL : (unsigned long)sdata->data; } BPF_CALL_2(bpf_cgrp_storage_delete, struct bpf_map *, map, struct cgroup *, cgroup) { int ret; WARN_ON_ONCE(!bpf_rcu_lock_held()); if (!cgroup) return -EINVAL; if (!bpf_cgrp_storage_trylock()) return -EBUSY; ret = cgroup_storage_delete(cgroup, map); bpf_cgrp_storage_unlock(); return ret; } const struct bpf_map_ops cgrp_storage_map_ops = { .map_meta_equal = bpf_map_meta_equal, .map_alloc_check = bpf_local_storage_map_alloc_check, .map_alloc = cgroup_storage_map_alloc, .map_free = cgroup_storage_map_free, .map_get_next_key = notsupp_get_next_key, .map_lookup_elem = bpf_cgrp_storage_lookup_elem, .map_update_elem = bpf_cgrp_storage_update_elem, .map_delete_elem = bpf_cgrp_storage_delete_elem, .map_check_btf = bpf_local_storage_map_check_btf, .map_mem_usage = bpf_local_storage_map_mem_usage, .map_btf_id = &bpf_local_storage_map_btf_id[0], .map_owner_storage_ptr = cgroup_storage_ptr, }; const struct bpf_func_proto bpf_cgrp_storage_get_proto = { .func = bpf_cgrp_storage_get, .gpl_only = false, .ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL, .arg1_type = ARG_CONST_MAP_PTR, .arg2_type = ARG_PTR_TO_BTF_ID_OR_NULL, .arg2_btf_id = &bpf_cgroup_btf_id[0], .arg3_type = ARG_PTR_TO_MAP_VALUE_OR_NULL, .arg4_type = ARG_ANYTHING, }; const struct bpf_func_proto bpf_cgrp_storage_delete_proto = { .func = bpf_cgrp_storage_delete, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_CONST_MAP_PTR, .arg2_type = ARG_PTR_TO_BTF_ID_OR_NULL, .arg2_btf_id = &bpf_cgroup_btf_id[0], };
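/*
 * Illustrative sketch (not part of this file): the map type served by the
 * ops above (BPF_MAP_TYPE_CGRP_STORAGE) is used from BPF program context via
 * the bpf_cgrp_storage_get()/bpf_cgrp_storage_delete() helpers whose protos
 * are defined above.  The fragment below is a minimal BPF-side example in the
 * style of the kernel selftests; the hook, section name, vmlinux.h include
 * and field accesses are assumptions for illustration and require libbpf and
 * a generated vmlinux.h to build.
 */
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>

char _license[] SEC("license") = "GPL";

/* One counter slot per cgroup, created on first use. */
struct {
	__uint(type, BPF_MAP_TYPE_CGRP_STORAGE);
	__uint(map_flags, BPF_F_NO_PREALLOC);
	__type(key, int);
	__type(value, long);
} cgrp_syscalls SEC(".maps");

SEC("tp_btf/sys_enter")
int BPF_PROG(count_syscalls, struct pt_regs *regs, long id)
{
	struct task_struct *task = bpf_get_current_task_btf();
	long *counter;

	/* Get (or create) this task's cgroup slot in the map. */
	counter = bpf_cgrp_storage_get(&cgrp_syscalls, task->cgroups->dfl_cgrp,
				       0, BPF_LOCAL_STORAGE_GET_F_CREATE);
	if (counter)
		__sync_fetch_and_add(counter, 1);
	return 0;
}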
/*
 * netfilter module to limit the number of parallel tcp
 * connections per IP address.
 * (c) 2000 Gerd Knorr <kraxel@bytesex.org>
 * Nov 2002: Martin Bene <martin.bene@icomedias.com>:
 *	only ignore TIME_WAIT or gone connections
 * (C) CC Computer Consultants GmbH, 2007
 *
 * based on ...
 *
 * Kernel module to match connection tracking information.
 * GPL (C) 1999 Rusty Russell (rusty@rustcorp.com.au).
 */
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/ip.h>
#include <linux/ipv6.h>
#include <linux/module.h>
#include <linux/skbuff.h>
#include <linux/netfilter/x_tables.h>
#include <linux/netfilter/xt_connlimit.h>
#include <net/netfilter/nf_conntrack.h>
#include <net/netfilter/nf_conntrack_core.h>
#include <net/netfilter/nf_conntrack_tuple.h>
#include <net/netfilter/nf_conntrack_zones.h>
#include <net/netfilter/nf_conntrack_count.h>

static bool connlimit_mt(const struct sk_buff *skb,
			 struct xt_action_param *par)
{
	struct net *net = xt_net(par);
	const struct xt_connlimit_info *info = par->matchinfo;
	const struct nf_conntrack_zone *zone = &nf_ct_zone_dflt;
	enum ip_conntrack_info ctinfo;
	const struct nf_conn *ct;
	unsigned int connections;
	u32 key[5];

	ct = nf_ct_get(skb, &ctinfo);
	if (ct)
		zone = nf_ct_zone(ct);

	if (xt_family(par) == NFPROTO_IPV6) {
		const struct ipv6hdr *iph = ipv6_hdr(skb);
		union nf_inet_addr addr;
		unsigned int i;

		memcpy(&addr.ip6, (info->flags & XT_CONNLIMIT_DADDR) ?
		       &iph->daddr : &iph->saddr, sizeof(addr.ip6));

		for (i = 0; i < ARRAY_SIZE(addr.ip6); ++i)
			addr.ip6[i] &= info->mask.ip6[i];
		memcpy(key, &addr, sizeof(addr.ip6));
		key[4] = zone->id;
	} else {
		const struct iphdr *iph = ip_hdr(skb);

		key[0] = (info->flags & XT_CONNLIMIT_DADDR) ?
(__force __u32)iph->daddr : (__force __u32)iph->saddr; key[0] &= (__force __u32)info->mask.ip; key[1] = zone->id; } connections = nf_conncount_count_skb(net, skb, xt_family(par), info->data, key); if (connections == 0) /* kmalloc failed or tuple couldn't be found, drop it entirely */ goto hotdrop; return (connections > info->limit) ^ !!(info->flags & XT_CONNLIMIT_INVERT); hotdrop: par->hotdrop = true; return false; } static int connlimit_mt_check(const struct xt_mtchk_param *par) { struct xt_connlimit_info *info = par->matchinfo; unsigned int keylen; int ret; keylen = sizeof(u32); if (par->family == NFPROTO_IPV6) keylen += sizeof(struct in6_addr); else keylen += sizeof(struct in_addr); ret = nf_ct_netns_get(par->net, par->family); if (ret < 0) { pr_info_ratelimited("cannot load conntrack support for proto=%u\n", par->family); return ret; } /* init private data */ info->data = nf_conncount_init(par->net, keylen); if (IS_ERR(info->data)) nf_ct_netns_put(par->net, par->family); return PTR_ERR_OR_ZERO(info->data); } static void connlimit_mt_destroy(const struct xt_mtdtor_param *par) { const struct xt_connlimit_info *info = par->matchinfo; nf_conncount_destroy(par->net, info->data); nf_ct_netns_put(par->net, par->family); } static struct xt_match connlimit_mt_reg[] __read_mostly = { { .name = "connlimit", .revision = 1, .family = NFPROTO_IPV4, .checkentry = connlimit_mt_check, .match = connlimit_mt, .matchsize = sizeof(struct xt_connlimit_info), .usersize = offsetof(struct xt_connlimit_info, data), .destroy = connlimit_mt_destroy, .me = THIS_MODULE, }, #if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) { .name = "connlimit", .revision = 1, .family = NFPROTO_IPV6, .checkentry = connlimit_mt_check, .match = connlimit_mt, .matchsize = sizeof(struct xt_connlimit_info), .usersize = offsetof(struct xt_connlimit_info, data), .destroy = connlimit_mt_destroy, .me = THIS_MODULE, }, #endif }; static int __init connlimit_mt_init(void) { return xt_register_matches(connlimit_mt_reg, ARRAY_SIZE(connlimit_mt_reg)); } static void __exit connlimit_mt_exit(void) { xt_unregister_matches(connlimit_mt_reg, ARRAY_SIZE(connlimit_mt_reg)); } module_init(connlimit_mt_init); module_exit(connlimit_mt_exit); MODULE_AUTHOR("Jan Engelhardt <jengelh@medozas.de>"); MODULE_DESCRIPTION("Xtables: Number of connections matching"); MODULE_LICENSE("GPL"); MODULE_ALIAS("ipt_connlimit"); MODULE_ALIAS("ip6t_connlimit");
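/*
 * Illustrative sketch (not part of xt_connlimit.c): the conncount key built
 * by connlimit_mt() above is just the masked source or destination address
 * followed by the conntrack zone id -- five u32 words for IPv6, two for IPv4,
 * matching the keylen computed in connlimit_mt_check().  The standalone
 * helper below lays out the IPv4 variant; mock_* names and the sample values
 * are assumptions for the example (real code uses network-byte-order
 * addresses).
 */
#include <stdint.h>

#define MOCK_CONNLIMIT_DADDR	0x01	/* limit on destination, not source */

static unsigned int mock_build_v4_key(uint32_t key[2], uint32_t saddr,
				      uint32_t daddr, uint32_t mask,
				      uint16_t zone_id, uint8_t flags)
{
	/* Pick the address to limit on, then apply the configured prefix mask. */
	key[0] = (flags & MOCK_CONNLIMIT_DADDR) ? daddr : saddr;
	key[0] &= mask;
	key[1] = zone_id;

	return 2 * sizeof(uint32_t);	/* key length handed to nf_conncount */
}

int main(void)
{
	uint32_t key[2];

	/* Limit per /24 of the source address in the default zone (id 0). */
	mock_build_v4_key(key, /*saddr*/ 0xc0000201, /*daddr*/ 0x08080808,
			  /*mask*/ 0xffffff00, /*zone*/ 0, /*flags*/ 0);
	return (key[0] == 0xc0000200 && key[1] == 0) ? 0 : 1;
}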
// SPDX-License-Identifier: GPL-2.0-only

#include <net/xdp_sock_drv.h>
#include "netlink.h"
#include "common.h"

struct channels_req_info {
	struct ethnl_req_info		base;
};

struct channels_reply_data {
	struct ethnl_reply_data		base;
	struct ethtool_channels		channels;
};

#define CHANNELS_REPDATA(__reply_base) \
	container_of(__reply_base, struct channels_reply_data, base)

const struct nla_policy ethnl_channels_get_policy[] = {
	[ETHTOOL_A_CHANNELS_HEADER] = NLA_POLICY_NESTED(ethnl_header_policy),
};

static int channels_prepare_data(const struct ethnl_req_info *req_base,
				 struct ethnl_reply_data *reply_base,
				 const struct genl_info *info)
{
	struct channels_reply_data *data = CHANNELS_REPDATA(reply_base);
	struct net_device *dev = reply_base->dev;
	int ret;

	if (!dev->ethtool_ops->get_channels)
		return -EOPNOTSUPP;
	ret = ethnl_ops_begin(dev);
	if (ret < 0)
		return ret;
	dev->ethtool_ops->get_channels(dev, &data->channels);
	ethnl_ops_complete(dev);

	return 0;
}

static int channels_reply_size(const struct ethnl_req_info *req_base,
			       const struct ethnl_reply_data *reply_base)
{
	return nla_total_size(sizeof(u32)) +	/* _CHANNELS_RX_MAX */
	       nla_total_size(sizeof(u32)) +	/* _CHANNELS_TX_MAX */
	       nla_total_size(sizeof(u32)) +	/* _CHANNELS_OTHER_MAX */
	       nla_total_size(sizeof(u32)) +	/* _CHANNELS_COMBINED_MAX */
	       nla_total_size(sizeof(u32)) +	/* _CHANNELS_RX_COUNT */
	       nla_total_size(sizeof(u32)) +	/* _CHANNELS_TX_COUNT */
	       nla_total_size(sizeof(u32)) +	/* _CHANNELS_OTHER_COUNT */
	       nla_total_size(sizeof(u32));	/* _CHANNELS_COMBINED_COUNT */
}

static int channels_fill_reply(struct sk_buff *skb,
			       const struct ethnl_req_info *req_base,
			       const struct ethnl_reply_data *reply_base)
{
	const struct channels_reply_data *data = CHANNELS_REPDATA(reply_base);
	const struct ethtool_channels *channels = &data->channels;

	if ((channels->max_rx &&
	     (nla_put_u32(skb, ETHTOOL_A_CHANNELS_RX_MAX,
			  channels->max_rx) ||
	      nla_put_u32(skb, ETHTOOL_A_CHANNELS_RX_COUNT,
			  channels->rx_count))) ||
	    (channels->max_tx &&
	     (nla_put_u32(skb, ETHTOOL_A_CHANNELS_TX_MAX,
			  channels->max_tx) ||
	      nla_put_u32(skb, ETHTOOL_A_CHANNELS_TX_COUNT,
			  channels->tx_count))) ||
	    (channels->max_other &&
	     (nla_put_u32(skb, ETHTOOL_A_CHANNELS_OTHER_MAX,
			  channels->max_other) ||
	      nla_put_u32(skb, ETHTOOL_A_CHANNELS_OTHER_COUNT,
			  channels->other_count))) ||
	    (channels->max_combined &&
	     (nla_put_u32(skb, ETHTOOL_A_CHANNELS_COMBINED_MAX,
			  channels->max_combined) ||
	      nla_put_u32(skb, ETHTOOL_A_CHANNELS_COMBINED_COUNT,
			  channels->combined_count))))
		return -EMSGSIZE;

	return 0;
}

/* CHANNELS_SET */

const struct nla_policy ethnl_channels_set_policy[] = {
	[ETHTOOL_A_CHANNELS_HEADER] = NLA_POLICY_NESTED(ethnl_header_policy),
	[ETHTOOL_A_CHANNELS_RX_COUNT] = { .type = NLA_U32 },
	[ETHTOOL_A_CHANNELS_TX_COUNT] = { .type = NLA_U32 },
	[ETHTOOL_A_CHANNELS_OTHER_COUNT] = { .type = NLA_U32 },
[ETHTOOL_A_CHANNELS_COMBINED_COUNT] = { .type = NLA_U32 }, }; static int ethnl_set_channels_validate(struct ethnl_req_info *req_info, struct genl_info *info) { const struct ethtool_ops *ops = req_info->dev->ethtool_ops; return ops->get_channels && ops->set_channels ? 1 : -EOPNOTSUPP; } static int ethnl_set_channels(struct ethnl_req_info *req_info, struct genl_info *info) { unsigned int from_channel, old_total, i; bool mod = false, mod_combined = false; struct net_device *dev = req_info->dev; struct ethtool_channels channels = {}; struct nlattr **tb = info->attrs; u32 err_attr; int ret; dev->ethtool_ops->get_channels(dev, &channels); old_total = channels.combined_count + max(channels.rx_count, channels.tx_count); ethnl_update_u32(&channels.rx_count, tb[ETHTOOL_A_CHANNELS_RX_COUNT], &mod); ethnl_update_u32(&channels.tx_count, tb[ETHTOOL_A_CHANNELS_TX_COUNT], &mod); ethnl_update_u32(&channels.other_count, tb[ETHTOOL_A_CHANNELS_OTHER_COUNT], &mod); ethnl_update_u32(&channels.combined_count, tb[ETHTOOL_A_CHANNELS_COMBINED_COUNT], &mod_combined); mod |= mod_combined; if (!mod) return 0; /* ensure new channel counts are within limits */ if (channels.rx_count > channels.max_rx) err_attr = ETHTOOL_A_CHANNELS_RX_COUNT; else if (channels.tx_count > channels.max_tx) err_attr = ETHTOOL_A_CHANNELS_TX_COUNT; else if (channels.other_count > channels.max_other) err_attr = ETHTOOL_A_CHANNELS_OTHER_COUNT; else if (channels.combined_count > channels.max_combined) err_attr = ETHTOOL_A_CHANNELS_COMBINED_COUNT; else err_attr = 0; if (err_attr) { NL_SET_ERR_MSG_ATTR(info->extack, tb[err_attr], "requested channel count exceeds maximum"); return -EINVAL; } /* ensure there is at least one RX and one TX channel */ if (!channels.combined_count && !channels.rx_count) err_attr = ETHTOOL_A_CHANNELS_RX_COUNT; else if (!channels.combined_count && !channels.tx_count) err_attr = ETHTOOL_A_CHANNELS_TX_COUNT; else err_attr = 0; if (err_attr) { if (mod_combined) err_attr = ETHTOOL_A_CHANNELS_COMBINED_COUNT; NL_SET_ERR_MSG_ATTR(info->extack, tb[err_attr], "requested channel counts would result in no RX or TX channel being configured"); return -EINVAL; } ret = ethtool_check_max_channel(dev, channels, info); if (ret) return ret; /* Disabling channels, query zero-copy AF_XDP sockets */ from_channel = channels.combined_count + min(channels.rx_count, channels.tx_count); for (i = from_channel; i < old_total; i++) if (xsk_get_pool_from_qid(dev, i)) { GENL_SET_ERR_MSG(info, "requested channel counts are too low for existing zerocopy AF_XDP sockets"); return -EINVAL; } ret = dev->ethtool_ops->set_channels(dev, &channels); return ret < 0 ? ret : 1; } const struct ethnl_request_ops ethnl_channels_request_ops = { .request_cmd = ETHTOOL_MSG_CHANNELS_GET, .reply_cmd = ETHTOOL_MSG_CHANNELS_GET_REPLY, .hdr_attr = ETHTOOL_A_CHANNELS_HEADER, .req_info_size = sizeof(struct channels_req_info), .reply_data_size = sizeof(struct channels_reply_data), .prepare_data = channels_prepare_data, .reply_size = channels_reply_size, .fill_reply = channels_fill_reply, .set_validate = ethnl_set_channels_validate, .set = ethnl_set_channels, .set_ntf_cmd = ETHTOOL_MSG_CHANNELS_NTF, };
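Two derived quantities drive the AF_XDP safety check in ethnl_set_channels(): old_total = combined_count + max(rx_count, tx_count) before the update, and from_channel = combined_count + min(rx_count, tx_count) after it; every queue id in [from_channel, old_total) is probed with xsk_get_pool_from_qid(). Below is a standalone sketch of which queue ids end up being checked when channels shrink (plain userspace C with invented counts; the helpers are local stand-ins, not the kernel API):

/* Illustration of the queue-id range ethnl_set_channels() probes for
 * zero-copy AF_XDP sockets when channel counts shrink. The numbers are
 * made up and xsk_get_pool_from_qid() is replaced by a printout of the
 * indices that would be queried.
 */
#include <stdio.h>

static unsigned int max_u(unsigned int a, unsigned int b) { return a > b ? a : b; }
static unsigned int min_u(unsigned int a, unsigned int b) { return a < b ? a : b; }

int main(void)
{
	/* old configuration: 4 combined + 2 rx + 2 tx channels */
	unsigned int old_combined = 4, old_rx = 2, old_tx = 2;
	/* requested configuration: 2 combined + 1 rx + 1 tx channels */
	unsigned int new_combined = 2, new_rx = 1, new_tx = 1;

	unsigned int old_total = old_combined + max_u(old_rx, old_tx);
	unsigned int from_channel = new_combined + min_u(new_rx, new_tx);

	/* Queue ids in [from_channel, old_total) stop being usable, so they
	 * must not have zero-copy AF_XDP sockets bound to them.
	 */
	for (unsigned int i = from_channel; i < old_total; i++)
		printf("would check qid %u for an XSK pool\n", i);

	return 0;
}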
// SPDX-License-Identifier: GPL-2.0-only /* * net/sched/sch_mqprio.c * * Copyright (c) 2010 John Fastabend <john.r.fastabend@intel.com> */ #include <linux/ethtool_netlink.h> #include <linux/types.h> #include <linux/slab.h> #include <linux/kernel.h> #include <linux/string.h> #include <linux/errno.h> #include <linux/skbuff.h> #include <linux/module.h> #include <net/netlink.h> #include <net/pkt_sched.h> #include <net/sch_generic.h> #include <net/pkt_cls.h> #include
"sch_mqprio_lib.h" struct mqprio_sched { struct Qdisc **qdiscs; u16 mode; u16 shaper; int hw_offload; u32 flags; u64 min_rate[TC_QOPT_MAX_QUEUE]; u64 max_rate[TC_QOPT_MAX_QUEUE]; u32 fp[TC_QOPT_MAX_QUEUE]; }; static int mqprio_enable_offload(struct Qdisc *sch, const struct tc_mqprio_qopt *qopt, struct netlink_ext_ack *extack) { struct mqprio_sched *priv = qdisc_priv(sch); struct net_device *dev = qdisc_dev(sch); struct tc_mqprio_qopt_offload mqprio = { .qopt = *qopt, .extack = extack, }; int err, i; switch (priv->mode) { case TC_MQPRIO_MODE_DCB: if (priv->shaper != TC_MQPRIO_SHAPER_DCB) return -EINVAL; break; case TC_MQPRIO_MODE_CHANNEL: mqprio.flags = priv->flags; if (priv->flags & TC_MQPRIO_F_MODE) mqprio.mode = priv->mode; if (priv->flags & TC_MQPRIO_F_SHAPER) mqprio.shaper = priv->shaper; if (priv->flags & TC_MQPRIO_F_MIN_RATE) for (i = 0; i < mqprio.qopt.num_tc; i++) mqprio.min_rate[i] = priv->min_rate[i]; if (priv->flags & TC_MQPRIO_F_MAX_RATE) for (i = 0; i < mqprio.qopt.num_tc; i++) mqprio.max_rate[i] = priv->max_rate[i]; break; default: return -EINVAL; } mqprio_fp_to_offload(priv->fp, &mqprio); err = dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_MQPRIO, &mqprio); if (err) return err; priv->hw_offload = mqprio.qopt.hw; return 0; } static void mqprio_disable_offload(struct Qdisc *sch) { struct tc_mqprio_qopt_offload mqprio = { { 0 } }; struct mqprio_sched *priv = qdisc_priv(sch); struct net_device *dev = qdisc_dev(sch); switch (priv->mode) { case TC_MQPRIO_MODE_DCB: case TC_MQPRIO_MODE_CHANNEL: dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_MQPRIO, &mqprio); break; } } static void mqprio_destroy(struct Qdisc *sch) { struct net_device *dev = qdisc_dev(sch); struct mqprio_sched *priv = qdisc_priv(sch); unsigned int ntx; if (priv->qdiscs) { for (ntx = 0; ntx < dev->num_tx_queues && priv->qdiscs[ntx]; ntx++) qdisc_put(priv->qdiscs[ntx]); kfree(priv->qdiscs); } if (priv->hw_offload && dev->netdev_ops->ndo_setup_tc) mqprio_disable_offload(sch); else netdev_set_num_tc(dev, 0); } static int mqprio_parse_opt(struct net_device *dev, struct tc_mqprio_qopt *qopt, const struct tc_mqprio_caps *caps, struct netlink_ext_ack *extack) { int err; /* Limit qopt->hw to maximum supported offload value. Drivers have * the option of overriding this later if they don't support the a * given offload type. */ if (qopt->hw > TC_MQPRIO_HW_OFFLOAD_MAX) qopt->hw = TC_MQPRIO_HW_OFFLOAD_MAX; /* If hardware offload is requested, we will leave 3 options to the * device driver: * - populate the queue counts itself (and ignore what was requested) * - validate the provided queue counts by itself (and apply them) * - request queue count validation here (and apply them) */ err = mqprio_validate_qopt(dev, qopt, !qopt->hw || caps->validate_queue_counts, false, extack); if (err) return err; /* If ndo_setup_tc is not present then hardware doesn't support offload * and we should return an error. 
*/ if (qopt->hw && !dev->netdev_ops->ndo_setup_tc) { NL_SET_ERR_MSG(extack, "Device does not support hardware offload"); return -EINVAL; } return 0; } static const struct nla_policy mqprio_tc_entry_policy[TCA_MQPRIO_TC_ENTRY_MAX + 1] = { [TCA_MQPRIO_TC_ENTRY_INDEX] = NLA_POLICY_MAX(NLA_U32, TC_QOPT_MAX_QUEUE - 1), [TCA_MQPRIO_TC_ENTRY_FP] = NLA_POLICY_RANGE(NLA_U32, TC_FP_EXPRESS, TC_FP_PREEMPTIBLE), }; static const struct nla_policy mqprio_policy[TCA_MQPRIO_MAX + 1] = { [TCA_MQPRIO_MODE] = { .len = sizeof(u16) }, [TCA_MQPRIO_SHAPER] = { .len = sizeof(u16) }, [TCA_MQPRIO_MIN_RATE64] = { .type = NLA_NESTED }, [TCA_MQPRIO_MAX_RATE64] = { .type = NLA_NESTED }, [TCA_MQPRIO_TC_ENTRY] = { .type = NLA_NESTED }, }; static int mqprio_parse_tc_entry(u32 fp[TC_QOPT_MAX_QUEUE], struct nlattr *opt, unsigned long *seen_tcs, struct netlink_ext_ack *extack) { struct nlattr *tb[TCA_MQPRIO_TC_ENTRY_MAX + 1]; int err, tc; err = nla_parse_nested(tb, TCA_MQPRIO_TC_ENTRY_MAX, opt, mqprio_tc_entry_policy, extack); if (err < 0) return err; if (NL_REQ_ATTR_CHECK(extack, opt, tb, TCA_MQPRIO_TC_ENTRY_INDEX)) { NL_SET_ERR_MSG(extack, "TC entry index missing"); return -EINVAL; } tc = nla_get_u32(tb[TCA_MQPRIO_TC_ENTRY_INDEX]); if (*seen_tcs & BIT(tc)) { NL_SET_ERR_MSG_ATTR(extack, tb[TCA_MQPRIO_TC_ENTRY_INDEX], "Duplicate tc entry"); return -EINVAL; } *seen_tcs |= BIT(tc); if (tb[TCA_MQPRIO_TC_ENTRY_FP]) fp[tc] = nla_get_u32(tb[TCA_MQPRIO_TC_ENTRY_FP]); return 0; } static int mqprio_parse_tc_entries(struct Qdisc *sch, struct nlattr *nlattr_opt, int nlattr_opt_len, struct netlink_ext_ack *extack) { struct mqprio_sched *priv = qdisc_priv(sch); struct net_device *dev = qdisc_dev(sch); bool have_preemption = false; unsigned long seen_tcs = 0; u32 fp[TC_QOPT_MAX_QUEUE]; struct nlattr *n; int tc, rem; int err = 0; for (tc = 0; tc < TC_QOPT_MAX_QUEUE; tc++) fp[tc] = priv->fp[tc]; nla_for_each_attr_type(n, TCA_MQPRIO_TC_ENTRY, nlattr_opt, nlattr_opt_len, rem) { err = mqprio_parse_tc_entry(fp, n, &seen_tcs, extack); if (err) goto out; } for (tc = 0; tc < TC_QOPT_MAX_QUEUE; tc++) { priv->fp[tc] = fp[tc]; if (fp[tc] == TC_FP_PREEMPTIBLE) have_preemption = true; } if (have_preemption && !ethtool_dev_mm_supported(dev)) { NL_SET_ERR_MSG(extack, "Device does not support preemption"); return -EOPNOTSUPP; } out: return err; } /* Parse the other netlink attributes that represent the payload of * TCA_OPTIONS, which are appended right after struct tc_mqprio_qopt. 
*/ static int mqprio_parse_nlattr(struct Qdisc *sch, struct tc_mqprio_qopt *qopt, struct nlattr *opt, struct netlink_ext_ack *extack) { struct nlattr *nlattr_opt = nla_data(opt) + NLA_ALIGN(sizeof(*qopt)); int nlattr_opt_len = nla_len(opt) - NLA_ALIGN(sizeof(*qopt)); struct mqprio_sched *priv = qdisc_priv(sch); struct nlattr *tb[TCA_MQPRIO_MAX + 1] = {}; struct nlattr *attr; int i, rem, err; if (nlattr_opt_len >= nla_attr_size(0)) { err = nla_parse_deprecated(tb, TCA_MQPRIO_MAX, nlattr_opt, nlattr_opt_len, mqprio_policy, NULL); if (err < 0) return err; } if (!qopt->hw) { NL_SET_ERR_MSG(extack, "mqprio TCA_OPTIONS can only contain netlink attributes in hardware mode"); return -EINVAL; } if (tb[TCA_MQPRIO_MODE]) { priv->flags |= TC_MQPRIO_F_MODE; priv->mode = nla_get_u16(tb[TCA_MQPRIO_MODE]); } if (tb[TCA_MQPRIO_SHAPER]) { priv->flags |= TC_MQPRIO_F_SHAPER; priv->shaper = nla_get_u16(tb[TCA_MQPRIO_SHAPER]); } if (tb[TCA_MQPRIO_MIN_RATE64]) { if (priv->shaper != TC_MQPRIO_SHAPER_BW_RATE) { NL_SET_ERR_MSG_ATTR(extack, tb[TCA_MQPRIO_MIN_RATE64], "min_rate accepted only when shaper is in bw_rlimit mode"); return -EINVAL; } i = 0; nla_for_each_nested(attr, tb[TCA_MQPRIO_MIN_RATE64], rem) { if (nla_type(attr) != TCA_MQPRIO_MIN_RATE64) { NL_SET_ERR_MSG_ATTR(extack, attr, "Attribute type expected to be TCA_MQPRIO_MIN_RATE64"); return -EINVAL; } if (nla_len(attr) != sizeof(u64)) { NL_SET_ERR_MSG_ATTR(extack, attr, "Attribute TCA_MQPRIO_MIN_RATE64 expected to have 8 bytes length"); return -EINVAL; } if (i >= qopt->num_tc) break; priv->min_rate[i] = nla_get_u64(attr); i++; } priv->flags |= TC_MQPRIO_F_MIN_RATE; } if (tb[TCA_MQPRIO_MAX_RATE64]) { if (priv->shaper != TC_MQPRIO_SHAPER_BW_RATE) { NL_SET_ERR_MSG_ATTR(extack, tb[TCA_MQPRIO_MAX_RATE64], "max_rate accepted only when shaper is in bw_rlimit mode"); return -EINVAL; } i = 0; nla_for_each_nested(attr, tb[TCA_MQPRIO_MAX_RATE64], rem) { if (nla_type(attr) != TCA_MQPRIO_MAX_RATE64) { NL_SET_ERR_MSG_ATTR(extack, attr, "Attribute type expected to be TCA_MQPRIO_MAX_RATE64"); return -EINVAL; } if (nla_len(attr) != sizeof(u64)) { NL_SET_ERR_MSG_ATTR(extack, attr, "Attribute TCA_MQPRIO_MAX_RATE64 expected to have 8 bytes length"); return -EINVAL; } if (i >= qopt->num_tc) break; priv->max_rate[i] = nla_get_u64(attr); i++; } priv->flags |= TC_MQPRIO_F_MAX_RATE; } if (tb[TCA_MQPRIO_TC_ENTRY]) { err = mqprio_parse_tc_entries(sch, nlattr_opt, nlattr_opt_len, extack); if (err) return err; } return 0; } static int mqprio_init(struct Qdisc *sch, struct nlattr *opt, struct netlink_ext_ack *extack) { struct net_device *dev = qdisc_dev(sch); struct mqprio_sched *priv = qdisc_priv(sch); struct netdev_queue *dev_queue; struct Qdisc *qdisc; int i, err = -EOPNOTSUPP; struct tc_mqprio_qopt *qopt = NULL; struct tc_mqprio_caps caps; int len, tc; BUILD_BUG_ON(TC_MAX_QUEUE != TC_QOPT_MAX_QUEUE); BUILD_BUG_ON(TC_BITMASK != TC_QOPT_BITMASK); if (sch->parent != TC_H_ROOT) return -EOPNOTSUPP; if (!netif_is_multiqueue(dev)) return -EOPNOTSUPP; /* make certain can allocate enough classids to handle queues */ if (dev->num_tx_queues >= TC_H_MIN_PRIORITY) return -ENOMEM; if (!opt || nla_len(opt) < sizeof(*qopt)) return -EINVAL; for (tc = 0; tc < TC_QOPT_MAX_QUEUE; tc++) priv->fp[tc] = TC_FP_EXPRESS; qdisc_offload_query_caps(dev, TC_SETUP_QDISC_MQPRIO, &caps, sizeof(caps)); qopt = nla_data(opt); if (mqprio_parse_opt(dev, qopt, &caps, extack)) return -EINVAL; len = nla_len(opt) - NLA_ALIGN(sizeof(*qopt)); if (len > 0) { err = mqprio_parse_nlattr(sch, qopt, opt, extack); if (err) return 
err; } /* pre-allocate qdisc, attachment can't fail */ priv->qdiscs = kcalloc(dev->num_tx_queues, sizeof(priv->qdiscs[0]), GFP_KERNEL); if (!priv->qdiscs) return -ENOMEM; for (i = 0; i < dev->num_tx_queues; i++) { dev_queue = netdev_get_tx_queue(dev, i); qdisc = qdisc_create_dflt(dev_queue, get_default_qdisc_ops(dev, i), TC_H_MAKE(TC_H_MAJ(sch->handle), TC_H_MIN(i + 1)), extack); if (!qdisc) return -ENOMEM; priv->qdiscs[i] = qdisc; qdisc->flags |= TCQ_F_ONETXQUEUE | TCQ_F_NOPARENT; } /* If the mqprio options indicate that hardware should own * the queue mapping then run ndo_setup_tc otherwise use the * supplied and verified mapping */ if (qopt->hw) { err = mqprio_enable_offload(sch, qopt, extack); if (err) return err; } else { netdev_set_num_tc(dev, qopt->num_tc); for (i = 0; i < qopt->num_tc; i++) netdev_set_tc_queue(dev, i, qopt->count[i], qopt->offset[i]); } /* Always use supplied priority mappings */ for (i = 0; i < TC_BITMASK + 1; i++) netdev_set_prio_tc_map(dev, i, qopt->prio_tc_map[i]); sch->flags |= TCQ_F_MQROOT; return 0; } static void mqprio_attach(struct Qdisc *sch) { struct net_device *dev = qdisc_dev(sch); struct mqprio_sched *priv = qdisc_priv(sch); struct Qdisc *qdisc, *old; unsigned int ntx; /* Attach underlying qdisc */ for (ntx = 0; ntx < dev->num_tx_queues; ntx++) { qdisc = priv->qdiscs[ntx]; old = dev_graft_qdisc(qdisc->dev_queue, qdisc); if (old) qdisc_put(old); if (ntx < dev->real_num_tx_queues) qdisc_hash_add(qdisc, false); } kfree(priv->qdiscs); priv->qdiscs = NULL; } static struct netdev_queue *mqprio_queue_get(struct Qdisc *sch, unsigned long cl) { struct net_device *dev = qdisc_dev(sch); unsigned long ntx = cl - 1; if (ntx >= dev->num_tx_queues) return NULL; return netdev_get_tx_queue(dev, ntx); } static int mqprio_graft(struct Qdisc *sch, unsigned long cl, struct Qdisc *new, struct Qdisc **old, struct netlink_ext_ack *extack) { struct net_device *dev = qdisc_dev(sch); struct netdev_queue *dev_queue = mqprio_queue_get(sch, cl); if (!dev_queue) return -EINVAL; if (dev->flags & IFF_UP) dev_deactivate(dev); *old = dev_graft_qdisc(dev_queue, new); if (new) new->flags |= TCQ_F_ONETXQUEUE | TCQ_F_NOPARENT; if (dev->flags & IFF_UP) dev_activate(dev); return 0; } static int dump_rates(struct mqprio_sched *priv, struct tc_mqprio_qopt *opt, struct sk_buff *skb) { struct nlattr *nest; int i; if (priv->flags & TC_MQPRIO_F_MIN_RATE) { nest = nla_nest_start_noflag(skb, TCA_MQPRIO_MIN_RATE64); if (!nest) goto nla_put_failure; for (i = 0; i < opt->num_tc; i++) { if (nla_put(skb, TCA_MQPRIO_MIN_RATE64, sizeof(priv->min_rate[i]), &priv->min_rate[i])) goto nla_put_failure; } nla_nest_end(skb, nest); } if (priv->flags & TC_MQPRIO_F_MAX_RATE) { nest = nla_nest_start_noflag(skb, TCA_MQPRIO_MAX_RATE64); if (!nest) goto nla_put_failure; for (i = 0; i < opt->num_tc; i++) { if (nla_put(skb, TCA_MQPRIO_MAX_RATE64, sizeof(priv->max_rate[i]), &priv->max_rate[i])) goto nla_put_failure; } nla_nest_end(skb, nest); } return 0; nla_put_failure: nla_nest_cancel(skb, nest); return -1; } static int mqprio_dump_tc_entries(struct mqprio_sched *priv, struct sk_buff *skb) { struct nlattr *n; int tc; for (tc = 0; tc < TC_QOPT_MAX_QUEUE; tc++) { n = nla_nest_start(skb, TCA_MQPRIO_TC_ENTRY); if (!n) return -EMSGSIZE; if (nla_put_u32(skb, TCA_MQPRIO_TC_ENTRY_INDEX, tc)) goto nla_put_failure; if (nla_put_u32(skb, TCA_MQPRIO_TC_ENTRY_FP, priv->fp[tc])) goto nla_put_failure; nla_nest_end(skb, n); } return 0; nla_put_failure: nla_nest_cancel(skb, n); return -EMSGSIZE; } static int mqprio_dump(struct Qdisc *sch, 
struct sk_buff *skb) { struct net_device *dev = qdisc_dev(sch); struct mqprio_sched *priv = qdisc_priv(sch); struct nlattr *nla = (struct nlattr *)skb_tail_pointer(skb); struct tc_mqprio_qopt opt = { 0 }; struct Qdisc *qdisc; unsigned int ntx; sch->q.qlen = 0; gnet_stats_basic_sync_init(&sch->bstats); memset(&sch->qstats, 0, sizeof(sch->qstats)); /* MQ supports lockless qdiscs. However, statistics accounting needs * to account for all, none, or a mix of locked and unlocked child * qdiscs. Percpu stats are added to counters in-band and locking * qdisc totals are added at end. */ for (ntx = 0; ntx < dev->num_tx_queues; ntx++) { qdisc = rtnl_dereference(netdev_get_tx_queue(dev, ntx)->qdisc_sleeping); spin_lock_bh(qdisc_lock(qdisc)); gnet_stats_add_basic(&sch->bstats, qdisc->cpu_bstats, &qdisc->bstats, false); gnet_stats_add_queue(&sch->qstats, qdisc->cpu_qstats, &qdisc->qstats); sch->q.qlen += qdisc_qlen(qdisc); spin_unlock_bh(qdisc_lock(qdisc)); } mqprio_qopt_reconstruct(dev, &opt); opt.hw = priv->hw_offload; if (nla_put(skb, TCA_OPTIONS, sizeof(opt), &opt)) goto nla_put_failure; if ((priv->flags & TC_MQPRIO_F_MODE) && nla_put_u16(skb, TCA_MQPRIO_MODE, priv->mode)) goto nla_put_failure; if ((priv->flags & TC_MQPRIO_F_SHAPER) && nla_put_u16(skb, TCA_MQPRIO_SHAPER, priv->shaper)) goto nla_put_failure; if ((priv->flags & TC_MQPRIO_F_MIN_RATE || priv->flags & TC_MQPRIO_F_MAX_RATE) && (dump_rates(priv, &opt, skb) != 0)) goto nla_put_failure; if (mqprio_dump_tc_entries(priv, skb)) goto nla_put_failure; return nla_nest_end(skb, nla); nla_put_failure: nlmsg_trim(skb, nla); return -1; } static struct Qdisc *mqprio_leaf(struct Qdisc *sch, unsigned long cl) { struct netdev_queue *dev_queue = mqprio_queue_get(sch, cl); if (!dev_queue) return NULL; return rtnl_dereference(dev_queue->qdisc_sleeping); } static unsigned long mqprio_find(struct Qdisc *sch, u32 classid) { struct net_device *dev = qdisc_dev(sch); unsigned int ntx = TC_H_MIN(classid); /* There are essentially two regions here that have valid classid * values. The first region will have a classid value of 1 through * num_tx_queues. All of these are backed by actual Qdiscs. */ if (ntx < TC_H_MIN_PRIORITY) return (ntx <= dev->num_tx_queues) ? ntx : 0; /* The second region represents the hardware traffic classes. These * are represented by classid values of TC_H_MIN_PRIORITY through * TC_H_MIN_PRIORITY + netdev_get_num_tc - 1 */ return ((ntx - TC_H_MIN_PRIORITY) < netdev_get_num_tc(dev)) ? ntx : 0; } static int mqprio_dump_class(struct Qdisc *sch, unsigned long cl, struct sk_buff *skb, struct tcmsg *tcm) { if (cl < TC_H_MIN_PRIORITY) { struct netdev_queue *dev_queue = mqprio_queue_get(sch, cl); struct net_device *dev = qdisc_dev(sch); int tc = netdev_txq_to_tc(dev, cl - 1); tcm->tcm_parent = (tc < 0) ? 
0 : TC_H_MAKE(TC_H_MAJ(sch->handle), TC_H_MIN(tc + TC_H_MIN_PRIORITY)); tcm->tcm_info = rtnl_dereference(dev_queue->qdisc_sleeping)->handle; } else { tcm->tcm_parent = TC_H_ROOT; tcm->tcm_info = 0; } tcm->tcm_handle |= TC_H_MIN(cl); return 0; } static int mqprio_dump_class_stats(struct Qdisc *sch, unsigned long cl, struct gnet_dump *d) __releases(d->lock) __acquires(d->lock) { if (cl >= TC_H_MIN_PRIORITY) { int i; __u32 qlen; struct gnet_stats_queue qstats = {0}; struct gnet_stats_basic_sync bstats; struct net_device *dev = qdisc_dev(sch); struct netdev_tc_txq tc = dev->tc_to_txq[cl & TC_BITMASK]; gnet_stats_basic_sync_init(&bstats); /* Drop lock here it will be reclaimed before touching * statistics this is required because the d->lock we * hold here is the look on dev_queue->qdisc_sleeping * also acquired below. */ if (d->lock) spin_unlock_bh(d->lock); for (i = tc.offset; i < tc.offset + tc.count; i++) { struct netdev_queue *q = netdev_get_tx_queue(dev, i); struct Qdisc *qdisc = rtnl_dereference(q->qdisc); spin_lock_bh(qdisc_lock(qdisc)); gnet_stats_add_basic(&bstats, qdisc->cpu_bstats, &qdisc->bstats, false); gnet_stats_add_queue(&qstats, qdisc->cpu_qstats, &qdisc->qstats); sch->q.qlen += qdisc_qlen(qdisc); spin_unlock_bh(qdisc_lock(qdisc)); } qlen = qdisc_qlen(sch) + qstats.qlen; /* Reclaim root sleeping lock before completing stats */ if (d->lock) spin_lock_bh(d->lock); if (gnet_stats_copy_basic(d, NULL, &bstats, false) < 0 || gnet_stats_copy_queue(d, NULL, &qstats, qlen) < 0) return -1; } else { struct netdev_queue *dev_queue = mqprio_queue_get(sch, cl); sch = rtnl_dereference(dev_queue->qdisc_sleeping); if (gnet_stats_copy_basic(d, sch->cpu_bstats, &sch->bstats, true) < 0 || qdisc_qstats_copy(d, sch) < 0) return -1; } return 0; } static void mqprio_walk(struct Qdisc *sch, struct qdisc_walker *arg) { struct net_device *dev = qdisc_dev(sch); unsigned long ntx; if (arg->stop) return; /* Walk hierarchy with a virtual class per tc */ arg->count = arg->skip; for (ntx = arg->skip; ntx < netdev_get_num_tc(dev); ntx++) { if (!tc_qdisc_stats_dump(sch, ntx + TC_H_MIN_PRIORITY, arg)) return; } /* Pad the values and skip over unused traffic classes */ if (ntx < TC_MAX_QUEUE) { arg->count = TC_MAX_QUEUE; ntx = TC_MAX_QUEUE; } /* Reset offset, sort out remaining per-queue qdiscs */ for (ntx -= TC_MAX_QUEUE; ntx < dev->num_tx_queues; ntx++) { if (arg->fn(sch, ntx + 1, arg) < 0) { arg->stop = 1; return; } arg->count++; } } static struct netdev_queue *mqprio_select_queue(struct Qdisc *sch, struct tcmsg *tcm) { return mqprio_queue_get(sch, TC_H_MIN(tcm->tcm_parent)); } static const struct Qdisc_class_ops mqprio_class_ops = { .graft = mqprio_graft, .leaf = mqprio_leaf, .find = mqprio_find, .walk = mqprio_walk, .dump = mqprio_dump_class, .dump_stats = mqprio_dump_class_stats, .select_queue = mqprio_select_queue, }; static struct Qdisc_ops mqprio_qdisc_ops __read_mostly = { .cl_ops = &mqprio_class_ops, .id = "mqprio", .priv_size = sizeof(struct mqprio_sched), .init = mqprio_init, .destroy = mqprio_destroy, .attach = mqprio_attach, .change_real_num_tx = mq_change_real_num_tx, .dump = mqprio_dump, .owner = THIS_MODULE, }; MODULE_ALIAS_NET_SCH("mqprio"); static int __init mqprio_module_init(void) { return register_qdisc(&mqprio_qdisc_ops); } static void __exit mqprio_module_exit(void) { unregister_qdisc(&mqprio_qdisc_ops); } module_init(mqprio_module_init); module_exit(mqprio_module_exit); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("Classful multiqueue prio qdisc");
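mqprio installs a two-level mapping on the device: skb priority to traffic class via netdev_set_prio_tc_map(), and traffic class to a contiguous queue range via netdev_set_tc_queue() using the count/offset pairs from struct tc_mqprio_qopt. The sketch below shows how a priority resolves to its queue range under that scheme (userspace C; the array contents are invented for illustration and do not come from the file above):

/* Userspace illustration of the two-level mqprio mapping:
 * skb priority -> traffic class -> [offset, offset + count) TX queues.
 * The arrays mirror the roles of prio_tc_map / count / offset in
 * struct tc_mqprio_qopt, but the values here are made up.
 */
#include <stdio.h>

#define NUM_PRIO 16

int main(void)
{
	/* example: 3 traffic classes spread over 8 TX queues */
	unsigned char prio_tc_map[NUM_PRIO] = {
		0, 0, 1, 1, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
	};
	unsigned int count[3]  = { 4, 2, 2 };	/* queues per tc */
	unsigned int offset[3] = { 0, 4, 6 };	/* first queue of each tc */

	for (unsigned int prio = 0; prio < 6; prio++) {
		unsigned int tc = prio_tc_map[prio];

		printf("prio %u -> tc %u -> queues %u..%u\n",
		       prio, tc, offset[tc], offset[tc] + count[tc] - 1);
	}
	return 0;
}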
// SPDX-License-Identifier: GPL-2.0 #include <linux/bpf.h> #include <linux/bpf-netns.h> #include <linux/filter.h> #include <net/net_namespace.h> /* * Functions to manage BPF programs attached to netns */ struct bpf_netns_link { struct bpf_link link; /* We don't hold a ref to net in order to auto-detach the link * when netns is going away. Instead we rely on pernet * pre_exit callback to clear this pointer. Must be accessed * with netns_bpf_mutex held. */ struct net *net; struct list_head node; /* node in list of links attached to net */ enum netns_bpf_attach_type netns_type; }; /* Protects updates to netns_bpf */ DEFINE_MUTEX(netns_bpf_mutex); static void netns_bpf_attach_type_unneed(enum netns_bpf_attach_type type) { switch (type) { #ifdef CONFIG_INET case NETNS_BPF_SK_LOOKUP: static_branch_dec(&bpf_sk_lookup_enabled); break; #endif default: break; } } static void netns_bpf_attach_type_need(enum netns_bpf_attach_type type) { switch (type) { #ifdef CONFIG_INET case NETNS_BPF_SK_LOOKUP: static_branch_inc(&bpf_sk_lookup_enabled); break; #endif default: break; } } /* Must be called with netns_bpf_mutex held.
*/ static void netns_bpf_run_array_detach(struct net *net, enum netns_bpf_attach_type type) { struct bpf_prog_array *run_array; run_array = rcu_replace_pointer(net->bpf.run_array[type], NULL, lockdep_is_held(&netns_bpf_mutex)); bpf_prog_array_free(run_array); } static int link_index(struct net *net, enum netns_bpf_attach_type type, struct bpf_netns_link *link) { struct bpf_netns_link *pos; int i = 0; list_for_each_entry(pos, &net->bpf.links[type], node) { if (pos == link) return i; i++; } return -ENOENT; } static int link_count(struct net *net, enum netns_bpf_attach_type type) { struct list_head *pos; int i = 0; list_for_each(pos, &net->bpf.links[type]) i++; return i; } static void fill_prog_array(struct net *net, enum netns_bpf_attach_type type, struct bpf_prog_array *prog_array) { struct bpf_netns_link *pos; unsigned int i = 0; list_for_each_entry(pos, &net->bpf.links[type], node) { prog_array->items[i].prog = pos->link.prog; i++; } } static void bpf_netns_link_release(struct bpf_link *link) { struct bpf_netns_link *net_link = container_of(link, struct bpf_netns_link, link); enum netns_bpf_attach_type type = net_link->netns_type; struct bpf_prog_array *old_array, *new_array; struct net *net; int cnt, idx; mutex_lock(&netns_bpf_mutex); /* We can race with cleanup_net, but if we see a non-NULL * struct net pointer, pre_exit has not run yet and wait for * netns_bpf_mutex. */ net = net_link->net; if (!net) goto out_unlock; /* Mark attach point as unused */ netns_bpf_attach_type_unneed(type); /* Remember link position in case of safe delete */ idx = link_index(net, type, net_link); list_del(&net_link->node); cnt = link_count(net, type); if (!cnt) { netns_bpf_run_array_detach(net, type); goto out_unlock; } old_array = rcu_dereference_protected(net->bpf.run_array[type], lockdep_is_held(&netns_bpf_mutex)); new_array = bpf_prog_array_alloc(cnt, GFP_KERNEL); if (!new_array) { WARN_ON(bpf_prog_array_delete_safe_at(old_array, idx)); goto out_unlock; } fill_prog_array(net, type, new_array); rcu_assign_pointer(net->bpf.run_array[type], new_array); bpf_prog_array_free(old_array); out_unlock: net_link->net = NULL; mutex_unlock(&netns_bpf_mutex); } static int bpf_netns_link_detach(struct bpf_link *link) { bpf_netns_link_release(link); return 0; } static void bpf_netns_link_dealloc(struct bpf_link *link) { struct bpf_netns_link *net_link = container_of(link, struct bpf_netns_link, link); kfree(net_link); } static int bpf_netns_link_update_prog(struct bpf_link *link, struct bpf_prog *new_prog, struct bpf_prog *old_prog) { struct bpf_netns_link *net_link = container_of(link, struct bpf_netns_link, link); enum netns_bpf_attach_type type = net_link->netns_type; struct bpf_prog_array *run_array; struct net *net; int idx, ret; if (old_prog && old_prog != link->prog) return -EPERM; if (new_prog->type != link->prog->type) return -EINVAL; mutex_lock(&netns_bpf_mutex); net = net_link->net; if (!net || !check_net(net)) { /* Link auto-detached or netns dying */ ret = -ENOLINK; goto out_unlock; } run_array = rcu_dereference_protected(net->bpf.run_array[type], lockdep_is_held(&netns_bpf_mutex)); idx = link_index(net, type, net_link); ret = bpf_prog_array_update_at(run_array, idx, new_prog); if (ret) goto out_unlock; old_prog = xchg(&link->prog, new_prog); bpf_prog_put(old_prog); out_unlock: mutex_unlock(&netns_bpf_mutex); return ret; } static int bpf_netns_link_fill_info(const struct bpf_link *link, struct bpf_link_info *info) { const struct bpf_netns_link *net_link = container_of(link, struct bpf_netns_link, link); 
unsigned int inum = 0; struct net *net; mutex_lock(&netns_bpf_mutex); net = net_link->net; if (net && check_net(net)) inum = net->ns.inum; mutex_unlock(&netns_bpf_mutex); info->netns.netns_ino = inum; info->netns.attach_type = link->attach_type; return 0; } static void bpf_netns_link_show_fdinfo(const struct bpf_link *link, struct seq_file *seq) { struct bpf_link_info info = {}; bpf_netns_link_fill_info(link, &info); seq_printf(seq, "netns_ino:\t%u\n" "attach_type:\t%u\n", info.netns.netns_ino, link->attach_type); } static const struct bpf_link_ops bpf_netns_link_ops = { .release = bpf_netns_link_release, .dealloc = bpf_netns_link_dealloc, .detach = bpf_netns_link_detach, .update_prog = bpf_netns_link_update_prog, .fill_link_info = bpf_netns_link_fill_info, .show_fdinfo = bpf_netns_link_show_fdinfo, }; /* Must be called with netns_bpf_mutex held. */ static int __netns_bpf_prog_query(const union bpf_attr *attr, union bpf_attr __user *uattr, struct net *net, enum netns_bpf_attach_type type) { __u32 __user *prog_ids = u64_to_user_ptr(attr->query.prog_ids); struct bpf_prog_array *run_array; u32 prog_cnt = 0, flags = 0; run_array = rcu_dereference_protected(net->bpf.run_array[type], lockdep_is_held(&netns_bpf_mutex)); if (run_array) prog_cnt = bpf_prog_array_length(run_array); if (copy_to_user(&uattr->query.attach_flags, &flags, sizeof(flags))) return -EFAULT; if (copy_to_user(&uattr->query.prog_cnt, &prog_cnt, sizeof(prog_cnt))) return -EFAULT; if (!attr->query.prog_cnt || !prog_ids || !prog_cnt) return 0; return bpf_prog_array_copy_to_user(run_array, prog_ids, attr->query.prog_cnt); } int netns_bpf_prog_query(const union bpf_attr *attr, union bpf_attr __user *uattr) { enum netns_bpf_attach_type type; struct net *net; int ret; if (attr->query.query_flags) return -EINVAL; type = to_netns_bpf_attach_type(attr->query.attach_type); if (type < 0) return -EINVAL; net = get_net_ns_by_fd(attr->query.target_fd); if (IS_ERR(net)) return PTR_ERR(net); mutex_lock(&netns_bpf_mutex); ret = __netns_bpf_prog_query(attr, uattr, net, type); mutex_unlock(&netns_bpf_mutex); put_net(net); return ret; } int netns_bpf_prog_attach(const union bpf_attr *attr, struct bpf_prog *prog) { struct bpf_prog_array *run_array; enum netns_bpf_attach_type type; struct bpf_prog *attached; struct net *net; int ret; if (attr->target_fd || attr->attach_flags || attr->replace_bpf_fd) return -EINVAL; type = to_netns_bpf_attach_type(attr->attach_type); if (type < 0) return -EINVAL; net = current->nsproxy->net_ns; mutex_lock(&netns_bpf_mutex); /* Attaching prog directly is not compatible with links */ if (!list_empty(&net->bpf.links[type])) { ret = -EEXIST; goto out_unlock; } switch (type) { case NETNS_BPF_FLOW_DISSECTOR: ret = flow_dissector_bpf_prog_attach_check(net, prog); break; default: ret = -EINVAL; break; } if (ret) goto out_unlock; attached = net->bpf.progs[type]; if (attached == prog) { /* The same program cannot be attached twice */ ret = -EINVAL; goto out_unlock; } run_array = rcu_dereference_protected(net->bpf.run_array[type], lockdep_is_held(&netns_bpf_mutex)); if (run_array) { WRITE_ONCE(run_array->items[0].prog, prog); } else { run_array = bpf_prog_array_alloc(1, GFP_KERNEL); if (!run_array) { ret = -ENOMEM; goto out_unlock; } run_array->items[0].prog = prog; rcu_assign_pointer(net->bpf.run_array[type], run_array); } net->bpf.progs[type] = prog; if (attached) bpf_prog_put(attached); out_unlock: mutex_unlock(&netns_bpf_mutex); return ret; } /* Must be called with netns_bpf_mutex held. 
*/ static int __netns_bpf_prog_detach(struct net *net, enum netns_bpf_attach_type type, struct bpf_prog *old) { struct bpf_prog *attached; /* Progs attached via links cannot be detached */ if (!list_empty(&net->bpf.links[type])) return -EINVAL; attached = net->bpf.progs[type]; if (!attached || attached != old) return -ENOENT; netns_bpf_run_array_detach(net, type); net->bpf.progs[type] = NULL; bpf_prog_put(attached); return 0; } int netns_bpf_prog_detach(const union bpf_attr *attr, enum bpf_prog_type ptype) { enum netns_bpf_attach_type type; struct bpf_prog *prog; int ret; if (attr->target_fd) return -EINVAL; type = to_netns_bpf_attach_type(attr->attach_type); if (type < 0) return -EINVAL; prog = bpf_prog_get_type(attr->attach_bpf_fd, ptype); if (IS_ERR(prog)) return PTR_ERR(prog); mutex_lock(&netns_bpf_mutex); ret = __netns_bpf_prog_detach(current->nsproxy->net_ns, type, prog); mutex_unlock(&netns_bpf_mutex); bpf_prog_put(prog); return ret; } static int netns_bpf_max_progs(enum netns_bpf_attach_type type) { switch (type) { case NETNS_BPF_FLOW_DISSECTOR: return 1; case NETNS_BPF_SK_LOOKUP: return 64; default: return 0; } } static int netns_bpf_link_attach(struct net *net, struct bpf_link *link, enum netns_bpf_attach_type type) { struct bpf_netns_link *net_link = container_of(link, struct bpf_netns_link, link); struct bpf_prog_array *run_array; int cnt, err; mutex_lock(&netns_bpf_mutex); cnt = link_count(net, type); if (cnt >= netns_bpf_max_progs(type)) { err = -E2BIG; goto out_unlock; } /* Links are not compatible with attaching prog directly */ if (net->bpf.progs[type]) { err = -EEXIST; goto out_unlock; } switch (type) { case NETNS_BPF_FLOW_DISSECTOR: err = flow_dissector_bpf_prog_attach_check(net, link->prog); break; case NETNS_BPF_SK_LOOKUP: err = 0; /* nothing to check */ break; default: err = -EINVAL; break; } if (err) goto out_unlock; run_array = bpf_prog_array_alloc(cnt + 1, GFP_KERNEL); if (!run_array) { err = -ENOMEM; goto out_unlock; } list_add_tail(&net_link->node, &net->bpf.links[type]); fill_prog_array(net, type, run_array); run_array = rcu_replace_pointer(net->bpf.run_array[type], run_array, lockdep_is_held(&netns_bpf_mutex)); bpf_prog_array_free(run_array); /* Mark attach point as used */ netns_bpf_attach_type_need(type); out_unlock: mutex_unlock(&netns_bpf_mutex); return err; } int netns_bpf_link_create(const union bpf_attr *attr, struct bpf_prog *prog) { enum netns_bpf_attach_type netns_type; struct bpf_link_primer link_primer; struct bpf_netns_link *net_link; enum bpf_attach_type type; struct net *net; int err; if (attr->link_create.flags) return -EINVAL; type = attr->link_create.attach_type; netns_type = to_netns_bpf_attach_type(type); if (netns_type < 0) return -EINVAL; net = get_net_ns_by_fd(attr->link_create.target_fd); if (IS_ERR(net)) return PTR_ERR(net); net_link = kzalloc(sizeof(*net_link), GFP_USER); if (!net_link) { err = -ENOMEM; goto out_put_net; } bpf_link_init(&net_link->link, BPF_LINK_TYPE_NETNS, &bpf_netns_link_ops, prog, type); net_link->net = net; net_link->netns_type = netns_type; err = bpf_link_prime(&net_link->link, &link_primer); if (err) { kfree(net_link); goto out_put_net; } err = netns_bpf_link_attach(net, &net_link->link, netns_type); if (err) { bpf_link_cleanup(&link_primer); goto out_put_net; } put_net(net); return bpf_link_settle(&link_primer); out_put_net: put_net(net); return err; } static int __net_init netns_bpf_pernet_init(struct net *net) { int type; for (type = 0; type < MAX_NETNS_BPF_ATTACH_TYPE; type++) 
INIT_LIST_HEAD(&net->bpf.links[type]); return 0; } static void __net_exit netns_bpf_pernet_pre_exit(struct net *net) { enum netns_bpf_attach_type type; struct bpf_netns_link *net_link; mutex_lock(&netns_bpf_mutex); for (type = 0; type < MAX_NETNS_BPF_ATTACH_TYPE; type++) { netns_bpf_run_array_detach(net, type); list_for_each_entry(net_link, &net->bpf.links[type], node) { net_link->net = NULL; /* auto-detach link */ netns_bpf_attach_type_unneed(type); } if (net->bpf.progs[type]) bpf_prog_put(net->bpf.progs[type]); } mutex_unlock(&netns_bpf_mutex); } static struct pernet_operations netns_bpf_pernet_ops __net_initdata = { .init = netns_bpf_pernet_init, .pre_exit = netns_bpf_pernet_pre_exit, }; static int __init netns_bpf_init(void) { return register_pernet_subsys(&netns_bpf_pernet_ops); } subsys_initcall(netns_bpf_init);
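On link release, the code above rebuilds the per-type run array rather than editing it in place: it records the link's position with link_index(), unlinks it, allocates an array sized to the remaining link_count(), and refills it with fill_prog_array(), falling back to bpf_prog_array_delete_safe_at() only if the allocation fails. A userspace sketch of that rebuild step over a plain pointer array (hypothetical struct prog and rebuild_without() are illustrative only; there is no RCU or locking here as in the kernel path):

/* Userspace sketch of rebuilding a smaller program array after one
 * entry is removed, mirroring what bpf_netns_link_release() does with
 * link_index()/fill_prog_array(). "struct prog" is an opaque stand-in.
 */
#include <stdio.h>
#include <stdlib.h>

struct prog { const char *name; };

static struct prog **rebuild_without(struct prog **old, size_t n, size_t idx)
{
	struct prog **new_arr = calloc(n - 1, sizeof(*new_arr));
	size_t i, j = 0;

	if (!new_arr)
		return NULL;	/* kernel falls back to deleting in place here */

	for (i = 0; i < n; i++)
		if (i != idx)
			new_arr[j++] = old[i];
	return new_arr;
}

int main(void)
{
	struct prog a = { "prog-a" }, b = { "prog-b" }, c = { "prog-c" };
	struct prog *old[] = { &a, &b, &c };
	struct prog **cur = rebuild_without(old, 3, 1);	/* drop "prog-b" */

	for (size_t i = 0; cur && i < 2; i++)
		printf("slot %zu: %s\n", i, cur[i]->name);
	free(cur);
	return 0;
}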
// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2001 Jens Axboe <axboe@kernel.dk> */ #include <linux/mm.h> #include <linux/swap.h> #include <linux/bio-integrity.h> #include <linux/blkdev.h> #include <linux/uio.h> #include <linux/iocontext.h> #include <linux/slab.h> #include <linux/init.h> #include <linux/kernel.h> #include <linux/export.h> #include <linux/mempool.h> #include <linux/workqueue.h> #include <linux/cgroup.h> #include <linux/highmem.h> #include <linux/blk-crypto.h> #include <linux/xarray.h> #include <trace/events/block.h> #include "blk.h" #include "blk-rq-qos.h" #include "blk-cgroup.h" #define ALLOC_CACHE_THRESHOLD 16 #define ALLOC_CACHE_MAX 256 struct bio_alloc_cache { struct bio *free_list; struct bio *free_list_irq; unsigned int nr; unsigned int nr_irq; }; static struct biovec_slab { int nr_vecs; char *name; struct kmem_cache *slab; } bvec_slabs[] __read_mostly = { { .nr_vecs = 16, .name = "biovec-16" }, { .nr_vecs = 64, .name = "biovec-64" }, { .nr_vecs = 128, .name = "biovec-128" }, { .nr_vecs = BIO_MAX_VECS, .name = "biovec-max" }, }; static struct biovec_slab *biovec_slab(unsigned short nr_vecs) { switch (nr_vecs) { /* smaller bios use inline vecs */ case 5 ... 16: return &bvec_slabs[0]; case 17 ... 64: return &bvec_slabs[1]; case 65 ... 128: return &bvec_slabs[2]; case 129 ... BIO_MAX_VECS: return &bvec_slabs[3]; default: BUG(); return NULL; } } /* * fs_bio_set is the bio_set containing bio and iovec memory pools used by * IO code that does not need private memory pools.
*/ struct bio_set fs_bio_set; EXPORT_SYMBOL(fs_bio_set); /* * Our slab pool management */ struct bio_slab { struct kmem_cache *slab; unsigned int slab_ref; unsigned int slab_size; char name[12]; }; static DEFINE_MUTEX(bio_slab_lock); static DEFINE_XARRAY(bio_slabs); static struct bio_slab *create_bio_slab(unsigned int size) { struct bio_slab *bslab = kzalloc(sizeof(*bslab), GFP_KERNEL); if (!bslab) return NULL; snprintf(bslab->name, sizeof(bslab->name), "bio-%d", size); bslab->slab = kmem_cache_create(bslab->name, size, ARCH_KMALLOC_MINALIGN, SLAB_HWCACHE_ALIGN | SLAB_TYPESAFE_BY_RCU, NULL); if (!bslab->slab) goto fail_alloc_slab; bslab->slab_ref = 1; bslab->slab_size = size; if (!xa_err(xa_store(&bio_slabs, size, bslab, GFP_KERNEL))) return bslab; kmem_cache_destroy(bslab->slab); fail_alloc_slab: kfree(bslab); return NULL; } static inline unsigned int bs_bio_slab_size(struct bio_set *bs) { return bs->front_pad + sizeof(struct bio) + bs->back_pad; } static struct kmem_cache *bio_find_or_create_slab(struct bio_set *bs) { unsigned int size = bs_bio_slab_size(bs); struct bio_slab *bslab; mutex_lock(&bio_slab_lock); bslab = xa_load(&bio_slabs, size); if (bslab) bslab->slab_ref++; else bslab = create_bio_slab(size); mutex_unlock(&bio_slab_lock); if (bslab) return bslab->slab; return NULL; } static void bio_put_slab(struct bio_set *bs) { struct bio_slab *bslab = NULL; unsigned int slab_size = bs_bio_slab_size(bs); mutex_lock(&bio_slab_lock); bslab = xa_load(&bio_slabs, slab_size); if (WARN(!bslab, KERN_ERR "bio: unable to find slab!\n")) goto out; WARN_ON_ONCE(bslab->slab != bs->bio_slab); WARN_ON(!bslab->slab_ref); if (--bslab->slab_ref) goto out; xa_erase(&bio_slabs, slab_size); kmem_cache_destroy(bslab->slab); kfree(bslab); out: mutex_unlock(&bio_slab_lock); } void bvec_free(mempool_t *pool, struct bio_vec *bv, unsigned short nr_vecs) { BUG_ON(nr_vecs > BIO_MAX_VECS); if (nr_vecs == BIO_MAX_VECS) mempool_free(bv, pool); else if (nr_vecs > BIO_INLINE_VECS) kmem_cache_free(biovec_slab(nr_vecs)->slab, bv); } /* * Make the first allocation restricted and don't dump info on allocation * failures, since we'll fall back to the mempool in case of failure. */ static inline gfp_t bvec_alloc_gfp(gfp_t gfp) { return (gfp & ~(__GFP_DIRECT_RECLAIM | __GFP_IO)) | __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN; } struct bio_vec *bvec_alloc(mempool_t *pool, unsigned short *nr_vecs, gfp_t gfp_mask) { struct biovec_slab *bvs = biovec_slab(*nr_vecs); if (WARN_ON_ONCE(!bvs)) return NULL; /* * Upgrade the nr_vecs request to take full advantage of the allocation. * We also rely on this in the bvec_free path. */ *nr_vecs = bvs->nr_vecs; /* * Try a slab allocation first for all smaller allocations. If that * fails and __GFP_DIRECT_RECLAIM is set retry with the mempool. * The mempool is sized to handle up to BIO_MAX_VECS entries. 
*/ if (*nr_vecs < BIO_MAX_VECS) { struct bio_vec *bvl; bvl = kmem_cache_alloc(bvs->slab, bvec_alloc_gfp(gfp_mask)); if (likely(bvl) || !(gfp_mask & __GFP_DIRECT_RECLAIM)) return bvl; *nr_vecs = BIO_MAX_VECS; } return mempool_alloc(pool, gfp_mask); } void bio_uninit(struct bio *bio) { #ifdef CONFIG_BLK_CGROUP if (bio->bi_blkg) { blkg_put(bio->bi_blkg); bio->bi_blkg = NULL; } #endif if (bio_integrity(bio)) bio_integrity_free(bio); bio_crypt_free_ctx(bio); } EXPORT_SYMBOL(bio_uninit); static void bio_free(struct bio *bio) { struct bio_set *bs = bio->bi_pool; void *p = bio; WARN_ON_ONCE(!bs); bio_uninit(bio); bvec_free(&bs->bvec_pool, bio->bi_io_vec, bio->bi_max_vecs); mempool_free(p - bs->front_pad, &bs->bio_pool); } /* * Users of this function have their own bio allocation. Subsequently, * they must remember to pair any call to bio_init() with bio_uninit() * when IO has completed, or when the bio is released. */ void bio_init(struct bio *bio, struct block_device *bdev, struct bio_vec *table, unsigned short max_vecs, blk_opf_t opf) { bio->bi_next = NULL; bio->bi_bdev = bdev; bio->bi_opf = opf; bio->bi_flags = 0; bio->bi_ioprio = 0; bio->bi_write_hint = 0; bio->bi_write_stream = 0; bio->bi_status = 0; bio->bi_bvec_gap_bit = 0; bio->bi_iter.bi_sector = 0; bio->bi_iter.bi_size = 0; bio->bi_iter.bi_idx = 0; bio->bi_iter.bi_bvec_done = 0; bio->bi_end_io = NULL; bio->bi_private = NULL; #ifdef CONFIG_BLK_CGROUP bio->bi_blkg = NULL; bio->issue_time_ns = 0; if (bdev) bio_associate_blkg(bio); #ifdef CONFIG_BLK_CGROUP_IOCOST bio->bi_iocost_cost = 0; #endif #endif #ifdef CONFIG_BLK_INLINE_ENCRYPTION bio->bi_crypt_context = NULL; #endif #ifdef CONFIG_BLK_DEV_INTEGRITY bio->bi_integrity = NULL; #endif bio->bi_vcnt = 0; atomic_set(&bio->__bi_remaining, 1); atomic_set(&bio->__bi_cnt, 1); bio->bi_cookie = BLK_QC_T_NONE; bio->bi_max_vecs = max_vecs; bio->bi_io_vec = table; bio->bi_pool = NULL; } EXPORT_SYMBOL(bio_init); /** * bio_reset - reinitialize a bio * @bio: bio to reset * @bdev: block device to use the bio for * @opf: operation and flags for bio * * Description: * After calling bio_reset(), @bio will be in the same state as a freshly * allocated bio returned bio bio_alloc_bioset() - the only fields that are * preserved are the ones that are initialized by bio_alloc_bioset(). See * comment in struct bio. */ void bio_reset(struct bio *bio, struct block_device *bdev, blk_opf_t opf) { bio_uninit(bio); memset(bio, 0, BIO_RESET_BYTES); atomic_set(&bio->__bi_remaining, 1); bio->bi_bdev = bdev; if (bio->bi_bdev) bio_associate_blkg(bio); bio->bi_opf = opf; } EXPORT_SYMBOL(bio_reset); static struct bio *__bio_chain_endio(struct bio *bio) { struct bio *parent = bio->bi_private; if (bio->bi_status && !parent->bi_status) parent->bi_status = bio->bi_status; bio_put(bio); return parent; } /* * This function should only be used as a flag and must never be called. * If execution reaches here, it indicates a serious programming error. */ static void bio_chain_endio(struct bio *bio) { BUG(); } /** * bio_chain - chain bio completions * @bio: the target bio * @parent: the parent bio of @bio * * The caller won't have a bi_end_io called when @bio completes - instead, * @parent's bi_end_io won't be called until both @parent and @bio have * completed; the chained bio will also be freed when it completes. * * The caller must not set bi_private or bi_end_io in @bio. 
*/ void bio_chain(struct bio *bio, struct bio *parent) { BUG_ON(bio->bi_private || bio->bi_end_io); bio->bi_private = parent; bio->bi_end_io = bio_chain_endio; bio_inc_remaining(parent); } EXPORT_SYMBOL(bio_chain); /** * bio_chain_and_submit - submit a bio after chaining it to another one * @prev: bio to chain and submit * @new: bio to chain to * * If @prev is non-NULL, chain it to @new and submit it. * * Return: @new. */ struct bio *bio_chain_and_submit(struct bio *prev, struct bio *new) { if (prev) { bio_chain(prev, new); submit_bio(prev); } return new; } struct bio *blk_next_bio(struct bio *bio, struct block_device *bdev, unsigned int nr_pages, blk_opf_t opf, gfp_t gfp) { return bio_chain_and_submit(bio, bio_alloc(bdev, nr_pages, opf, gfp)); } EXPORT_SYMBOL_GPL(blk_next_bio); static void bio_alloc_rescue(struct work_struct *work) { struct bio_set *bs = container_of(work, struct bio_set, rescue_work); struct bio *bio; while (1) { spin_lock(&bs->rescue_lock); bio = bio_list_pop(&bs->rescue_list); spin_unlock(&bs->rescue_lock); if (!bio) break; submit_bio_noacct(bio); } } static void punt_bios_to_rescuer(struct bio_set *bs) { struct bio_list punt, nopunt; struct bio *bio; if (WARN_ON_ONCE(!bs->rescue_workqueue)) return; /* * In order to guarantee forward progress we must punt only bios that * were allocated from this bio_set; otherwise, if there was a bio on * there for a stacking driver higher up in the stack, processing it * could require allocating bios from this bio_set, and doing that from * our own rescuer would be bad. * * Since bio lists are singly linked, pop them all instead of trying to * remove from the middle of the list: */ bio_list_init(&punt); bio_list_init(&nopunt); while ((bio = bio_list_pop(&current->bio_list[0]))) bio_list_add(bio->bi_pool == bs ? &punt : &nopunt, bio); current->bio_list[0] = nopunt; bio_list_init(&nopunt); while ((bio = bio_list_pop(&current->bio_list[1]))) bio_list_add(bio->bi_pool == bs ? &punt : &nopunt, bio); current->bio_list[1] = nopunt; spin_lock(&bs->rescue_lock); bio_list_merge(&bs->rescue_list, &punt); spin_unlock(&bs->rescue_lock); queue_work(bs->rescue_workqueue, &bs->rescue_work); } static void bio_alloc_irq_cache_splice(struct bio_alloc_cache *cache) { unsigned long flags; /* cache->free_list must be empty */ if (WARN_ON_ONCE(cache->free_list)) return; local_irq_save(flags); cache->free_list = cache->free_list_irq; cache->free_list_irq = NULL; cache->nr += cache->nr_irq; cache->nr_irq = 0; local_irq_restore(flags); } static struct bio *bio_alloc_percpu_cache(struct block_device *bdev, unsigned short nr_vecs, blk_opf_t opf, gfp_t gfp, struct bio_set *bs) { struct bio_alloc_cache *cache; struct bio *bio; cache = per_cpu_ptr(bs->cache, get_cpu()); if (!cache->free_list) { if (READ_ONCE(cache->nr_irq) >= ALLOC_CACHE_THRESHOLD) bio_alloc_irq_cache_splice(cache); if (!cache->free_list) { put_cpu(); return NULL; } } bio = cache->free_list; cache->free_list = bio->bi_next; cache->nr--; put_cpu(); if (nr_vecs) bio_init_inline(bio, bdev, nr_vecs, opf); else bio_init(bio, bdev, NULL, nr_vecs, opf); bio->bi_pool = bs; return bio; } /** * bio_alloc_bioset - allocate a bio for I/O * @bdev: block device to allocate the bio for (can be %NULL) * @nr_vecs: number of bvecs to pre-allocate * @opf: operation and flags for bio * @gfp_mask: the GFP_* mask given to the slab allocator * @bs: the bio_set to allocate from. * * Allocate a bio from the mempools in @bs. * * If %__GFP_DIRECT_RECLAIM is set then bio_alloc will always be able to * allocate a bio. 
This is due to the mempool guarantees. To make this work, * callers must never allocate more than 1 bio at a time from the general pool. * Callers that need to allocate more than 1 bio must always submit the * previously allocated bio for IO before attempting to allocate a new one. * Failure to do so can cause deadlocks under memory pressure. * * Note that when running under submit_bio_noacct() (i.e. any block driver), * bios are not submitted until after you return - see the code in * submit_bio_noacct() that converts recursion into iteration, to prevent * stack overflows. * * This would normally mean allocating multiple bios under submit_bio_noacct() * would be susceptible to deadlocks, but we have * deadlock avoidance code that resubmits any blocked bios from a rescuer * thread. * * However, we do not guarantee forward progress for allocations from other * mempools. Doing multiple allocations from the same mempool under * submit_bio_noacct() should be avoided - instead, use bio_set's front_pad * for per bio allocations. * * Returns: Pointer to new bio on success, NULL on failure. */ struct bio *bio_alloc_bioset(struct block_device *bdev, unsigned short nr_vecs, blk_opf_t opf, gfp_t gfp_mask, struct bio_set *bs) { gfp_t saved_gfp = gfp_mask; struct bio *bio; void *p; /* should not use nobvec bioset for nr_vecs > 0 */ if (WARN_ON_ONCE(!mempool_initialized(&bs->bvec_pool) && nr_vecs > 0)) return NULL; if (bs->cache && nr_vecs <= BIO_INLINE_VECS) { opf |= REQ_ALLOC_CACHE; bio = bio_alloc_percpu_cache(bdev, nr_vecs, opf, gfp_mask, bs); if (bio) return bio; /* * No cached bio available, bio returned below marked with * REQ_ALLOC_CACHE to participate in per-cpu alloc cache. */ } else opf &= ~REQ_ALLOC_CACHE; /* * submit_bio_noacct() converts recursion to iteration; this means if * we're running beneath it, any bios we allocate and submit will not be * submitted (and thus freed) until after we return. * * This exposes us to a potential deadlock if we allocate multiple bios * from the same bio_set() while running underneath submit_bio_noacct(). * If we were to allocate multiple bios (say a stacking block driver * that was splitting bios), we would deadlock if we exhausted the * mempool's reserve. * * We solve this, and guarantee forward progress, with a rescuer * workqueue per bio_set. If we go to allocate and there are bios on * current->bio_list, we first try the allocation without * __GFP_DIRECT_RECLAIM; if that fails, we punt those bios we would be * blocking to the rescuer workqueue before we retry with the original * gfp_flags. 
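 *
 * For callers the rule above reduces to a simple loop shape (sketch only,
 * names invented): allocate one bio from the bio_set, fill and submit it,
 * and only then allocate the next one:
 *
 *	while (have_more_data) {
 *		bio = bio_alloc_bioset(bdev, nr_vecs, opf, GFP_NOIO, bs);
 *		... add pages ...
 *		submit_bio(bio);
 *	}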
*/ if (current->bio_list && (!bio_list_empty(&current->bio_list[0]) || !bio_list_empty(&current->bio_list[1])) && bs->rescue_workqueue) gfp_mask &= ~__GFP_DIRECT_RECLAIM; p = mempool_alloc(&bs->bio_pool, gfp_mask); if (!p && gfp_mask != saved_gfp) { punt_bios_to_rescuer(bs); gfp_mask = saved_gfp; p = mempool_alloc(&bs->bio_pool, gfp_mask); } if (unlikely(!p)) return NULL; if (!mempool_is_saturated(&bs->bio_pool)) opf &= ~REQ_ALLOC_CACHE; bio = p + bs->front_pad; if (nr_vecs > BIO_INLINE_VECS) { struct bio_vec *bvl = NULL; bvl = bvec_alloc(&bs->bvec_pool, &nr_vecs, gfp_mask); if (!bvl && gfp_mask != saved_gfp) { punt_bios_to_rescuer(bs); gfp_mask = saved_gfp; bvl = bvec_alloc(&bs->bvec_pool, &nr_vecs, gfp_mask); } if (unlikely(!bvl)) goto err_free; bio_init(bio, bdev, bvl, nr_vecs, opf); } else if (nr_vecs) { bio_init_inline(bio, bdev, BIO_INLINE_VECS, opf); } else { bio_init(bio, bdev, NULL, 0, opf); } bio->bi_pool = bs; return bio; err_free: mempool_free(p, &bs->bio_pool); return NULL; } EXPORT_SYMBOL(bio_alloc_bioset); /** * bio_kmalloc - kmalloc a bio * @nr_vecs: number of bio_vecs to allocate * @gfp_mask: the GFP_* mask given to the slab allocator * * Use kmalloc to allocate a bio (including bvecs). The bio must be initialized * using bio_init() before use. To free a bio returned from this function use * kfree() after calling bio_uninit(). A bio returned from this function can * be reused by calling bio_uninit() before calling bio_init() again. * * Note that unlike bio_alloc() or bio_alloc_bioset() allocations from this * function are not backed by a mempool can fail. Do not use this function * for allocations in the file system I/O path. * * Returns: Pointer to new bio on success, NULL on failure. */ struct bio *bio_kmalloc(unsigned short nr_vecs, gfp_t gfp_mask) { struct bio *bio; if (nr_vecs > BIO_MAX_INLINE_VECS) return NULL; return kmalloc(sizeof(*bio) + nr_vecs * sizeof(struct bio_vec), gfp_mask); } EXPORT_SYMBOL(bio_kmalloc); void zero_fill_bio_iter(struct bio *bio, struct bvec_iter start) { struct bio_vec bv; struct bvec_iter iter; __bio_for_each_segment(bv, bio, iter, start) memzero_bvec(&bv); } EXPORT_SYMBOL(zero_fill_bio_iter); /** * bio_truncate - truncate the bio to small size of @new_size * @bio: the bio to be truncated * @new_size: new size for truncating the bio * * Description: * Truncate the bio to new size of @new_size. If bio_op(bio) is * REQ_OP_READ, zero the truncated part. This function should only * be used for handling corner cases, such as bio eod. */ static void bio_truncate(struct bio *bio, unsigned new_size) { struct bio_vec bv; struct bvec_iter iter; unsigned int done = 0; bool truncated = false; if (new_size >= bio->bi_iter.bi_size) return; if (bio_op(bio) != REQ_OP_READ) goto exit; bio_for_each_segment(bv, bio, iter) { if (done + bv.bv_len > new_size) { size_t offset; if (!truncated) offset = new_size - done; else offset = 0; memzero_page(bv.bv_page, bv.bv_offset + offset, bv.bv_len - offset); truncated = true; } done += bv.bv_len; } exit: /* * Don't touch bvec table here and make it really immutable, since * fs bio user has to retrieve all pages via bio_for_each_segment_all * in its .end_bio() callback. * * It is enough to truncate bio by updating .bi_size since we can make * correct bvec with the updated .bi_size for drivers. 
*/ bio->bi_iter.bi_size = new_size; } /** * guard_bio_eod - truncate a BIO to fit the block device * @bio: bio to truncate * * This allows us to do IO even on the odd last sectors of a device, even if the * block size is some multiple of the physical sector size. * * We'll just truncate the bio to the size of the device, and clear the end of * the buffer head manually. Truly out-of-range accesses will turn into actual * I/O errors, this only handles the "we need to be able to do I/O at the final * sector" case. */ void guard_bio_eod(struct bio *bio) { sector_t maxsector = bdev_nr_sectors(bio->bi_bdev); if (!maxsector) return; /* * If the *whole* IO is past the end of the device, * let it through, and the IO layer will turn it into * an EIO. */ if (unlikely(bio->bi_iter.bi_sector >= maxsector)) return; maxsector -= bio->bi_iter.bi_sector; if (likely((bio->bi_iter.bi_size >> 9) <= maxsector)) return; bio_truncate(bio, maxsector << 9); } static int __bio_alloc_cache_prune(struct bio_alloc_cache *cache, unsigned int nr) { unsigned int i = 0; struct bio *bio; while ((bio = cache->free_list) != NULL) { cache->free_list = bio->bi_next; cache->nr--; bio_free(bio); if (++i == nr) break; } return i; } static void bio_alloc_cache_prune(struct bio_alloc_cache *cache, unsigned int nr) { nr -= __bio_alloc_cache_prune(cache, nr); if (!READ_ONCE(cache->free_list)) { bio_alloc_irq_cache_splice(cache); __bio_alloc_cache_prune(cache, nr); } } static int bio_cpu_dead(unsigned int cpu, struct hlist_node *node) { struct bio_set *bs; bs = hlist_entry_safe(node, struct bio_set, cpuhp_dead); if (bs->cache) { struct bio_alloc_cache *cache = per_cpu_ptr(bs->cache, cpu); bio_alloc_cache_prune(cache, -1U); } return 0; } static void bio_alloc_cache_destroy(struct bio_set *bs) { int cpu; if (!bs->cache) return; cpuhp_state_remove_instance_nocalls(CPUHP_BIO_DEAD, &bs->cpuhp_dead); for_each_possible_cpu(cpu) { struct bio_alloc_cache *cache; cache = per_cpu_ptr(bs->cache, cpu); bio_alloc_cache_prune(cache, -1U); } free_percpu(bs->cache); bs->cache = NULL; } static inline void bio_put_percpu_cache(struct bio *bio) { struct bio_alloc_cache *cache; cache = per_cpu_ptr(bio->bi_pool->cache, get_cpu()); if (READ_ONCE(cache->nr_irq) + cache->nr > ALLOC_CACHE_MAX) goto out_free; if (in_task()) { bio_uninit(bio); bio->bi_next = cache->free_list; /* Not necessary but helps not to iopoll already freed bios */ bio->bi_bdev = NULL; cache->free_list = bio; cache->nr++; } else if (in_hardirq()) { lockdep_assert_irqs_disabled(); bio_uninit(bio); bio->bi_next = cache->free_list_irq; cache->free_list_irq = bio; cache->nr_irq++; } else { goto out_free; } put_cpu(); return; out_free: put_cpu(); bio_free(bio); } /** * bio_put - release a reference to a bio * @bio: bio to release reference to * * Description: * Put a reference to a &struct bio, either one you have gotten with * bio_alloc, bio_get or bio_clone_*. The last put of a bio will free it. 
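 *
 * A common pattern when the submitter needs to look at the bio after
 * completion is to hold an extra reference across submission (sketch,
 * caller code not from this file):
 *
 *	bio_get(bio);
 *	submit_bio(bio);
 *	... wait for or inspect the bio ...
 *	bio_put(bio);		- drops the extra reference taken above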
**/ void bio_put(struct bio *bio) { if (unlikely(bio_flagged(bio, BIO_REFFED))) { BUG_ON(!atomic_read(&bio->__bi_cnt)); if (!atomic_dec_and_test(&bio->__bi_cnt)) return; } if (bio->bi_opf & REQ_ALLOC_CACHE) bio_put_percpu_cache(bio); else bio_free(bio); } EXPORT_SYMBOL(bio_put); static int __bio_clone(struct bio *bio, struct bio *bio_src, gfp_t gfp) { bio_set_flag(bio, BIO_CLONED); bio->bi_ioprio = bio_src->bi_ioprio; bio->bi_write_hint = bio_src->bi_write_hint; bio->bi_write_stream = bio_src->bi_write_stream; bio->bi_iter = bio_src->bi_iter; if (bio->bi_bdev) { if (bio->bi_bdev == bio_src->bi_bdev && bio_flagged(bio_src, BIO_REMAPPED)) bio_set_flag(bio, BIO_REMAPPED); bio_clone_blkg_association(bio, bio_src); } if (bio_crypt_clone(bio, bio_src, gfp) < 0) return -ENOMEM; if (bio_integrity(bio_src) && bio_integrity_clone(bio, bio_src, gfp) < 0) return -ENOMEM; return 0; } /** * bio_alloc_clone - clone a bio that shares the original bio's biovec * @bdev: block_device to clone onto * @bio_src: bio to clone from * @gfp: allocation priority * @bs: bio_set to allocate from * * Allocate a new bio that is a clone of @bio_src. The caller owns the returned * bio, but not the actual data it points to. * * The caller must ensure that the return bio is not freed before @bio_src. */ struct bio *bio_alloc_clone(struct block_device *bdev, struct bio *bio_src, gfp_t gfp, struct bio_set *bs) { struct bio *bio; bio = bio_alloc_bioset(bdev, 0, bio_src->bi_opf, gfp, bs); if (!bio) return NULL; if (__bio_clone(bio, bio_src, gfp) < 0) { bio_put(bio); return NULL; } bio->bi_io_vec = bio_src->bi_io_vec; return bio; } EXPORT_SYMBOL(bio_alloc_clone); /** * bio_init_clone - clone a bio that shares the original bio's biovec * @bdev: block_device to clone onto * @bio: bio to clone into * @bio_src: bio to clone from * @gfp: allocation priority * * Initialize a new bio in caller provided memory that is a clone of @bio_src. * The caller owns the returned bio, but not the actual data it points to. * * The caller must ensure that @bio_src is not freed before @bio. */ int bio_init_clone(struct block_device *bdev, struct bio *bio, struct bio *bio_src, gfp_t gfp) { int ret; bio_init(bio, bdev, bio_src->bi_io_vec, 0, bio_src->bi_opf); ret = __bio_clone(bio, bio_src, gfp); if (ret) bio_uninit(bio); return ret; } EXPORT_SYMBOL(bio_init_clone); /** * bio_full - check if the bio is full * @bio: bio to check * @len: length of one segment to be added * * Return true if @bio is full and one segment with @len bytes can't be * added to the bio, otherwise return false */ static inline bool bio_full(struct bio *bio, unsigned len) { if (bio->bi_vcnt >= bio->bi_max_vecs) return true; if (bio->bi_iter.bi_size > UINT_MAX - len) return true; return false; } static bool bvec_try_merge_page(struct bio_vec *bv, struct page *page, unsigned int len, unsigned int off) { size_t bv_end = bv->bv_offset + bv->bv_len; phys_addr_t vec_end_addr = page_to_phys(bv->bv_page) + bv_end - 1; phys_addr_t page_addr = page_to_phys(page); if (vec_end_addr + 1 != page_addr + off) return false; if (xen_domain() && !xen_biovec_phys_mergeable(bv, page)) return false; if ((vec_end_addr & PAGE_MASK) != ((page_addr + off) & PAGE_MASK)) { if (IS_ENABLED(CONFIG_KMSAN)) return false; if (bv->bv_page + bv_end / PAGE_SIZE != page + off / PAGE_SIZE) return false; } bv->bv_len += len; return true; } /* * Try to merge a page into a segment, while obeying the hardware segment * size limit. 
* * This is kept around for the integrity metadata, which is still tries * to build the initial bio to the hardware limit and doesn't have proper * helpers to split. Hopefully this will go away soon. */ bool bvec_try_merge_hw_page(struct request_queue *q, struct bio_vec *bv, struct page *page, unsigned len, unsigned offset) { unsigned long mask = queue_segment_boundary(q); phys_addr_t addr1 = bvec_phys(bv); phys_addr_t addr2 = page_to_phys(page) + offset + len - 1; if ((addr1 | mask) != (addr2 | mask)) return false; if (len > queue_max_segment_size(q) - bv->bv_len) return false; return bvec_try_merge_page(bv, page, len, offset); } /** * __bio_add_page - add page(s) to a bio in a new segment * @bio: destination bio * @page: start page to add * @len: length of the data to add, may cross pages * @off: offset of the data relative to @page, may cross pages * * Add the data at @page + @off to @bio as a new bvec. The caller must ensure * that @bio has space for another bvec. */ void __bio_add_page(struct bio *bio, struct page *page, unsigned int len, unsigned int off) { WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED)); WARN_ON_ONCE(bio_full(bio, len)); if (is_pci_p2pdma_page(page)) bio->bi_opf |= REQ_NOMERGE; bvec_set_page(&bio->bi_io_vec[bio->bi_vcnt], page, len, off); bio->bi_iter.bi_size += len; bio->bi_vcnt++; } EXPORT_SYMBOL_GPL(__bio_add_page); /** * bio_add_virt_nofail - add data in the direct kernel mapping to a bio * @bio: destination bio * @vaddr: data to add * @len: length of the data to add, may cross pages * * Add the data at @vaddr to @bio. The caller must have ensure a segment * is available for the added data. No merging into an existing segment * will be performed. */ void bio_add_virt_nofail(struct bio *bio, void *vaddr, unsigned len) { __bio_add_page(bio, virt_to_page(vaddr), len, offset_in_page(vaddr)); } EXPORT_SYMBOL_GPL(bio_add_virt_nofail); /** * bio_add_page - attempt to add page(s) to bio * @bio: destination bio * @page: start page to add * @len: vec entry length, may cross pages * @offset: vec entry offset relative to @page, may cross pages * * Attempt to add page(s) to the bio_vec maplist. This will only fail * if either bio->bi_vcnt == bio->bi_max_vecs or it's a cloned bio. */ int bio_add_page(struct bio *bio, struct page *page, unsigned int len, unsigned int offset) { if (WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED))) return 0; if (bio->bi_iter.bi_size > UINT_MAX - len) return 0; if (bio->bi_vcnt > 0) { struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt - 1]; if (!zone_device_pages_have_same_pgmap(bv->bv_page, page)) return 0; if (bvec_try_merge_page(bv, page, len, offset)) { bio->bi_iter.bi_size += len; return len; } } if (bio->bi_vcnt >= bio->bi_max_vecs) return 0; __bio_add_page(bio, page, len, offset); return len; } EXPORT_SYMBOL(bio_add_page); void bio_add_folio_nofail(struct bio *bio, struct folio *folio, size_t len, size_t off) { unsigned long nr = off / PAGE_SIZE; WARN_ON_ONCE(len > UINT_MAX); __bio_add_page(bio, folio_page(folio, nr), len, off % PAGE_SIZE); } EXPORT_SYMBOL_GPL(bio_add_folio_nofail); /** * bio_add_folio - Attempt to add part of a folio to a bio. * @bio: BIO to add to. * @folio: Folio to add. * @len: How many bytes from the folio to add. * @off: First byte in this folio to add. * * Filesystems that use folios can call this function instead of calling * bio_add_page() for each page in the folio. If @off is bigger than * PAGE_SIZE, this function can create a bio_vec that starts in a page * after the bv_page. 
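 * A filesystem writeback loop typically looks roughly like this (sketch,
 * identifiers invented):
 *
 *	if (!bio_add_folio(bio, folio, flen, foff)) {
 *		submit_bio(bio);
 *		bio = bio_alloc(bdev, nr_vecs, opf, GFP_NOFS);
 *		bio_add_folio_nofail(bio, folio, flen, foff);
 *	}
 *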
BIOs do not support folios that are 4GiB or larger. * * Return: Whether the addition was successful. */ bool bio_add_folio(struct bio *bio, struct folio *folio, size_t len, size_t off) { unsigned long nr = off / PAGE_SIZE; if (len > UINT_MAX) return false; return bio_add_page(bio, folio_page(folio, nr), len, off % PAGE_SIZE) > 0; } EXPORT_SYMBOL(bio_add_folio); /** * bio_add_vmalloc_chunk - add a vmalloc chunk to a bio * @bio: destination bio * @vaddr: vmalloc address to add * @len: total length in bytes of the data to add * * Add data starting at @vaddr to @bio and return how many bytes were added. * This may be less than the amount originally asked. Returns 0 if no data * could be added to @bio. * * This helper calls flush_kernel_vmap_range() for the range added. For reads * the caller still needs to manually call invalidate_kernel_vmap_range() in * the completion handler. */ unsigned int bio_add_vmalloc_chunk(struct bio *bio, void *vaddr, unsigned len) { unsigned int offset = offset_in_page(vaddr); len = min(len, PAGE_SIZE - offset); if (bio_add_page(bio, vmalloc_to_page(vaddr), len, offset) < len) return 0; if (op_is_write(bio_op(bio))) flush_kernel_vmap_range(vaddr, len); return len; } EXPORT_SYMBOL_GPL(bio_add_vmalloc_chunk); /** * bio_add_vmalloc - add a vmalloc region to a bio * @bio: destination bio * @vaddr: vmalloc address to add * @len: total length in bytes of the data to add * * Add data starting at @vaddr to @bio. Return %true on success or %false if * @bio does not have enough space for the payload. * * This helper calls flush_kernel_vmap_range() for the range added. For reads * the caller still needs to manually call invalidate_kernel_vmap_range() in * the completion handler. */ bool bio_add_vmalloc(struct bio *bio, void *vaddr, unsigned int len) { do { unsigned int added = bio_add_vmalloc_chunk(bio, vaddr, len); if (!added) return false; vaddr += added; len -= added; } while (len); return true; } EXPORT_SYMBOL_GPL(bio_add_vmalloc); void __bio_release_pages(struct bio *bio, bool mark_dirty) { struct folio_iter fi; bio_for_each_folio_all(fi, bio) { size_t nr_pages; if (mark_dirty) { folio_lock(fi.folio); folio_mark_dirty(fi.folio); folio_unlock(fi.folio); } nr_pages = (fi.offset + fi.length - 1) / PAGE_SIZE - fi.offset / PAGE_SIZE + 1; unpin_user_folio(fi.folio, nr_pages); } } EXPORT_SYMBOL_GPL(__bio_release_pages); void bio_iov_bvec_set(struct bio *bio, const struct iov_iter *iter) { WARN_ON_ONCE(bio->bi_max_vecs); bio->bi_vcnt = iter->nr_segs; bio->bi_io_vec = (struct bio_vec *)iter->bvec; bio->bi_iter.bi_bvec_done = iter->iov_offset; bio->bi_iter.bi_size = iov_iter_count(iter); bio_set_flag(bio, BIO_CLONED); } static unsigned int get_contig_folio_len(unsigned int *num_pages, struct page **pages, unsigned int i, struct folio *folio, size_t left, size_t offset) { size_t bytes = left; size_t contig_sz = min_t(size_t, PAGE_SIZE - offset, bytes); unsigned int j; /* * We might COW a single page in the middle of * a large folio, so we have to check that all * pages belong to the same folio. 
*/ bytes -= contig_sz; for (j = i + 1; j < i + *num_pages; j++) { size_t next = min_t(size_t, PAGE_SIZE, bytes); if (page_folio(pages[j]) != folio || pages[j] != pages[j - 1] + 1) { break; } contig_sz += next; bytes -= next; } *num_pages = j - i; return contig_sz; } #define PAGE_PTRS_PER_BVEC (sizeof(struct bio_vec) / sizeof(struct page *)) /** * __bio_iov_iter_get_pages - pin user or kernel pages and add them to a bio * @bio: bio to add pages to * @iter: iov iterator describing the region to be mapped * * Extracts pages from *iter and appends them to @bio's bvec array. The pages * will have to be cleaned up in the way indicated by the BIO_PAGE_PINNED flag. * For a multi-segment *iter, this function only adds pages from the next * non-empty segment of the iov iterator. */ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter) { iov_iter_extraction_t extraction_flags = 0; unsigned short nr_pages = bio->bi_max_vecs - bio->bi_vcnt; unsigned short entries_left = bio->bi_max_vecs - bio->bi_vcnt; struct bio_vec *bv = bio->bi_io_vec + bio->bi_vcnt; struct page **pages = (struct page **)bv; ssize_t size; unsigned int num_pages, i = 0; size_t offset, folio_offset, left, len; int ret = 0; /* * Move page array up in the allocated memory for the bio vecs as far as * possible so that we can start filling biovecs from the beginning * without overwriting the temporary page array. */ BUILD_BUG_ON(PAGE_PTRS_PER_BVEC < 2); pages += entries_left * (PAGE_PTRS_PER_BVEC - 1); if (bio->bi_bdev && blk_queue_pci_p2pdma(bio->bi_bdev->bd_disk->queue)) extraction_flags |= ITER_ALLOW_P2PDMA; size = iov_iter_extract_pages(iter, &pages, UINT_MAX - bio->bi_iter.bi_size, nr_pages, extraction_flags, &offset); if (unlikely(size <= 0)) return size ? size : -EFAULT; nr_pages = DIV_ROUND_UP(offset + size, PAGE_SIZE); for (left = size, i = 0; left > 0; left -= len, i += num_pages) { struct page *page = pages[i]; struct folio *folio = page_folio(page); unsigned int old_vcnt = bio->bi_vcnt; folio_offset = ((size_t)folio_page_idx(folio, page) << PAGE_SHIFT) + offset; len = min(folio_size(folio) - folio_offset, left); num_pages = DIV_ROUND_UP(offset + len, PAGE_SIZE); if (num_pages > 1) len = get_contig_folio_len(&num_pages, pages, i, folio, left, offset); if (!bio_add_folio(bio, folio, len, folio_offset)) { WARN_ON_ONCE(1); ret = -EINVAL; goto out; } if (bio_flagged(bio, BIO_PAGE_PINNED)) { /* * We're adding another fragment of a page that already * was part of the last segment. Undo our pin as the * page was pinned when an earlier fragment of it was * added to the bio and __bio_release_pages expects a * single pin per page. */ if (offset && bio->bi_vcnt == old_vcnt) unpin_user_folio(folio, 1); } offset = 0; } iov_iter_revert(iter, left); out: while (i < nr_pages) bio_release_page(bio, pages[i++]); return ret; } /* * Aligns the bio size to the len_align_mask, releasing excessive bio vecs that * __bio_iov_iter_get_pages may have inserted, and reverts the trimmed length * for the next iteration. 
*/ static int bio_iov_iter_align_down(struct bio *bio, struct iov_iter *iter, unsigned len_align_mask) { size_t nbytes = bio->bi_iter.bi_size & len_align_mask; if (!nbytes) return 0; iov_iter_revert(iter, nbytes); bio->bi_iter.bi_size -= nbytes; do { struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt - 1]; if (nbytes < bv->bv_len) { bv->bv_len -= nbytes; break; } bio_release_page(bio, bv->bv_page); bio->bi_vcnt--; nbytes -= bv->bv_len; } while (nbytes); if (!bio->bi_vcnt) return -EFAULT; return 0; } /** * bio_iov_iter_get_pages - add user or kernel pages to a bio * @bio: bio to add pages to * @iter: iov iterator describing the region to be added * @len_align_mask: the mask to align the total size to, 0 for any length * * This takes either an iterator pointing to user memory, or one pointing to * kernel pages (BVEC iterator). If we're adding user pages, we pin them and * map them into the kernel. On IO completion, the caller should put those * pages. For bvec based iterators bio_iov_iter_get_pages() uses the provided * bvecs rather than copying them. Hence anyone issuing kiocb based IO needs * to ensure the bvecs and pages stay referenced until the submitted I/O is * completed by a call to ->ki_complete() or returns with an error other than * -EIOCBQUEUED. The caller needs to check if the bio is flagged BIO_NO_PAGE_REF * on IO completion. If it isn't, then pages should be released. * * The function tries, but does not guarantee, to pin as many pages as * fit into the bio, or are requested in @iter, whatever is smaller. If * MM encounters an error pinning the requested pages, it stops. Error * is returned only if 0 pages could be pinned. */ int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter, unsigned len_align_mask) { int ret = 0; if (WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED))) return -EIO; if (iov_iter_is_bvec(iter)) { bio_iov_bvec_set(bio, iter); iov_iter_advance(iter, bio->bi_iter.bi_size); return 0; } if (iov_iter_extract_will_pin(iter)) bio_set_flag(bio, BIO_PAGE_PINNED); do { ret = __bio_iov_iter_get_pages(bio, iter); } while (!ret && iov_iter_count(iter) && !bio_full(bio, 0)); if (bio->bi_vcnt) return bio_iov_iter_align_down(bio, iter, len_align_mask); return ret; } static void submit_bio_wait_endio(struct bio *bio) { complete(bio->bi_private); } /** * submit_bio_wait - submit a bio, and wait until it completes * @bio: The &struct bio which describes the I/O * * Simple wrapper around submit_bio(). Returns 0 on success, or the error from * bio_endio() on failure. * * WARNING: Unlike to how submit_bio() is usually used, this function does not * result in bio reference to be consumed. The caller must drop the reference * on his own. */ int submit_bio_wait(struct bio *bio) { DECLARE_COMPLETION_ONSTACK_MAP(done, bio->bi_bdev->bd_disk->lockdep_map); bio->bi_private = &done; bio->bi_end_io = submit_bio_wait_endio; bio->bi_opf |= REQ_SYNC; submit_bio(bio); blk_wait_io(&done); return blk_status_to_errno(bio->bi_status); } EXPORT_SYMBOL(submit_bio_wait); /** * bdev_rw_virt - synchronously read into / write from kernel mapping * @bdev: block device to access * @sector: sector to access * @data: data to read/write * @len: length in byte to read/write * @op: operation (e.g. REQ_OP_READ/REQ_OP_WRITE) * * Performs synchronous I/O to @bdev for @data/@len. @data must be in * the kernel direct mapping and not a vmalloc address. 
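 *
 * Example sketch (reading one 512-byte block into a kmalloc()ed buffer;
 * names are illustrative):
 *
 *	buf = kmalloc(512, GFP_KERNEL);
 *	if (!buf)
 *		return -ENOMEM;
 *	error = bdev_rw_virt(bdev, sector, buf, 512, REQ_OP_READ);
 *
 * Return: 0 on success or a negative errno on failure.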
*/ int bdev_rw_virt(struct block_device *bdev, sector_t sector, void *data, size_t len, enum req_op op) { struct bio_vec bv; struct bio bio; int error; if (WARN_ON_ONCE(is_vmalloc_addr(data))) return -EIO; bio_init(&bio, bdev, &bv, 1, op); bio.bi_iter.bi_sector = sector; bio_add_virt_nofail(&bio, data, len); error = submit_bio_wait(&bio); bio_uninit(&bio); return error; } EXPORT_SYMBOL_GPL(bdev_rw_virt); static void bio_wait_end_io(struct bio *bio) { complete(bio->bi_private); bio_put(bio); } /* * bio_await_chain - ends @bio and waits for every chained bio to complete */ void bio_await_chain(struct bio *bio) { DECLARE_COMPLETION_ONSTACK_MAP(done, bio->bi_bdev->bd_disk->lockdep_map); bio->bi_private = &done; bio->bi_end_io = bio_wait_end_io; bio_endio(bio); blk_wait_io(&done); } void __bio_advance(struct bio *bio, unsigned bytes) { if (bio_integrity(bio)) bio_integrity_advance(bio, bytes); bio_crypt_advance(bio, bytes); bio_advance_iter(bio, &bio->bi_iter, bytes); } EXPORT_SYMBOL(__bio_advance); void bio_copy_data_iter(struct bio *dst, struct bvec_iter *dst_iter, struct bio *src, struct bvec_iter *src_iter) { while (src_iter->bi_size && dst_iter->bi_size) { struct bio_vec src_bv = bio_iter_iovec(src, *src_iter); struct bio_vec dst_bv = bio_iter_iovec(dst, *dst_iter); unsigned int bytes = min(src_bv.bv_len, dst_bv.bv_len); void *src_buf = bvec_kmap_local(&src_bv); void *dst_buf = bvec_kmap_local(&dst_bv); memcpy(dst_buf, src_buf, bytes); kunmap_local(dst_buf); kunmap_local(src_buf); bio_advance_iter_single(src, src_iter, bytes); bio_advance_iter_single(dst, dst_iter, bytes); } } EXPORT_SYMBOL(bio_copy_data_iter); /** * bio_copy_data - copy contents of data buffers from one bio to another * @src: source bio * @dst: destination bio * * Stops when it reaches the end of either @src or @dst - that is, copies * min(src->bi_size, dst->bi_size) bytes (or the equivalent for lists of bios). */ void bio_copy_data(struct bio *dst, struct bio *src) { struct bvec_iter src_iter = src->bi_iter; struct bvec_iter dst_iter = dst->bi_iter; bio_copy_data_iter(dst, &dst_iter, src, &src_iter); } EXPORT_SYMBOL(bio_copy_data); void bio_free_pages(struct bio *bio) { struct bio_vec *bvec; struct bvec_iter_all iter_all; bio_for_each_segment_all(bvec, bio, iter_all) __free_page(bvec->bv_page); } EXPORT_SYMBOL(bio_free_pages); /* * bio_set_pages_dirty() and bio_check_pages_dirty() are support functions * for performing direct-IO in BIOs. * * The problem is that we cannot run folio_mark_dirty() from interrupt context * because the required locks are not interrupt-safe. So what we can do is to * mark the pages dirty _before_ performing IO. And in interrupt context, * check that the pages are still dirty. If so, fine. If not, redirty them * in process context. * * Note that this code is very hard to test under normal circumstances because * direct-io pins the pages with get_user_pages(). This makes * is_page_cache_freeable return false, and the VM will not clean the pages. * But other code (eg, flusher threads) could clean the pages if they are mapped * pagecache. * * Simply disabling the call to bio_set_pages_dirty() is a good way to test the * deferred bio dirtying paths. */ /* * bio_set_pages_dirty() will mark all the bio's pages as dirty. 
*/ void bio_set_pages_dirty(struct bio *bio) { struct folio_iter fi; bio_for_each_folio_all(fi, bio) { folio_lock(fi.folio); folio_mark_dirty(fi.folio); folio_unlock(fi.folio); } } EXPORT_SYMBOL_GPL(bio_set_pages_dirty); /* * bio_check_pages_dirty() will check that all the BIO's pages are still dirty. * If they are, then fine. If, however, some pages are clean then they must * have been written out during the direct-IO read. So we take another ref on * the BIO and re-dirty the pages in process context. * * It is expected that bio_check_pages_dirty() will wholly own the BIO from * here on. It will unpin each page and will run one bio_put() against the * BIO. */ static void bio_dirty_fn(struct work_struct *work); static DECLARE_WORK(bio_dirty_work, bio_dirty_fn); static DEFINE_SPINLOCK(bio_dirty_lock); static struct bio *bio_dirty_list; /* * This runs in process context */ static void bio_dirty_fn(struct work_struct *work) { struct bio *bio, *next; spin_lock_irq(&bio_dirty_lock); next = bio_dirty_list; bio_dirty_list = NULL; spin_unlock_irq(&bio_dirty_lock); while ((bio = next) != NULL) { next = bio->bi_private; bio_release_pages(bio, true); bio_put(bio); } } void bio_check_pages_dirty(struct bio *bio) { struct folio_iter fi; unsigned long flags; bio_for_each_folio_all(fi, bio) { if (!folio_test_dirty(fi.folio)) goto defer; } bio_release_pages(bio, false); bio_put(bio); return; defer: spin_lock_irqsave(&bio_dirty_lock, flags); bio->bi_private = bio_dirty_list; bio_dirty_list = bio; spin_unlock_irqrestore(&bio_dirty_lock, flags); schedule_work(&bio_dirty_work); } EXPORT_SYMBOL_GPL(bio_check_pages_dirty); static inline bool bio_remaining_done(struct bio *bio) { /* * If we're not chaining, then ->__bi_remaining is always 1 and * we always end io on the first invocation. */ if (!bio_flagged(bio, BIO_CHAIN)) return true; BUG_ON(atomic_read(&bio->__bi_remaining) <= 0); if (atomic_dec_and_test(&bio->__bi_remaining)) { bio_clear_flag(bio, BIO_CHAIN); return true; } return false; } /** * bio_endio - end I/O on a bio * @bio: bio * * Description: * bio_endio() will end I/O on the whole bio. bio_endio() is the preferred * way to end I/O on a bio. No one should call bi_end_io() directly on a * bio unless they own it and thus know that it has an end_io function. * * bio_endio() can be called several times on a bio that has been chained * using bio_chain(). The ->bi_end_io() function will only be called the * last time. **/ void bio_endio(struct bio *bio) { again: if (!bio_remaining_done(bio)) return; if (!bio_integrity_endio(bio)) return; blk_zone_bio_endio(bio); rq_qos_done_bio(bio); if (bio->bi_bdev && bio_flagged(bio, BIO_TRACE_COMPLETION)) { trace_block_bio_complete(bdev_get_queue(bio->bi_bdev), bio); bio_clear_flag(bio, BIO_TRACE_COMPLETION); } /* * Need to have a real endio function for chained bios, otherwise * various corner cases will break (like stacking block devices that * save/restore bi_end_io) - however, we want to avoid unbounded * recursion and blowing the stack. Tail call optimization would * handle this, but compiling with frame pointers also disables * gcc's sibling call optimization. */ if (bio->bi_end_io == bio_chain_endio) { bio = __bio_chain_endio(bio); goto again; } #ifdef CONFIG_BLK_CGROUP /* * Release cgroup info. We shouldn't have to do this here, but quite * a few callers of bio_init fail to call bio_uninit, so we cover up * for that here at least for now. 
*/ if (bio->bi_blkg) { blkg_put(bio->bi_blkg); bio->bi_blkg = NULL; } #endif if (bio->bi_end_io) bio->bi_end_io(bio); } EXPORT_SYMBOL(bio_endio); /** * bio_split - split a bio * @bio: bio to split * @sectors: number of sectors to split from the front of @bio * @gfp: gfp mask * @bs: bio set to allocate from * * Allocates and returns a new bio which represents @sectors from the start of * @bio, and updates @bio to represent the remaining sectors. * * Unless this is a discard request the newly allocated bio will point * to @bio's bi_io_vec. It is the caller's responsibility to ensure that * neither @bio nor @bs are freed before the split bio. */ struct bio *bio_split(struct bio *bio, int sectors, gfp_t gfp, struct bio_set *bs) { struct bio *split; if (WARN_ON_ONCE(sectors <= 0)) return ERR_PTR(-EINVAL); if (WARN_ON_ONCE(sectors >= bio_sectors(bio))) return ERR_PTR(-EINVAL); /* Zone append commands cannot be split */ if (WARN_ON_ONCE(bio_op(bio) == REQ_OP_ZONE_APPEND)) return ERR_PTR(-EINVAL); /* atomic writes cannot be split */ if (bio->bi_opf & REQ_ATOMIC) return ERR_PTR(-EINVAL); split = bio_alloc_clone(bio->bi_bdev, bio, gfp, bs); if (!split) return ERR_PTR(-ENOMEM); split->bi_iter.bi_size = sectors << 9; if (bio_integrity(split)) bio_integrity_trim(split); bio_advance(bio, split->bi_iter.bi_size); if (bio_flagged(bio, BIO_TRACE_COMPLETION)) bio_set_flag(split, BIO_TRACE_COMPLETION); return split; } EXPORT_SYMBOL(bio_split); /** * bio_trim - trim a bio * @bio: bio to trim * @offset: number of sectors to trim from the front of @bio * @size: size we want to trim @bio to, in sectors * * This function is typically used for bios that are cloned and submitted * to the underlying device in parts. */ void bio_trim(struct bio *bio, sector_t offset, sector_t size) { /* We should never trim an atomic write */ if (WARN_ON_ONCE(bio->bi_opf & REQ_ATOMIC && size)) return; if (WARN_ON_ONCE(offset > BIO_MAX_SECTORS || size > BIO_MAX_SECTORS || offset + size > bio_sectors(bio))) return; size <<= 9; if (offset == 0 && size == bio->bi_iter.bi_size) return; bio_advance(bio, offset << 9); bio->bi_iter.bi_size = size; if (bio_integrity(bio)) bio_integrity_trim(bio); } EXPORT_SYMBOL_GPL(bio_trim); /* * create memory pools for biovec's in a bio_set. * use the global biovec slabs created for general use. */ int biovec_init_pool(mempool_t *pool, int pool_entries) { struct biovec_slab *bp = bvec_slabs + ARRAY_SIZE(bvec_slabs) - 1; return mempool_init_slab_pool(pool, pool_entries, bp->slab); } /* * bioset_exit - exit a bioset initialized with bioset_init() * * May be called on a zeroed but uninitialized bioset (i.e. allocated with * kzalloc()). */ void bioset_exit(struct bio_set *bs) { bio_alloc_cache_destroy(bs); if (bs->rescue_workqueue) destroy_workqueue(bs->rescue_workqueue); bs->rescue_workqueue = NULL; mempool_exit(&bs->bio_pool); mempool_exit(&bs->bvec_pool); if (bs->bio_slab) bio_put_slab(bs); bs->bio_slab = NULL; } EXPORT_SYMBOL(bioset_exit); /** * bioset_init - Initialize a bio_set * @bs: pool to initialize * @pool_size: Number of bio and bio_vecs to cache in the mempool * @front_pad: Number of bytes to allocate in front of the returned bio * @flags: Flags to modify behavior, currently %BIOSET_NEED_BVECS * and %BIOSET_NEED_RESCUER * * Description: * Set up a bio_set to be used with @bio_alloc_bioset. Allows the caller * to ask for a number of bytes to be allocated in front of the bio. 
* Front pad allocation is useful for embedding the bio inside * another structure, to avoid allocating extra data to go with the bio. * Note that the bio must be embedded at the END of that structure always, * or things will break badly. * If %BIOSET_NEED_BVECS is set in @flags, a separate pool will be allocated * for allocating iovecs. This pool is not needed e.g. for bio_init_clone(). * If %BIOSET_NEED_RESCUER is set, a workqueue is created which can be used * to dispatch queued requests when the mempool runs out of space. * */ int bioset_init(struct bio_set *bs, unsigned int pool_size, unsigned int front_pad, int flags) { bs->front_pad = front_pad; if (flags & BIOSET_NEED_BVECS) bs->back_pad = BIO_INLINE_VECS * sizeof(struct bio_vec); else bs->back_pad = 0; spin_lock_init(&bs->rescue_lock); bio_list_init(&bs->rescue_list); INIT_WORK(&bs->rescue_work, bio_alloc_rescue); bs->bio_slab = bio_find_or_create_slab(bs); if (!bs->bio_slab) return -ENOMEM; if (mempool_init_slab_pool(&bs->bio_pool, pool_size, bs->bio_slab)) goto bad; if ((flags & BIOSET_NEED_BVECS) && biovec_init_pool(&bs->bvec_pool, pool_size)) goto bad; if (flags & BIOSET_NEED_RESCUER) { bs->rescue_workqueue = alloc_workqueue("bioset", WQ_MEM_RECLAIM, 0); if (!bs->rescue_workqueue) goto bad; } if (flags & BIOSET_PERCPU_CACHE) { bs->cache = alloc_percpu(struct bio_alloc_cache); if (!bs->cache) goto bad; cpuhp_state_add_instance_nocalls(CPUHP_BIO_DEAD, &bs->cpuhp_dead); } return 0; bad: bioset_exit(bs); return -ENOMEM; } EXPORT_SYMBOL(bioset_init); static int __init init_bio(void) { int i; BUILD_BUG_ON(BIO_FLAG_LAST > 8 * sizeof_field(struct bio, bi_flags)); for (i = 0; i < ARRAY_SIZE(bvec_slabs); i++) { struct biovec_slab *bvs = bvec_slabs + i; bvs->slab = kmem_cache_create(bvs->name, bvs->nr_vecs * sizeof(struct bio_vec), 0, SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL); } cpuhp_setup_state_multi(CPUHP_BIO_DEAD, "block/bio:dead", NULL, bio_cpu_dead); if (bioset_init(&fs_bio_set, BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS | BIOSET_PERCPU_CACHE)) panic("bio: can't allocate bios\n"); return 0; } subsys_initcall(init_bio);
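/*
 * Usage sketch for the front_pad feature documented above. Everything below
 * is illustrative only - struct my_request, my_bio_set, my_driver_init() and
 * my_end_io() are invented names, not part of this file. A driver embeds its
 * per-I/O state in front of the bio, sizes front_pad accordingly, and gets
 * back to its state with container_of() on completion.
 */
struct my_request {
	int		my_state;	/* hypothetical per-I/O bookkeeping */
	struct bio	bio;		/* must be the last member */
};

static struct bio_set my_bio_set;

static int my_driver_init(void)
{
	return bioset_init(&my_bio_set, BIO_POOL_SIZE,
			   offsetof(struct my_request, bio),
			   BIOSET_NEED_BVECS);
}

static void my_end_io(struct bio *bio)
{
	struct my_request *rq = container_of(bio, struct my_request, bio);

	/* ... complete the work tracked in rq->my_state ... */
	bio_put(bio);
}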
/* SPDX-License-Identifier: GPL-2.0 */ #ifndef __BEN_VLAN_802_1Q_INC__ #define __BEN_VLAN_802_1Q_INC__ #include <linux/if_vlan.h> #include <linux/u64_stats_sync.h> #include <linux/list.h> /* if this changes, algorithm will have to be reworked because this * depends on completely exhausting the VLAN identifier space. Thus * it gives constant time look-up, but in many cases it wastes memory. */ #define VLAN_GROUP_ARRAY_SPLIT_PARTS 8 #define VLAN_GROUP_ARRAY_PART_LEN (VLAN_N_VID/VLAN_GROUP_ARRAY_SPLIT_PARTS) enum vlan_protos { VLAN_PROTO_8021Q = 0, VLAN_PROTO_8021AD, VLAN_PROTO_NUM, }; struct vlan_group { unsigned int nr_vlan_devs; struct hlist_node hlist; /* linked list */ struct net_device **vlan_devices_arrays[VLAN_PROTO_NUM] [VLAN_GROUP_ARRAY_SPLIT_PARTS]; }; struct vlan_info { struct net_device *real_dev; /* The ethernet(like) device * the vlan is attached to. */ struct vlan_group grp; struct list_head vid_list; unsigned int nr_vids; bool auto_vid0; struct rcu_head rcu; }; static inline int vlan_proto_idx(__be16 proto) { switch (proto) { case htons(ETH_P_8021Q): return VLAN_PROTO_8021Q; case htons(ETH_P_8021AD): return VLAN_PROTO_8021AD; default: WARN(1, "invalid VLAN protocol: 0x%04x\n", ntohs(proto)); return -EINVAL; } } static inline struct net_device *__vlan_group_get_device(struct vlan_group *vg, unsigned int pidx, u16 vlan_id) { struct net_device **array; array = vg->vlan_devices_arrays[pidx] [vlan_id / VLAN_GROUP_ARRAY_PART_LEN]; /* paired with smp_wmb() in vlan_group_prealloc_vid() */ smp_rmb(); return array ? array[vlan_id % VLAN_GROUP_ARRAY_PART_LEN] : NULL; } static inline struct net_device *vlan_group_get_device(struct vlan_group *vg, __be16 vlan_proto, u16 vlan_id) { int pidx = vlan_proto_idx(vlan_proto); if (pidx < 0) return NULL; return __vlan_group_get_device(vg, pidx, vlan_id); } static inline void vlan_group_set_device(struct vlan_group *vg, __be16 vlan_proto, u16 vlan_id, struct net_device *dev) { int pidx = vlan_proto_idx(vlan_proto); struct net_device **array; if (!vg || pidx < 0) return; array = vg->vlan_devices_arrays[pidx] [vlan_id / VLAN_GROUP_ARRAY_PART_LEN]; array[vlan_id % VLAN_GROUP_ARRAY_PART_LEN] = dev; } /* Must be invoked with rcu_read_lock or with RTNL. 
*/ static inline struct net_device *vlan_find_dev(struct net_device *real_dev, __be16 vlan_proto, u16 vlan_id) { struct vlan_info *vlan_info = rcu_dereference_rtnl(real_dev->vlan_info); if (vlan_info) return vlan_group_get_device(&vlan_info->grp, vlan_proto, vlan_id); return NULL; } static inline netdev_features_t vlan_tnl_features(struct net_device *real_dev) { netdev_features_t ret; ret = real_dev->hw_enc_features & (NETIF_F_CSUM_MASK | NETIF_F_GSO_SOFTWARE | NETIF_F_GSO_ENCAP_ALL); if ((ret & NETIF_F_GSO_ENCAP_ALL) && (ret & NETIF_F_CSUM_MASK)) return (ret & ~NETIF_F_CSUM_MASK) | NETIF_F_HW_CSUM; return 0; } #define vlan_group_for_each_dev(grp, i, dev) \ for ((i) = 0; i < VLAN_PROTO_NUM * VLAN_N_VID; i++) \ if (((dev) = __vlan_group_get_device((grp), (i) / VLAN_N_VID, \ (i) % VLAN_N_VID))) int vlan_filter_push_vids(struct vlan_info *vlan_info, __be16 proto); void vlan_filter_drop_vids(struct vlan_info *vlan_info, __be16 proto); /* found in vlan_dev.c */ void vlan_dev_set_ingress_priority(const struct net_device *dev, u32 skb_prio, u16 vlan_prio); int vlan_dev_set_egress_priority(const struct net_device *dev, u32 skb_prio, u16 vlan_prio); void vlan_dev_free_egress_priority(const struct net_device *dev); int vlan_dev_change_flags(const struct net_device *dev, u32 flag, u32 mask); void vlan_dev_get_realdev_name(const struct net_device *dev, char *result, size_t size); int vlan_check_real_dev(struct net_device *real_dev, __be16 protocol, u16 vlan_id, struct netlink_ext_ack *extack); void vlan_setup(struct net_device *dev); int register_vlan_dev(struct net_device *dev, struct netlink_ext_ack *extack); void unregister_vlan_dev(struct net_device *dev, struct list_head *head); bool vlan_dev_inherit_address(struct net_device *dev, struct net_device *real_dev); static inline u32 vlan_get_ingress_priority(struct net_device *dev, u16 vlan_tci) { struct vlan_dev_priv *vip = vlan_dev_priv(dev); return vip->ingress_priority_map[(vlan_tci >> VLAN_PRIO_SHIFT) & 0x7]; } #ifdef CONFIG_VLAN_8021Q_GVRP int vlan_gvrp_request_join(const struct net_device *dev); void vlan_gvrp_request_leave(const struct net_device *dev); int vlan_gvrp_init_applicant(struct net_device *dev); void vlan_gvrp_uninit_applicant(struct net_device *dev); int vlan_gvrp_init(void); void vlan_gvrp_uninit(void); #else static inline int vlan_gvrp_request_join(const struct net_device *dev) { return 0; } static inline void vlan_gvrp_request_leave(const struct net_device *dev) {} static inline int vlan_gvrp_init_applicant(struct net_device *dev) { return 0; } static inline void vlan_gvrp_uninit_applicant(struct net_device *dev) {} static inline int vlan_gvrp_init(void) { return 0; } static inline void vlan_gvrp_uninit(void) {} #endif #ifdef CONFIG_VLAN_8021Q_MVRP int vlan_mvrp_request_join(const struct net_device *dev); void vlan_mvrp_request_leave(const struct net_device *dev); int vlan_mvrp_init_applicant(struct net_device *dev); void vlan_mvrp_uninit_applicant(struct net_device *dev); int vlan_mvrp_init(void); void vlan_mvrp_uninit(void); #else static inline int vlan_mvrp_request_join(const struct net_device *dev) { return 0; } static inline void vlan_mvrp_request_leave(const struct net_device *dev) {} static inline int vlan_mvrp_init_applicant(struct net_device *dev) { return 0; } static inline void vlan_mvrp_uninit_applicant(struct net_device *dev) {} static inline int vlan_mvrp_init(void) { return 0; } static inline void vlan_mvrp_uninit(void) {} #endif extern const char vlan_fullname[]; extern const char vlan_version[]; int 
vlan_netlink_init(void); void vlan_netlink_fini(void); extern struct rtnl_link_ops vlan_link_ops; extern unsigned int vlan_net_id; struct proc_dir_entry; struct vlan_net { /* /proc/net/vlan */ struct proc_dir_entry *proc_vlan_dir; /* /proc/net/vlan/config */ struct proc_dir_entry *proc_vlan_conf; /* Determines interface naming scheme. */ unsigned short name_type; }; #endif /* !(__BEN_VLAN_802_1Q_INC__) */
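/*
 * Usage sketch for the lookup helpers above (illustrative caller, not part
 * of this header). As noted above, vlan_find_dev() must run under
 * rcu_read_lock() or with RTNL held:
 *
 *	rcu_read_lock();
 *	vlan_dev = vlan_find_dev(real_dev, htons(ETH_P_8021Q), vid);
 *	if (vlan_dev)
 *		handle_vlan_dev(vlan_dev);	- hypothetical consumer
 *	rcu_read_unlock();
 */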
/* * net/tipc/name_table.c: TIPC name table code * * Copyright (c) 2000-2006, 2014-2018, Ericsson AB * Copyright (c) 2004-2008, 2010-2014, Wind River Systems * Copyright (c) 2020-2021, Red Hat Inc * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the names of the copyright holders nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * Alternatively, this software may be distributed under the terms of the * GNU General Public License ("GPL") version 2 as published by the Free * Software Foundation. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ #include <net/sock.h> #include <linux/list_sort.h> #include <linux/rbtree_augmented.h> #include "core.h" #include "netlink.h" #include "name_table.h" #include "name_distr.h" #include "subscr.h" #include "bcast.h" #include "addr.h" #include "node.h" #include "group.h" /** * struct service_range - container for all bindings of a service range * @lower: service range lower bound * @upper: service range upper bound * @tree_node: member of service range RB tree * @max: largest 'upper' in this node subtree * @local_publ: list of identical publications made from this node * Used by closest_first lookup and multicast lookup algorithm * @all_publ: all publications identical to this one, whatever node and scope * Used by round-robin lookup algorithm */ struct service_range { u32 lower; u32 upper; struct rb_node tree_node; u32 max; struct list_head local_publ; struct list_head all_publ; }; /** * struct tipc_service - container for all published instances of a service type * @type: 32 bit 'type' value for service * @publ_cnt: increasing counter for publications in this service * @ranges: rb tree containing all service ranges for this service * @service_list: links to adjacent name ranges in hash chain * @subscriptions: list of subscriptions for this service type * @lock: spinlock controlling access to pertaining service ranges/publications * @rcu: RCU callback head used for deferred freeing */ struct tipc_service { u32 type; u32 publ_cnt; struct rb_root ranges; struct hlist_node service_list; struct list_head subscriptions; spinlock_t lock; /* Covers service range list */ struct rcu_head rcu; }; #define service_range_upper(sr) ((sr)->upper) RB_DECLARE_CALLBACKS_MAX(static, sr_callbacks, struct service_range, tree_node, u32, max, service_range_upper) #define service_range_entry(rbtree_node) \ (container_of(rbtree_node, struct service_range, tree_node)) #define service_range_overlap(sr, start, end) \ ((sr)->lower <= (end) && (sr)->upper >= (start)) /** * service_range_foreach_match - iterate over tipc service rbtree for each * range match * @sr: the service range pointer as a loop cursor * @sc: the pointer to tipc service which holds the service range rbtree * @start: beginning of the search range (end >= start) for matching * @end: end of the search range (end >= start) for matching */ #define service_range_foreach_match(sr, sc, start, end) \ for (sr = service_range_match_first((sc)->ranges.rb_node, \ start, \ end); \ sr; \ sr = service_range_match_next(&(sr)->tree_node, \ start, \ end)) /** * service_range_match_first - find first service range matching a range * @n: the root node of service range rbtree for searching * @start: beginning of the search range (end >= start) for matching * @end: end of the search range (end >= start) for matching * * Return: the leftmost service range node in the rbtree that overlaps the * specific range if any. Otherwise, returns NULL. 
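 *
 * Together with service_range_match_next() this backs the
 * service_range_foreach_match() iterator used throughout this file; a
 * caller sketch (normally with sc->lock held, identifiers illustrative):
 *
 *	service_range_foreach_match(sr, sc, lower, upper) {
 *		... visit each range overlapping [lower, upper] ...
 *	}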
*/ static struct service_range *service_range_match_first(struct rb_node *n, u32 start, u32 end) { struct service_range *sr; struct rb_node *l, *r; /* Non overlaps in tree at all? */ if (!n || service_range_entry(n)->max < start) return NULL; while (n) { l = n->rb_left; if (l && service_range_entry(l)->max >= start) { /* A leftmost overlap range node must be one in the left * subtree. If not, it has lower > end, then nodes on * the right side cannot satisfy the condition either. */ n = l; continue; } /* No one in the left subtree can match, return if this node is * an overlap i.e. leftmost. */ sr = service_range_entry(n); if (service_range_overlap(sr, start, end)) return sr; /* Ok, try to lookup on the right side */ r = n->rb_right; if (sr->lower <= end && r && service_range_entry(r)->max >= start) { n = r; continue; } break; } return NULL; } /** * service_range_match_next - find next service range matching a range * @n: a node in service range rbtree from which the searching starts * @start: beginning of the search range (end >= start) for matching * @end: end of the search range (end >= start) for matching * * Return: the next service range node to the given node in the rbtree that * overlaps the specific range if any. Otherwise, returns NULL. */ static struct service_range *service_range_match_next(struct rb_node *n, u32 start, u32 end) { struct service_range *sr; struct rb_node *p, *r; while (n) { r = n->rb_right; if (r && service_range_entry(r)->max >= start) /* A next overlap range node must be one in the right * subtree. If not, it has lower > end, then any next * successor (- an ancestor) of this node cannot * satisfy the condition either. */ return service_range_match_first(r, start, end); /* No one in the right subtree can match, go up to find an * ancestor of this node which is parent of a left-hand child. */ while ((p = rb_parent(n)) && n == p->rb_right) n = p; if (!p) break; /* Return if this ancestor is an overlap */ sr = service_range_entry(p); if (service_range_overlap(sr, start, end)) return sr; /* Ok, try to lookup more from this ancestor */ if (sr->lower <= end) { n = p; continue; } break; } return NULL; } static int hash(int x) { return x & (TIPC_NAMETBL_SIZE - 1); } /** * tipc_publ_create - create a publication structure * @ua: the service range the user is binding to * @sk: the address of the socket that is bound * @key: publication key */ static struct publication *tipc_publ_create(struct tipc_uaddr *ua, struct tipc_socket_addr *sk, u32 key) { struct publication *p = kzalloc(sizeof(*p), GFP_ATOMIC); if (!p) return NULL; p->sr = ua->sr; p->sk = *sk; p->scope = ua->scope; p->key = key; INIT_LIST_HEAD(&p->binding_sock); INIT_LIST_HEAD(&p->binding_node); INIT_LIST_HEAD(&p->local_publ); INIT_LIST_HEAD(&p->all_publ); INIT_LIST_HEAD(&p->list); return p; } /** * tipc_service_create - create a service structure for the specified 'type' * @net: network namespace * @ua: address representing the service to be bound * * Allocates a single range structure and sets it to all 0's. 
*/ static struct tipc_service *tipc_service_create(struct net *net, struct tipc_uaddr *ua) { struct name_table *nt = tipc_name_table(net); struct tipc_service *service; struct hlist_head *hd; service = kzalloc(sizeof(*service), GFP_ATOMIC); if (!service) { pr_warn("Service creation failed, no memory\n"); return NULL; } spin_lock_init(&service->lock); service->type = ua->sr.type; service->ranges = RB_ROOT; INIT_HLIST_NODE(&service->service_list); INIT_LIST_HEAD(&service->subscriptions); hd = &nt->services[hash(ua->sr.type)]; hlist_add_head_rcu(&service->service_list, hd); return service; } /* tipc_service_find_range - find service range matching publication parameters */ static struct service_range *tipc_service_find_range(struct tipc_service *sc, struct tipc_uaddr *ua) { struct service_range *sr; service_range_foreach_match(sr, sc, ua->sr.lower, ua->sr.upper) { /* Look for exact match */ if (sr->lower == ua->sr.lower && sr->upper == ua->sr.upper) return sr; } return NULL; } static struct service_range *tipc_service_create_range(struct tipc_service *sc, struct publication *p) { struct rb_node **n, *parent = NULL; struct service_range *sr; u32 lower = p->sr.lower; u32 upper = p->sr.upper; n = &sc->ranges.rb_node; while (*n) { parent = *n; sr = service_range_entry(parent); if (lower == sr->lower && upper == sr->upper) return sr; if (sr->max < upper) sr->max = upper; if (lower <= sr->lower) n = &parent->rb_left; else n = &parent->rb_right; } sr = kzalloc(sizeof(*sr), GFP_ATOMIC); if (!sr) return NULL; sr->lower = lower; sr->upper = upper; sr->max = upper; INIT_LIST_HEAD(&sr->local_publ); INIT_LIST_HEAD(&sr->all_publ); rb_link_node(&sr->tree_node, parent, n); rb_insert_augmented(&sr->tree_node, &sc->ranges, &sr_callbacks); return sr; } static bool tipc_service_insert_publ(struct net *net, struct tipc_service *sc, struct publication *p) { struct tipc_subscription *sub, *tmp; struct service_range *sr; struct publication *_p; u32 node = p->sk.node; bool first = false; bool res = false; u32 key = p->key; spin_lock_bh(&sc->lock); sr = tipc_service_create_range(sc, p); if (!sr) goto exit; first = list_empty(&sr->all_publ); /* Return if the publication already exists */ list_for_each_entry(_p, &sr->all_publ, all_publ) { if (_p->key == key && (!_p->sk.node || _p->sk.node == node)) { pr_debug("Failed to bind duplicate %u,%u,%u/%u:%u/%u\n", p->sr.type, p->sr.lower, p->sr.upper, node, p->sk.ref, key); goto exit; } } if (in_own_node(net, p->sk.node)) list_add(&p->local_publ, &sr->local_publ); list_add(&p->all_publ, &sr->all_publ); p->id = sc->publ_cnt++; /* Any subscriptions waiting for notification? 
*/ list_for_each_entry_safe(sub, tmp, &sc->subscriptions, service_list) { tipc_sub_report_overlap(sub, p, TIPC_PUBLISHED, first); } res = true; exit: if (!res) pr_warn("Failed to bind to %u,%u,%u\n", p->sr.type, p->sr.lower, p->sr.upper); spin_unlock_bh(&sc->lock); return res; } /** * tipc_service_remove_publ - remove a publication from a service * @r: service_range to remove publication from * @sk: address publishing socket * @key: target publication key */ static struct publication *tipc_service_remove_publ(struct service_range *r, struct tipc_socket_addr *sk, u32 key) { struct publication *p; u32 node = sk->node; list_for_each_entry(p, &r->all_publ, all_publ) { if (p->key != key || (node && node != p->sk.node)) continue; list_del(&p->all_publ); list_del(&p->local_publ); return p; } return NULL; } /* * Code reused: time_after32() for the same purpose */ #define publication_after(pa, pb) time_after32((pa)->id, (pb)->id) static int tipc_publ_sort(void *priv, const struct list_head *a, const struct list_head *b) { struct publication *pa, *pb; pa = container_of(a, struct publication, list); pb = container_of(b, struct publication, list); return publication_after(pa, pb); } /** * tipc_service_subscribe - attach a subscription, and optionally * issue the prescribed number of events if there is any service * range overlapping with the requested range * @service: the tipc_service to attach the @sub to * @sub: the subscription to attach */ static void tipc_service_subscribe(struct tipc_service *service, struct tipc_subscription *sub) { struct publication *p, *first, *tmp; struct list_head publ_list; struct service_range *sr; u32 filter, lower, upper; filter = sub->s.filter; lower = sub->s.seq.lower; upper = sub->s.seq.upper; tipc_sub_get(sub); list_add(&sub->service_list, &service->subscriptions); if (filter & TIPC_SUB_NO_STATUS) return; INIT_LIST_HEAD(&publ_list); service_range_foreach_match(sr, service, lower, upper) { first = NULL; list_for_each_entry(p, &sr->all_publ, all_publ) { if (filter & TIPC_SUB_PORTS) list_add_tail(&p->list, &publ_list); else if (!first || publication_after(first, p)) /* Pick this range's *first* publication */ first = p; } if (first) list_add_tail(&first->list, &publ_list); } /* Sort the publications before reporting */ list_sort(NULL, &publ_list, tipc_publ_sort); list_for_each_entry_safe(p, tmp, &publ_list, list) { tipc_sub_report_overlap(sub, p, TIPC_PUBLISHED, true); list_del_init(&p->list); } } static struct tipc_service *tipc_service_find(struct net *net, struct tipc_uaddr *ua) { struct name_table *nt = tipc_name_table(net); struct hlist_head *service_head; struct tipc_service *service; service_head = &nt->services[hash(ua->sr.type)]; hlist_for_each_entry_rcu(service, service_head, service_list) { if (service->type == ua->sr.type) return service; } return NULL; }; struct publication *tipc_nametbl_insert_publ(struct net *net, struct tipc_uaddr *ua, struct tipc_socket_addr *sk, u32 key) { struct tipc_service *sc; struct publication *p; p = tipc_publ_create(ua, sk, key); if (!p) return NULL; sc = tipc_service_find(net, ua); if (!sc) sc = tipc_service_create(net, ua); if (sc && tipc_service_insert_publ(net, sc, p)) return p; kfree(p); return NULL; } struct publication *tipc_nametbl_remove_publ(struct net *net, struct tipc_uaddr *ua, struct tipc_socket_addr *sk, u32 key) { struct tipc_subscription *sub, *tmp; struct publication *p = NULL; struct service_range *sr; struct tipc_service *sc; bool last; sc = tipc_service_find(net, ua); if (!sc) goto exit; 
spin_lock_bh(&sc->lock); sr = tipc_service_find_range(sc, ua); if (!sr) goto unlock; p = tipc_service_remove_publ(sr, sk, key); if (!p) goto unlock; /* Notify any waiting subscriptions */ last = list_empty(&sr->all_publ); list_for_each_entry_safe(sub, tmp, &sc->subscriptions, service_list) { tipc_sub_report_overlap(sub, p, TIPC_WITHDRAWN, last); } /* Remove service range item if this was its last publication */ if (list_empty(&sr->all_publ)) { rb_erase_augmented(&sr->tree_node, &sc->ranges, &sr_callbacks); kfree(sr); } /* Delete service item if no more publications and subscriptions */ if (RB_EMPTY_ROOT(&sc->ranges) && list_empty(&sc->subscriptions)) { hlist_del_init_rcu(&sc->service_list); kfree_rcu(sc, rcu); } unlock: spin_unlock_bh(&sc->lock); exit: if (!p) { pr_err("Failed to remove unknown binding: %u,%u,%u/%u:%u/%u\n", ua->sr.type, ua->sr.lower, ua->sr.upper, sk->node, sk->ref, key); } return p; } /** * tipc_nametbl_lookup_anycast - perform service instance to socket translation * @net: network namespace * @ua: service address to look up * @sk: address to socket we want to find * * On entry, a non-zero 'sk->node' indicates the node where we want lookup to be * performed, which may not be this one. * * On exit: * * - If lookup is deferred to another node, leave 'sk->node' unchanged and * return 'true'. * - If lookup is successful, set the 'sk->node' and 'sk->ref' (== portid) which * represent the bound socket and return 'true'. * - If lookup fails, return 'false' * * Note that for legacy users (node configured with Z.C.N address format) the * 'closest-first' lookup algorithm must be maintained, i.e., if sk.node is 0 * we must look in the local binding list first */ bool tipc_nametbl_lookup_anycast(struct net *net, struct tipc_uaddr *ua, struct tipc_socket_addr *sk) { struct tipc_net *tn = tipc_net(net); bool legacy = tn->legacy_addr_format; u32 self = tipc_own_addr(net); u32 inst = ua->sa.instance; struct service_range *r; struct tipc_service *sc; struct publication *p; struct list_head *l; bool res = false; if (!tipc_in_scope(legacy, sk->node, self)) return true; rcu_read_lock(); sc = tipc_service_find(net, ua); if (unlikely(!sc)) goto exit; spin_lock_bh(&sc->lock); service_range_foreach_match(r, sc, inst, inst) { /* Select lookup algo: local, closest-first or round-robin */ if (sk->node == self) { l = &r->local_publ; if (list_empty(l)) continue; p = list_first_entry(l, struct publication, local_publ); list_move_tail(&p->local_publ, &r->local_publ); } else if (legacy && !sk->node && !list_empty(&r->local_publ)) { l = &r->local_publ; p = list_first_entry(l, struct publication, local_publ); list_move_tail(&p->local_publ, &r->local_publ); } else { l = &r->all_publ; p = list_first_entry(l, struct publication, all_publ); list_move_tail(&p->all_publ, &r->all_publ); } *sk = p->sk; res = true; /* Todo: as for legacy, pick the first matching range only, a * "true" round-robin will be performed as needed. */ break; } spin_unlock_bh(&sc->lock); exit: rcu_read_unlock(); return res; } /* tipc_nametbl_lookup_group(): lookup destinaton(s) in a communication group * Returns a list of one (== group anycast) or more (== group multicast) * destination socket/node pairs matching the given address. * The requester may or may not want to exclude himself from the list. 
*/ bool tipc_nametbl_lookup_group(struct net *net, struct tipc_uaddr *ua, struct list_head *dsts, int *dstcnt, u32 exclude, bool mcast) { u32 self = tipc_own_addr(net); u32 inst = ua->sa.instance; struct service_range *sr; struct tipc_service *sc; struct publication *p; *dstcnt = 0; rcu_read_lock(); sc = tipc_service_find(net, ua); if (unlikely(!sc)) goto exit; spin_lock_bh(&sc->lock); /* Todo: a full search i.e. service_range_foreach_match() instead? */ sr = service_range_match_first(sc->ranges.rb_node, inst, inst); if (!sr) goto no_match; list_for_each_entry(p, &sr->all_publ, all_publ) { if (p->scope != ua->scope) continue; if (p->sk.ref == exclude && p->sk.node == self) continue; tipc_dest_push(dsts, p->sk.node, p->sk.ref); (*dstcnt)++; if (mcast) continue; list_move_tail(&p->all_publ, &sr->all_publ); break; } no_match: spin_unlock_bh(&sc->lock); exit: rcu_read_unlock(); return !list_empty(dsts); } /* tipc_nametbl_lookup_mcast_sockets(): look up node local destinaton sockets * matching the given address * Used on nodes which have received a multicast/broadcast message * Returns a list of local sockets */ void tipc_nametbl_lookup_mcast_sockets(struct net *net, struct tipc_uaddr *ua, struct list_head *dports) { struct service_range *sr; struct tipc_service *sc; struct publication *p; u8 scope = ua->scope; rcu_read_lock(); sc = tipc_service_find(net, ua); if (!sc) goto exit; spin_lock_bh(&sc->lock); service_range_foreach_match(sr, sc, ua->sr.lower, ua->sr.upper) { list_for_each_entry(p, &sr->local_publ, local_publ) { if (scope == p->scope || scope == TIPC_ANY_SCOPE) tipc_dest_push(dports, 0, p->sk.ref); } } spin_unlock_bh(&sc->lock); exit: rcu_read_unlock(); } /* tipc_nametbl_lookup_mcast_nodes(): look up all destination nodes matching * the given address. Used in sending node. 
* Used on nodes which are sending out a multicast/broadcast message * Returns a list of nodes, including own node if applicable */ void tipc_nametbl_lookup_mcast_nodes(struct net *net, struct tipc_uaddr *ua, struct tipc_nlist *nodes) { struct service_range *sr; struct tipc_service *sc; struct publication *p; rcu_read_lock(); sc = tipc_service_find(net, ua); if (!sc) goto exit; spin_lock_bh(&sc->lock); service_range_foreach_match(sr, sc, ua->sr.lower, ua->sr.upper) { list_for_each_entry(p, &sr->all_publ, all_publ) { tipc_nlist_add(nodes, p->sk.node); } } spin_unlock_bh(&sc->lock); exit: rcu_read_unlock(); } /* tipc_nametbl_build_group - build list of communication group members */ void tipc_nametbl_build_group(struct net *net, struct tipc_group *grp, struct tipc_uaddr *ua) { struct service_range *sr; struct tipc_service *sc; struct publication *p; struct rb_node *n; rcu_read_lock(); sc = tipc_service_find(net, ua); if (!sc) goto exit; spin_lock_bh(&sc->lock); for (n = rb_first(&sc->ranges); n; n = rb_next(n)) { sr = container_of(n, struct service_range, tree_node); list_for_each_entry(p, &sr->all_publ, all_publ) { if (p->scope != ua->scope) continue; tipc_group_add_member(grp, p->sk.node, p->sk.ref, p->sr.lower); } } spin_unlock_bh(&sc->lock); exit: rcu_read_unlock(); } /* tipc_nametbl_publish - add service binding to name table */ struct publication *tipc_nametbl_publish(struct net *net, struct tipc_uaddr *ua, struct tipc_socket_addr *sk, u32 key) { struct name_table *nt = tipc_name_table(net); struct tipc_net *tn = tipc_net(net); struct publication *p = NULL; struct sk_buff *skb = NULL; u32 rc_dests; spin_lock_bh(&tn->nametbl_lock); if (nt->local_publ_count >= TIPC_MAX_PUBL) { pr_warn("Bind failed, max limit %u reached\n", TIPC_MAX_PUBL); goto exit; } p = tipc_nametbl_insert_publ(net, ua, sk, key); if (p) { nt->local_publ_count++; skb = tipc_named_publish(net, p); } rc_dests = nt->rc_dests; exit: spin_unlock_bh(&tn->nametbl_lock); if (skb) tipc_node_broadcast(net, skb, rc_dests); return p; } /** * tipc_nametbl_withdraw - withdraw a service binding * @net: network namespace * @ua: service address/range being unbound * @sk: address of the socket being unbound from * @key: target publication key */ void tipc_nametbl_withdraw(struct net *net, struct tipc_uaddr *ua, struct tipc_socket_addr *sk, u32 key) { struct name_table *nt = tipc_name_table(net); struct tipc_net *tn = tipc_net(net); struct sk_buff *skb = NULL; struct publication *p; u32 rc_dests; spin_lock_bh(&tn->nametbl_lock); p = tipc_nametbl_remove_publ(net, ua, sk, key); if (p) { nt->local_publ_count--; skb = tipc_named_withdraw(net, p); list_del_init(&p->binding_sock); kfree_rcu(p, rcu); } rc_dests = nt->rc_dests; spin_unlock_bh(&tn->nametbl_lock); if (skb) tipc_node_broadcast(net, skb, rc_dests); } /** * tipc_nametbl_subscribe - add a subscription object to the name table * @sub: subscription to add */ bool tipc_nametbl_subscribe(struct tipc_subscription *sub) { struct tipc_net *tn = tipc_net(sub->net); u32 type = sub->s.seq.type; struct tipc_service *sc; struct tipc_uaddr ua; bool res = true; tipc_uaddr(&ua, TIPC_SERVICE_RANGE, TIPC_NODE_SCOPE, type, sub->s.seq.lower, sub->s.seq.upper); spin_lock_bh(&tn->nametbl_lock); sc = tipc_service_find(sub->net, &ua); if (!sc) sc = tipc_service_create(sub->net, &ua); if (sc) { spin_lock_bh(&sc->lock); tipc_service_subscribe(sc, sub); spin_unlock_bh(&sc->lock); } else { pr_warn("Failed to subscribe for {%u,%u,%u}\n", type, sub->s.seq.lower, sub->s.seq.upper); res = false; } 
spin_unlock_bh(&tn->nametbl_lock); return res; } /** * tipc_nametbl_unsubscribe - remove a subscription object from name table * @sub: subscription to remove */ void tipc_nametbl_unsubscribe(struct tipc_subscription *sub) { struct tipc_net *tn = tipc_net(sub->net); struct tipc_service *sc; struct tipc_uaddr ua; tipc_uaddr(&ua, TIPC_SERVICE_RANGE, TIPC_NODE_SCOPE, sub->s.seq.type, sub->s.seq.lower, sub->s.seq.upper); spin_lock_bh(&tn->nametbl_lock); sc = tipc_service_find(sub->net, &ua); if (!sc) goto exit; spin_lock_bh(&sc->lock); list_del_init(&sub->service_list); tipc_sub_put(sub); /* Delete service item if no more publications and subscriptions */ if (RB_EMPTY_ROOT(&sc->ranges) && list_empty(&sc->subscriptions)) { hlist_del_init_rcu(&sc->service_list); kfree_rcu(sc, rcu); } spin_unlock_bh(&sc->lock); exit: spin_unlock_bh(&tn->nametbl_lock); } int tipc_nametbl_init(struct net *net) { struct tipc_net *tn = tipc_net(net); struct name_table *nt; int i; nt = kzalloc(sizeof(*nt), GFP_KERNEL); if (!nt) return -ENOMEM; for (i = 0; i < TIPC_NAMETBL_SIZE; i++) INIT_HLIST_HEAD(&nt->services[i]); INIT_LIST_HEAD(&nt->node_scope); INIT_LIST_HEAD(&nt->cluster_scope); rwlock_init(&nt->cluster_scope_lock); tn->nametbl = nt; spin_lock_init(&tn->nametbl_lock); return 0; } /** * tipc_service_delete - purge all publications for a service and delete it * @net: the associated network namespace * @sc: tipc_service to delete */ static void tipc_service_delete(struct net *net, struct tipc_service *sc) { struct service_range *sr, *tmpr; struct publication *p, *tmp; spin_lock_bh(&sc->lock); rbtree_postorder_for_each_entry_safe(sr, tmpr, &sc->ranges, tree_node) { list_for_each_entry_safe(p, tmp, &sr->all_publ, all_publ) { tipc_service_remove_publ(sr, &p->sk, p->key); kfree_rcu(p, rcu); } rb_erase_augmented(&sr->tree_node, &sc->ranges, &sr_callbacks); kfree(sr); } hlist_del_init_rcu(&sc->service_list); spin_unlock_bh(&sc->lock); kfree_rcu(sc, rcu); } void tipc_nametbl_stop(struct net *net) { struct name_table *nt = tipc_name_table(net); struct tipc_net *tn = tipc_net(net); struct hlist_head *service_head; struct tipc_service *service; u32 i; /* Verify name table is empty and purge any lingering * publications, then release the name table */ spin_lock_bh(&tn->nametbl_lock); for (i = 0; i < TIPC_NAMETBL_SIZE; i++) { if (hlist_empty(&nt->services[i])) continue; service_head = &nt->services[i]; hlist_for_each_entry_rcu(service, service_head, service_list) { tipc_service_delete(net, service); } } spin_unlock_bh(&tn->nametbl_lock); /* TODO: clear tn->nametbl, implement proper RCU rules ? 
*/ kfree_rcu(nt, rcu); } static int __tipc_nl_add_nametable_publ(struct tipc_nl_msg *msg, struct tipc_service *service, struct service_range *sr, u32 *last_key) { struct publication *p; struct nlattr *attrs; struct nlattr *b; void *hdr; if (*last_key) { list_for_each_entry(p, &sr->all_publ, all_publ) if (p->key == *last_key) break; if (list_entry_is_head(p, &sr->all_publ, all_publ)) return -EPIPE; } else { p = list_first_entry(&sr->all_publ, struct publication, all_publ); } list_for_each_entry_from(p, &sr->all_publ, all_publ) { *last_key = p->key; hdr = genlmsg_put(msg->skb, msg->portid, msg->seq, &tipc_genl_family, NLM_F_MULTI, TIPC_NL_NAME_TABLE_GET); if (!hdr) return -EMSGSIZE; attrs = nla_nest_start_noflag(msg->skb, TIPC_NLA_NAME_TABLE); if (!attrs) goto msg_full; b = nla_nest_start_noflag(msg->skb, TIPC_NLA_NAME_TABLE_PUBL); if (!b) goto attr_msg_full; if (nla_put_u32(msg->skb, TIPC_NLA_PUBL_TYPE, service->type)) goto publ_msg_full; if (nla_put_u32(msg->skb, TIPC_NLA_PUBL_LOWER, sr->lower)) goto publ_msg_full; if (nla_put_u32(msg->skb, TIPC_NLA_PUBL_UPPER, sr->upper)) goto publ_msg_full; if (nla_put_u32(msg->skb, TIPC_NLA_PUBL_SCOPE, p->scope)) goto publ_msg_full; if (nla_put_u32(msg->skb, TIPC_NLA_PUBL_NODE, p->sk.node)) goto publ_msg_full; if (nla_put_u32(msg->skb, TIPC_NLA_PUBL_REF, p->sk.ref)) goto publ_msg_full; if (nla_put_u32(msg->skb, TIPC_NLA_PUBL_KEY, p->key)) goto publ_msg_full; nla_nest_end(msg->skb, b); nla_nest_end(msg->skb, attrs); genlmsg_end(msg->skb, hdr); } *last_key = 0; return 0; publ_msg_full: nla_nest_cancel(msg->skb, b); attr_msg_full: nla_nest_cancel(msg->skb, attrs); msg_full: genlmsg_cancel(msg->skb, hdr); return -EMSGSIZE; } static int __tipc_nl_service_range_list(struct tipc_nl_msg *msg, struct tipc_service *sc, u32 *last_lower, u32 *last_key) { struct service_range *sr; struct rb_node *n; int err; for (n = rb_first(&sc->ranges); n; n = rb_next(n)) { sr = container_of(n, struct service_range, tree_node); if (sr->lower < *last_lower) continue; err = __tipc_nl_add_nametable_publ(msg, sc, sr, last_key); if (err) { *last_lower = sr->lower; return err; } } *last_lower = 0; return 0; } static int tipc_nl_service_list(struct net *net, struct tipc_nl_msg *msg, u32 *last_type, u32 *last_lower, u32 *last_key) { struct tipc_net *tn = tipc_net(net); struct tipc_service *service = NULL; struct hlist_head *head; struct tipc_uaddr ua; int err; int i; if (*last_type) i = hash(*last_type); else i = 0; for (; i < TIPC_NAMETBL_SIZE; i++) { head = &tn->nametbl->services[i]; if (*last_type || (!i && *last_key && (*last_lower == *last_key))) { tipc_uaddr(&ua, TIPC_SERVICE_RANGE, TIPC_NODE_SCOPE, *last_type, *last_lower, *last_lower); service = tipc_service_find(net, &ua); if (!service) return -EPIPE; } else { hlist_for_each_entry_rcu(service, head, service_list) break; if (!service) continue; } hlist_for_each_entry_from_rcu(service, service_list) { spin_lock_bh(&service->lock); err = __tipc_nl_service_range_list(msg, service, last_lower, last_key); if (err) { *last_type = service->type; spin_unlock_bh(&service->lock); return err; } spin_unlock_bh(&service->lock); } *last_type = 0; } return 0; } int tipc_nl_name_table_dump(struct sk_buff *skb, struct netlink_callback *cb) { struct net *net = sock_net(skb->sk); u32 last_type = cb->args[0]; u32 last_lower = cb->args[1]; u32 last_key = cb->args[2]; int done = cb->args[3]; struct tipc_nl_msg msg; int err; if (done) return 0; msg.skb = skb; msg.portid = NETLINK_CB(cb->skb).portid; msg.seq = cb->nlh->nlmsg_seq; rcu_read_lock(); err = 
tipc_nl_service_list(net, &msg, &last_type, &last_lower, &last_key); if (!err) { done = 1; } else if (err != -EMSGSIZE) { /* We never set seq or call nl_dump_check_consistent() this * means that setting prev_seq here will cause the consistence * check to fail in the netlink callback handler. Resulting in * the NLMSG_DONE message having the NLM_F_DUMP_INTR flag set if * we got an error. */ cb->prev_seq = 1; } rcu_read_unlock(); cb->args[0] = last_type; cb->args[1] = last_lower; cb->args[2] = last_key; cb->args[3] = done; return skb->len; } struct tipc_dest *tipc_dest_find(struct list_head *l, u32 node, u32 port) { struct tipc_dest *dst; list_for_each_entry(dst, l, list) { if (dst->node == node && dst->port == port) return dst; } return NULL; } bool tipc_dest_push(struct list_head *l, u32 node, u32 port) { struct tipc_dest *dst; if (tipc_dest_find(l, node, port)) return false; dst = kmalloc(sizeof(*dst), GFP_ATOMIC); if (unlikely(!dst)) return false; dst->node = node; dst->port = port; list_add(&dst->list, l); return true; } bool tipc_dest_pop(struct list_head *l, u32 *node, u32 *port) { struct tipc_dest *dst; if (list_empty(l)) return false; dst = list_first_entry(l, typeof(*dst), list); if (port) *port = dst->port; if (node) *node = dst->node; list_del(&dst->list); kfree(dst); return true; } bool tipc_dest_del(struct list_head *l, u32 node, u32 port) { struct tipc_dest *dst; dst = tipc_dest_find(l, node, port); if (!dst) return false; list_del(&dst->list); kfree(dst); return true; } void tipc_dest_list_purge(struct list_head *l) { struct tipc_dest *dst, *tmp; list_for_each_entry_safe(dst, tmp, l, list) { list_del(&dst->list); kfree(dst); } }
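All of the lookup paths above reduce to the overlap test encoded in service_range_overlap(): a published range [lower, upper] matches a query [start, end] whenever lower <= end and upper >= start; the rb-tree augmented with 'max' only exists so that whole subtrees whose largest upper bound falls below 'start' can be skipped. A minimal userspace sketch of the same predicate over a plain array (illustrative names, not kernel code):

#include <stdio.h>

/* Illustrative stand-in for struct service_range: just the bounds. */
struct range {
        unsigned int lower;
        unsigned int upper;
};

/* Same predicate as service_range_overlap() in the file above. */
static int range_overlap(const struct range *r,
                         unsigned int start, unsigned int end)
{
        return r->lower <= end && r->upper >= start;
}

int main(void)
{
        /* A few published ranges, e.g. three bindings of one service type. */
        const struct range tbl[] = {
                { 0,   99  },
                { 50,  60  },
                { 200, 300 },
        };
        unsigned int start = 55, end = 55;      /* single-instance query */
        size_t i;

        /* Linear scan for clarity; the kernel walks the rb-tree augmented
         * with 'max' so that subtrees with max < start are skipped.
         */
        for (i = 0; i < sizeof(tbl) / sizeof(tbl[0]); i++)
                if (range_overlap(&tbl[i], start, end))
                        printf("match: [%u,%u]\n", tbl[i].lower, tbl[i].upper);
        return 0;
}

A query with start == end corresponds to the anycast and group lookups above, while the multicast lookups pass the full lower/upper bounds of the request.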
// SPDX-License-Identifier: GPL-2.0-or-later
/******************************************************************************
 * vlanproc.c	VLAN Module. /proc filesystem interface.
 *
 *		This module is completely hardware-independent and provides
 *		access to the router using Linux /proc filesystem.
 *
 * Author:	Ben Greear, <greearb@candelatech.com> copied from wanproc.c
 *		by: Gene Kozin	<genek@compuserve.com>
 *
 * Copyright:	(c) 1998 Ben Greear
 *
 * ============================================================================
 * Jan 20, 1998        Ben Greear     Initial Version
 *****************************************************************************/

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/module.h>
#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/fs.h>
#include <linux/netdevice.h>
#include <linux/if_vlan.h>
#include <net/net_namespace.h>
#include <net/netns/generic.h>
#include "vlanproc.h"
#include "vlan.h"

/****** Function Prototypes *************************************************/

/* Methods for preparing data for reading proc entries */
static int vlan_seq_show(struct seq_file *seq, void *v);
static void *vlan_seq_start(struct seq_file *seq, loff_t *pos);
static void *vlan_seq_next(struct seq_file *seq, void *v, loff_t *pos);
static void vlan_seq_stop(struct seq_file *seq, void *);
static int vlandev_seq_show(struct seq_file *seq, void *v);

/*
 *	Global Data
 */

/*
 *	Names of the proc directory entries
 */
static const char name_root[]	 = "vlan";
static const char name_conf[]	 = "config";

/*
 *	Structures for interfacing with the /proc filesystem.
 *	VLAN creates its own directory /proc/net/vlan with the following
 *	entries:
 *	config		device status/configuration
 *	<device>	entry for each device
 */

/*
 *	Generic /proc/net/vlan/<file> file and inode operations
 */
static const struct seq_operations vlan_seq_ops = {
	.start = vlan_seq_start,
	.next = vlan_seq_next,
	.stop = vlan_seq_stop,
	.show = vlan_seq_show,
};

/*
 *	Proc filesystem directory entries.
*/ /* Strings */ static const char *const vlan_name_type_str[VLAN_NAME_TYPE_HIGHEST] = { [VLAN_NAME_TYPE_RAW_PLUS_VID] = "VLAN_NAME_TYPE_RAW_PLUS_VID", [VLAN_NAME_TYPE_PLUS_VID_NO_PAD] = "VLAN_NAME_TYPE_PLUS_VID_NO_PAD", [VLAN_NAME_TYPE_RAW_PLUS_VID_NO_PAD] = "VLAN_NAME_TYPE_RAW_PLUS_VID_NO_PAD", [VLAN_NAME_TYPE_PLUS_VID] = "VLAN_NAME_TYPE_PLUS_VID", }; /* * Interface functions */ /* * Clean up /proc/net/vlan entries */ void vlan_proc_cleanup(struct net *net) { struct vlan_net *vn = net_generic(net, vlan_net_id); if (vn->proc_vlan_conf) remove_proc_entry(name_conf, vn->proc_vlan_dir); if (vn->proc_vlan_dir) remove_proc_entry(name_root, net->proc_net); /* Dynamically added entries should be cleaned up as their vlan_device * is removed, so we should not have to take care of it here... */ } /* * Create /proc/net/vlan entries */ int __net_init vlan_proc_init(struct net *net) { struct vlan_net *vn = net_generic(net, vlan_net_id); vn->proc_vlan_dir = proc_net_mkdir(net, name_root, net->proc_net); if (!vn->proc_vlan_dir) goto err; vn->proc_vlan_conf = proc_create_net(name_conf, S_IFREG | 0600, vn->proc_vlan_dir, &vlan_seq_ops, sizeof(struct seq_net_private)); if (!vn->proc_vlan_conf) goto err; return 0; err: pr_err("can't create entry in proc filesystem!\n"); vlan_proc_cleanup(net); return -ENOBUFS; } /* * Add directory entry for VLAN device. */ int vlan_proc_add_dev(struct net_device *vlandev) { struct vlan_dev_priv *vlan = vlan_dev_priv(vlandev); struct vlan_net *vn = net_generic(dev_net(vlandev), vlan_net_id); if (!strcmp(vlandev->name, name_conf)) return -EINVAL; vlan->dent = proc_create_single_data(vlandev->name, S_IFREG | 0600, vn->proc_vlan_dir, vlandev_seq_show, vlandev); if (!vlan->dent) return -ENOBUFS; return 0; } /* * Delete directory entry for VLAN device. */ void vlan_proc_rem_dev(struct net_device *vlandev) { /** NOTE: This will consume the memory pointed to by dent, it seems. */ proc_remove(vlan_dev_priv(vlandev)->dent); vlan_dev_priv(vlandev)->dent = NULL; } /****** Proc filesystem entry points ****************************************/ /* * The following few functions build the content of /proc/net/vlan/config */ static void *vlan_seq_from_index(struct seq_file *seq, loff_t *pos) { unsigned long ifindex = *pos; struct net_device *dev; for_each_netdev_dump(seq_file_net(seq), dev, ifindex) { if (!is_vlan_dev(dev)) continue; *pos = dev->ifindex; return dev; } return NULL; } static void *vlan_seq_start(struct seq_file *seq, loff_t *pos) __acquires(rcu) { rcu_read_lock(); if (*pos == 0) return SEQ_START_TOKEN; return vlan_seq_from_index(seq, pos); } static void *vlan_seq_next(struct seq_file *seq, void *v, loff_t *pos) { ++*pos; return vlan_seq_from_index(seq, pos); } static void vlan_seq_stop(struct seq_file *seq, void *v) __releases(rcu) { rcu_read_unlock(); } static int vlan_seq_show(struct seq_file *seq, void *v) { struct net *net = seq_file_net(seq); struct vlan_net *vn = net_generic(net, vlan_net_id); if (v == SEQ_START_TOKEN) { const char *nmtype = NULL; seq_puts(seq, "VLAN Dev name | VLAN ID\n"); if (vn->name_type < ARRAY_SIZE(vlan_name_type_str)) nmtype = vlan_name_type_str[vn->name_type]; seq_printf(seq, "Name-Type: %s\n", nmtype ? 
nmtype : "UNKNOWN"); } else { const struct net_device *vlandev = v; const struct vlan_dev_priv *vlan = vlan_dev_priv(vlandev); seq_printf(seq, "%-15s| %d | %s\n", vlandev->name, vlan->vlan_id, vlan->real_dev->name); } return 0; } static int vlandev_seq_show(struct seq_file *seq, void *offset) { struct net_device *vlandev = (struct net_device *) seq->private; const struct vlan_dev_priv *vlan = vlan_dev_priv(vlandev); struct rtnl_link_stats64 temp; const struct rtnl_link_stats64 *stats; static const char fmt64[] = "%30s %12llu\n"; int i; if (!is_vlan_dev(vlandev)) return 0; stats = dev_get_stats(vlandev, &temp); seq_printf(seq, "%s VID: %d REORDER_HDR: %i dev->priv_flags: %x\n", vlandev->name, vlan->vlan_id, (int)(vlan->flags & 1), (u32)vlandev->priv_flags); seq_printf(seq, fmt64, "total frames received", stats->rx_packets); seq_printf(seq, fmt64, "total bytes received", stats->rx_bytes); seq_printf(seq, fmt64, "Broadcast/Multicast Rcvd", stats->multicast); seq_puts(seq, "\n"); seq_printf(seq, fmt64, "total frames transmitted", stats->tx_packets); seq_printf(seq, fmt64, "total bytes transmitted", stats->tx_bytes); seq_printf(seq, "Device: %s", vlan->real_dev->name); /* now show all PRIORITY mappings relating to this VLAN */ seq_printf(seq, "\nINGRESS priority mappings: " "0:%u 1:%u 2:%u 3:%u 4:%u 5:%u 6:%u 7:%u\n", vlan->ingress_priority_map[0], vlan->ingress_priority_map[1], vlan->ingress_priority_map[2], vlan->ingress_priority_map[3], vlan->ingress_priority_map[4], vlan->ingress_priority_map[5], vlan->ingress_priority_map[6], vlan->ingress_priority_map[7]); seq_printf(seq, " EGRESS priority mappings: "); for (i = 0; i < 16; i++) { const struct vlan_priority_tci_mapping *mp = vlan->egress_priority_map[i]; while (mp) { seq_printf(seq, "%u:%d ", mp->priority, ((mp->vlan_qos >> 13) & 0x7)); mp = mp->next; } } seq_puts(seq, "\n"); return 0; }
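Because the handlers above emit plain text through the seq_file interface, the VLAN configuration summary can be consumed with ordinary file I/O. A small reader for the config entry created by vlan_proc_init() (a userspace sketch, assuming only the /proc/net/vlan/config path shown above):

#include <stdio.h>

int main(void)
{
        /* Created by vlan_proc_init(); after the "VLAN Dev name | VLAN ID"
         * and "Name-Type: ..." header lines, vlan_seq_show() prints one
         * "<name>| <vid> | <real_dev>" line per VLAN device.
         */
        FILE *f = fopen("/proc/net/vlan/config", "r");
        char line[256];

        if (!f) {
                perror("fopen /proc/net/vlan/config");
                return 1;
        }
        while (fgets(line, sizeof(line), f))
                fputs(line, stdout);
        fclose(f);
        return 0;
}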
/* SPDX-License-Identifier: GPL-2.0 */
#undef TRACE_SYSTEM
#define TRACE_SYSTEM vmscan

#if !defined(_TRACE_VMSCAN_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_VMSCAN_H

#include <linux/types.h>
#include <linux/tracepoint.h>
#include <linux/mm.h>
#include <linux/memcontrol.h>
#include <trace/events/mmflags.h>

#define RECLAIM_WB_ANON		0x0001u
#define RECLAIM_WB_FILE		0x0002u
#define RECLAIM_WB_MIXED	0x0010u
#define RECLAIM_WB_SYNC		0x0004u /* Unused, all reclaim async */
#define RECLAIM_WB_ASYNC	0x0008u
#define RECLAIM_WB_LRU		(RECLAIM_WB_ANON|RECLAIM_WB_FILE)

#define show_reclaim_flags(flags)				\
	(flags) ? __print_flags(flags, "|",			\
		{RECLAIM_WB_ANON,	"RECLAIM_WB_ANON"},	\
		{RECLAIM_WB_FILE,	"RECLAIM_WB_FILE"},	\
		{RECLAIM_WB_MIXED,	"RECLAIM_WB_MIXED"},	\
		{RECLAIM_WB_SYNC,	"RECLAIM_WB_SYNC"},	\
		{RECLAIM_WB_ASYNC,	"RECLAIM_WB_ASYNC"}	\
		) : "RECLAIM_WB_NONE"

#define _VMSCAN_THROTTLE_WRITEBACK	(1 << VMSCAN_THROTTLE_WRITEBACK)
#define _VMSCAN_THROTTLE_ISOLATED	(1 << VMSCAN_THROTTLE_ISOLATED)
#define _VMSCAN_THROTTLE_NOPROGRESS	(1 << VMSCAN_THROTTLE_NOPROGRESS)
#define _VMSCAN_THROTTLE_CONGESTED	(1 << VMSCAN_THROTTLE_CONGESTED)

#define show_throttle_flags(flags)				\
	(flags) ?
__print_flags(flags, "|", \ {_VMSCAN_THROTTLE_WRITEBACK, "VMSCAN_THROTTLE_WRITEBACK"}, \ {_VMSCAN_THROTTLE_ISOLATED, "VMSCAN_THROTTLE_ISOLATED"}, \ {_VMSCAN_THROTTLE_NOPROGRESS, "VMSCAN_THROTTLE_NOPROGRESS"}, \ {_VMSCAN_THROTTLE_CONGESTED, "VMSCAN_THROTTLE_CONGESTED"} \ ) : "VMSCAN_THROTTLE_NONE" #define trace_reclaim_flags(file) ( \ (file ? RECLAIM_WB_FILE : RECLAIM_WB_ANON) | \ (RECLAIM_WB_ASYNC) \ ) TRACE_EVENT(mm_vmscan_kswapd_sleep, TP_PROTO(int nid), TP_ARGS(nid), TP_STRUCT__entry( __field( int, nid ) ), TP_fast_assign( __entry->nid = nid; ), TP_printk("nid=%d", __entry->nid) ); TRACE_EVENT(mm_vmscan_kswapd_wake, TP_PROTO(int nid, int zid, int order), TP_ARGS(nid, zid, order), TP_STRUCT__entry( __field( int, nid ) __field( int, zid ) __field( int, order ) ), TP_fast_assign( __entry->nid = nid; __entry->zid = zid; __entry->order = order; ), TP_printk("nid=%d order=%d", __entry->nid, __entry->order) ); TRACE_EVENT(mm_vmscan_wakeup_kswapd, TP_PROTO(int nid, int zid, int order, gfp_t gfp_flags), TP_ARGS(nid, zid, order, gfp_flags), TP_STRUCT__entry( __field( int, nid ) __field( int, zid ) __field( int, order ) __field( unsigned long, gfp_flags ) ), TP_fast_assign( __entry->nid = nid; __entry->zid = zid; __entry->order = order; __entry->gfp_flags = (__force unsigned long)gfp_flags; ), TP_printk("nid=%d order=%d gfp_flags=%s", __entry->nid, __entry->order, show_gfp_flags(__entry->gfp_flags)) ); DECLARE_EVENT_CLASS(mm_vmscan_direct_reclaim_begin_template, TP_PROTO(int order, gfp_t gfp_flags), TP_ARGS(order, gfp_flags), TP_STRUCT__entry( __field( int, order ) __field( unsigned long, gfp_flags ) ), TP_fast_assign( __entry->order = order; __entry->gfp_flags = (__force unsigned long)gfp_flags; ), TP_printk("order=%d gfp_flags=%s", __entry->order, show_gfp_flags(__entry->gfp_flags)) ); DEFINE_EVENT(mm_vmscan_direct_reclaim_begin_template, mm_vmscan_direct_reclaim_begin, TP_PROTO(int order, gfp_t gfp_flags), TP_ARGS(order, gfp_flags) ); #ifdef CONFIG_MEMCG DEFINE_EVENT(mm_vmscan_direct_reclaim_begin_template, mm_vmscan_memcg_reclaim_begin, TP_PROTO(int order, gfp_t gfp_flags), TP_ARGS(order, gfp_flags) ); DEFINE_EVENT(mm_vmscan_direct_reclaim_begin_template, mm_vmscan_memcg_softlimit_reclaim_begin, TP_PROTO(int order, gfp_t gfp_flags), TP_ARGS(order, gfp_flags) ); #endif /* CONFIG_MEMCG */ DECLARE_EVENT_CLASS(mm_vmscan_direct_reclaim_end_template, TP_PROTO(unsigned long nr_reclaimed), TP_ARGS(nr_reclaimed), TP_STRUCT__entry( __field( unsigned long, nr_reclaimed ) ), TP_fast_assign( __entry->nr_reclaimed = nr_reclaimed; ), TP_printk("nr_reclaimed=%lu", __entry->nr_reclaimed) ); DEFINE_EVENT(mm_vmscan_direct_reclaim_end_template, mm_vmscan_direct_reclaim_end, TP_PROTO(unsigned long nr_reclaimed), TP_ARGS(nr_reclaimed) ); #ifdef CONFIG_MEMCG DEFINE_EVENT(mm_vmscan_direct_reclaim_end_template, mm_vmscan_memcg_reclaim_end, TP_PROTO(unsigned long nr_reclaimed), TP_ARGS(nr_reclaimed) ); DEFINE_EVENT(mm_vmscan_direct_reclaim_end_template, mm_vmscan_memcg_softlimit_reclaim_end, TP_PROTO(unsigned long nr_reclaimed), TP_ARGS(nr_reclaimed) ); #endif /* CONFIG_MEMCG */ TRACE_EVENT(mm_shrink_slab_start, TP_PROTO(struct shrinker *shr, struct shrink_control *sc, long nr_objects_to_shrink, unsigned long cache_items, unsigned long long delta, unsigned long total_scan, int priority), TP_ARGS(shr, sc, nr_objects_to_shrink, cache_items, delta, total_scan, priority), TP_STRUCT__entry( __field(struct shrinker *, shr) __field(void *, shrink) __field(int, nid) __field(long, nr_objects_to_shrink) __field(unsigned long, 
gfp_flags) __field(unsigned long, cache_items) __field(unsigned long long, delta) __field(unsigned long, total_scan) __field(int, priority) ), TP_fast_assign( __entry->shr = shr; __entry->shrink = shr->scan_objects; __entry->nid = sc->nid; __entry->nr_objects_to_shrink = nr_objects_to_shrink; __entry->gfp_flags = (__force unsigned long)sc->gfp_mask; __entry->cache_items = cache_items; __entry->delta = delta; __entry->total_scan = total_scan; __entry->priority = priority; ), TP_printk("%pS %p: nid: %d objects to shrink %ld gfp_flags %s cache items %ld delta %lld total_scan %ld priority %d", __entry->shrink, __entry->shr, __entry->nid, __entry->nr_objects_to_shrink, show_gfp_flags(__entry->gfp_flags), __entry->cache_items, __entry->delta, __entry->total_scan, __entry->priority) ); TRACE_EVENT(mm_shrink_slab_end, TP_PROTO(struct shrinker *shr, int nid, int shrinker_retval, long unused_scan_cnt, long new_scan_cnt, long total_scan), TP_ARGS(shr, nid, shrinker_retval, unused_scan_cnt, new_scan_cnt, total_scan), TP_STRUCT__entry( __field(struct shrinker *, shr) __field(int, nid) __field(void *, shrink) __field(long, unused_scan) __field(long, new_scan) __field(int, retval) __field(long, total_scan) ), TP_fast_assign( __entry->shr = shr; __entry->nid = nid; __entry->shrink = shr->scan_objects; __entry->unused_scan = unused_scan_cnt; __entry->new_scan = new_scan_cnt; __entry->retval = shrinker_retval; __entry->total_scan = total_scan; ), TP_printk("%pS %p: nid: %d unused scan count %ld new scan count %ld total_scan %ld last shrinker return val %d", __entry->shrink, __entry->shr, __entry->nid, __entry->unused_scan, __entry->new_scan, __entry->total_scan, __entry->retval) ); TRACE_EVENT(mm_vmscan_lru_isolate, TP_PROTO(int highest_zoneidx, int order, unsigned long nr_requested, unsigned long nr_scanned, unsigned long nr_skipped, unsigned long nr_taken, int lru), TP_ARGS(highest_zoneidx, order, nr_requested, nr_scanned, nr_skipped, nr_taken, lru), TP_STRUCT__entry( __field(int, highest_zoneidx) __field(int, order) __field(unsigned long, nr_requested) __field(unsigned long, nr_scanned) __field(unsigned long, nr_skipped) __field(unsigned long, nr_taken) __field(int, lru) ), TP_fast_assign( __entry->highest_zoneidx = highest_zoneidx; __entry->order = order; __entry->nr_requested = nr_requested; __entry->nr_scanned = nr_scanned; __entry->nr_skipped = nr_skipped; __entry->nr_taken = nr_taken; __entry->lru = lru; ), /* * classzone is previous name of the highest_zoneidx. * Reason not to change it is the ABI requirement of the tracepoint. 
*/ TP_printk("classzone=%d order=%d nr_requested=%lu nr_scanned=%lu nr_skipped=%lu nr_taken=%lu lru=%s", __entry->highest_zoneidx, __entry->order, __entry->nr_requested, __entry->nr_scanned, __entry->nr_skipped, __entry->nr_taken, __print_symbolic(__entry->lru, LRU_NAMES)) ); TRACE_EVENT(mm_vmscan_write_folio, TP_PROTO(struct folio *folio), TP_ARGS(folio), TP_STRUCT__entry( __field(unsigned long, pfn) __field(int, reclaim_flags) ), TP_fast_assign( __entry->pfn = folio_pfn(folio); __entry->reclaim_flags = trace_reclaim_flags( folio_is_file_lru(folio)); ), TP_printk("page=%p pfn=0x%lx flags=%s", pfn_to_page(__entry->pfn), __entry->pfn, show_reclaim_flags(__entry->reclaim_flags)) ); TRACE_EVENT(mm_vmscan_reclaim_pages, TP_PROTO(int nid, unsigned long nr_scanned, unsigned long nr_reclaimed, struct reclaim_stat *stat), TP_ARGS(nid, nr_scanned, nr_reclaimed, stat), TP_STRUCT__entry( __field(int, nid) __field(unsigned long, nr_scanned) __field(unsigned long, nr_reclaimed) __field(unsigned long, nr_dirty) __field(unsigned long, nr_writeback) __field(unsigned long, nr_congested) __field(unsigned long, nr_immediate) __field(unsigned int, nr_activate0) __field(unsigned int, nr_activate1) __field(unsigned long, nr_ref_keep) __field(unsigned long, nr_unmap_fail) ), TP_fast_assign( __entry->nid = nid; __entry->nr_scanned = nr_scanned; __entry->nr_reclaimed = nr_reclaimed; __entry->nr_dirty = stat->nr_dirty; __entry->nr_writeback = stat->nr_writeback; __entry->nr_congested = stat->nr_congested; __entry->nr_immediate = stat->nr_immediate; __entry->nr_activate0 = stat->nr_activate[0]; __entry->nr_activate1 = stat->nr_activate[1]; __entry->nr_ref_keep = stat->nr_ref_keep; __entry->nr_unmap_fail = stat->nr_unmap_fail; ), TP_printk("nid=%d nr_scanned=%ld nr_reclaimed=%ld nr_dirty=%ld nr_writeback=%ld nr_congested=%ld nr_immediate=%ld nr_activate_anon=%d nr_activate_file=%d nr_ref_keep=%ld nr_unmap_fail=%ld", __entry->nid, __entry->nr_scanned, __entry->nr_reclaimed, __entry->nr_dirty, __entry->nr_writeback, __entry->nr_congested, __entry->nr_immediate, __entry->nr_activate0, __entry->nr_activate1, __entry->nr_ref_keep, __entry->nr_unmap_fail) ); TRACE_EVENT(mm_vmscan_lru_shrink_inactive, TP_PROTO(int nid, unsigned long nr_scanned, unsigned long nr_reclaimed, struct reclaim_stat *stat, int priority, int file), TP_ARGS(nid, nr_scanned, nr_reclaimed, stat, priority, file), TP_STRUCT__entry( __field(int, nid) __field(unsigned long, nr_scanned) __field(unsigned long, nr_reclaimed) __field(unsigned long, nr_dirty) __field(unsigned long, nr_writeback) __field(unsigned long, nr_congested) __field(unsigned long, nr_immediate) __field(unsigned int, nr_activate0) __field(unsigned int, nr_activate1) __field(unsigned long, nr_ref_keep) __field(unsigned long, nr_unmap_fail) __field(int, priority) __field(int, reclaim_flags) ), TP_fast_assign( __entry->nid = nid; __entry->nr_scanned = nr_scanned; __entry->nr_reclaimed = nr_reclaimed; __entry->nr_dirty = stat->nr_dirty; __entry->nr_writeback = stat->nr_writeback; __entry->nr_congested = stat->nr_congested; __entry->nr_immediate = stat->nr_immediate; __entry->nr_activate0 = stat->nr_activate[0]; __entry->nr_activate1 = stat->nr_activate[1]; __entry->nr_ref_keep = stat->nr_ref_keep; __entry->nr_unmap_fail = stat->nr_unmap_fail; __entry->priority = priority; __entry->reclaim_flags = trace_reclaim_flags(file); ), TP_printk("nid=%d nr_scanned=%ld nr_reclaimed=%ld nr_dirty=%ld nr_writeback=%ld nr_congested=%ld nr_immediate=%ld nr_activate_anon=%d nr_activate_file=%d nr_ref_keep=%ld 
nr_unmap_fail=%ld priority=%d flags=%s", __entry->nid, __entry->nr_scanned, __entry->nr_reclaimed, __entry->nr_dirty, __entry->nr_writeback, __entry->nr_congested, __entry->nr_immediate, __entry->nr_activate0, __entry->nr_activate1, __entry->nr_ref_keep, __entry->nr_unmap_fail, __entry->priority, show_reclaim_flags(__entry->reclaim_flags)) ); TRACE_EVENT(mm_vmscan_lru_shrink_active, TP_PROTO(int nid, unsigned long nr_taken, unsigned long nr_active, unsigned long nr_deactivated, unsigned long nr_referenced, int priority, int file), TP_ARGS(nid, nr_taken, nr_active, nr_deactivated, nr_referenced, priority, file), TP_STRUCT__entry( __field(int, nid) __field(unsigned long, nr_taken) __field(unsigned long, nr_active) __field(unsigned long, nr_deactivated) __field(unsigned long, nr_referenced) __field(int, priority) __field(int, reclaim_flags) ), TP_fast_assign( __entry->nid = nid; __entry->nr_taken = nr_taken; __entry->nr_active = nr_active; __entry->nr_deactivated = nr_deactivated; __entry->nr_referenced = nr_referenced; __entry->priority = priority; __entry->reclaim_flags = trace_reclaim_flags(file); ), TP_printk("nid=%d nr_taken=%ld nr_active=%ld nr_deactivated=%ld nr_referenced=%ld priority=%d flags=%s", __entry->nid, __entry->nr_taken, __entry->nr_active, __entry->nr_deactivated, __entry->nr_referenced, __entry->priority, show_reclaim_flags(__entry->reclaim_flags)) ); TRACE_EVENT(mm_vmscan_node_reclaim_begin, TP_PROTO(int nid, int order, gfp_t gfp_flags), TP_ARGS(nid, order, gfp_flags), TP_STRUCT__entry( __field(int, nid) __field(int, order) __field(unsigned long, gfp_flags) ), TP_fast_assign( __entry->nid = nid; __entry->order = order; __entry->gfp_flags = (__force unsigned long)gfp_flags; ), TP_printk("nid=%d order=%d gfp_flags=%s", __entry->nid, __entry->order, show_gfp_flags(__entry->gfp_flags)) ); DEFINE_EVENT(mm_vmscan_direct_reclaim_end_template, mm_vmscan_node_reclaim_end, TP_PROTO(unsigned long nr_reclaimed), TP_ARGS(nr_reclaimed) ); TRACE_EVENT(mm_vmscan_throttled, TP_PROTO(int nid, int usec_timeout, int usec_delayed, int reason), TP_ARGS(nid, usec_timeout, usec_delayed, reason), TP_STRUCT__entry( __field(int, nid) __field(int, usec_timeout) __field(int, usec_delayed) __field(int, reason) ), TP_fast_assign( __entry->nid = nid; __entry->usec_timeout = usec_timeout; __entry->usec_delayed = usec_delayed; __entry->reason = 1U << reason; ), TP_printk("nid=%d usec_timeout=%d usect_delayed=%d reason=%s", __entry->nid, __entry->usec_timeout, __entry->usec_delayed, show_throttle_flags(__entry->reason)) ); #endif /* _TRACE_VMSCAN_H */ /* This part must be outside protection */ #include <trace/define_trace.h>
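trace_reclaim_flags() only ever combines one of RECLAIM_WB_ANON or RECLAIM_WB_FILE with RECLAIM_WB_ASYNC, and show_reclaim_flags() renders that mask as the pipe-separated string seen in the trace output. A plain-C sketch of the same encode/decode round trip (userspace, not the tracepoint macros themselves):

#include <stdio.h>

#define RECLAIM_WB_ANON		0x0001u
#define RECLAIM_WB_FILE		0x0002u
#define RECLAIM_WB_ASYNC	0x0008u

/* Mirrors trace_reclaim_flags(file) from the header above. */
static unsigned int reclaim_flags(int file)
{
        return (file ? RECLAIM_WB_FILE : RECLAIM_WB_ANON) | RECLAIM_WB_ASYNC;
}

/* Decodes the two flag combinations reclaim_flags() can produce, the way
 * show_reclaim_flags() renders them in the trace output.
 */
static const char *decode(unsigned int flags)
{
        if (!flags)
                return "RECLAIM_WB_NONE";
        return (flags & RECLAIM_WB_FILE) ? "RECLAIM_WB_FILE|RECLAIM_WB_ASYNC"
                                         : "RECLAIM_WB_ANON|RECLAIM_WB_ASYNC";
}

int main(void)
{
        printf("file lru: %s\n", decode(reclaim_flags(1)));
        printf("anon lru: %s\n", decode(reclaim_flags(0)));
        return 0;
}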
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Yama Linux Security Module
 *
 * Author: Kees Cook <keescook@chromium.org>
 *
 * Copyright (C) 2010 Canonical, Ltd.
 * Copyright (C) 2011 The Chromium OS Authors.
*/ #include <linux/lsm_hooks.h> #include <linux/sysctl.h> #include <linux/ptrace.h> #include <linux/prctl.h> #include <linux/ratelimit.h> #include <linux/workqueue.h> #include <linux/string_helpers.h> #include <linux/task_work.h> #include <linux/sched.h> #include <linux/spinlock.h> #include <uapi/linux/lsm.h> #define YAMA_SCOPE_DISABLED 0 #define YAMA_SCOPE_RELATIONAL 1 #define YAMA_SCOPE_CAPABILITY 2 #define YAMA_SCOPE_NO_ATTACH 3 static int ptrace_scope = YAMA_SCOPE_RELATIONAL; /* describe a ptrace relationship for potential exception */ struct ptrace_relation { struct task_struct *tracer; struct task_struct *tracee; bool invalid; struct list_head node; struct rcu_head rcu; }; static LIST_HEAD(ptracer_relations); static DEFINE_SPINLOCK(ptracer_relations_lock); static void yama_relation_cleanup(struct work_struct *work); static DECLARE_WORK(yama_relation_work, yama_relation_cleanup); struct access_report_info { struct callback_head work; const char *access; struct task_struct *target; struct task_struct *agent; }; static void __report_access(struct callback_head *work) { struct access_report_info *info = container_of(work, struct access_report_info, work); char *target_cmd, *agent_cmd; target_cmd = kstrdup_quotable_cmdline(info->target, GFP_KERNEL); agent_cmd = kstrdup_quotable_cmdline(info->agent, GFP_KERNEL); pr_notice_ratelimited( "ptrace %s of \"%s\"[%d] was attempted by \"%s\"[%d]\n", info->access, target_cmd, info->target->pid, agent_cmd, info->agent->pid); kfree(agent_cmd); kfree(target_cmd); put_task_struct(info->agent); put_task_struct(info->target); kfree(info); } /* defers execution because cmdline access can sleep */ static void report_access(const char *access, struct task_struct *target, struct task_struct *agent) { struct access_report_info *info; assert_spin_locked(&target->alloc_lock); /* for target->comm */ if (current->flags & PF_KTHREAD) { /* I don't think kthreads call task_work_run() before exiting. * Imagine angry ranting about procfs here. */ pr_notice_ratelimited( "ptrace %s of \"%s\"[%d] was attempted by \"%s\"[%d]\n", access, target->comm, target->pid, agent->comm, agent->pid); return; } info = kmalloc(sizeof(*info), GFP_ATOMIC); if (!info) return; init_task_work(&info->work, __report_access); get_task_struct(target); get_task_struct(agent); info->access = access; info->target = target; info->agent = agent; if (task_work_add(current, &info->work, TWA_RESUME) == 0) return; /* success */ WARN(1, "report_access called from exiting task"); put_task_struct(target); put_task_struct(agent); kfree(info); } /** * yama_relation_cleanup - remove invalid entries from the relation list * @work: unused * */ static void yama_relation_cleanup(struct work_struct *work) { struct ptrace_relation *relation; spin_lock(&ptracer_relations_lock); rcu_read_lock(); list_for_each_entry_rcu(relation, &ptracer_relations, node) { if (relation->invalid) { list_del_rcu(&relation->node); kfree_rcu(relation, rcu); } } rcu_read_unlock(); spin_unlock(&ptracer_relations_lock); } /** * yama_ptracer_add - add/replace an exception for this tracer/tracee pair * @tracer: the task_struct of the process doing the ptrace * @tracee: the task_struct of the process to be ptraced * * Each tracee can have, at most, one tracer registered. Each time this * is called, the prior registered tracer will be replaced for the tracee. * * Returns 0 if relationship was added, -ve on error. 
*/ static int yama_ptracer_add(struct task_struct *tracer, struct task_struct *tracee) { struct ptrace_relation *relation, *added; added = kmalloc(sizeof(*added), GFP_KERNEL); if (!added) return -ENOMEM; added->tracee = tracee; added->tracer = tracer; added->invalid = false; spin_lock(&ptracer_relations_lock); rcu_read_lock(); list_for_each_entry_rcu(relation, &ptracer_relations, node) { if (relation->invalid) continue; if (relation->tracee == tracee) { list_replace_rcu(&relation->node, &added->node); kfree_rcu(relation, rcu); goto out; } } list_add_rcu(&added->node, &ptracer_relations); out: rcu_read_unlock(); spin_unlock(&ptracer_relations_lock); return 0; } /** * yama_ptracer_del - remove exceptions related to the given tasks * @tracer: remove any relation where tracer task matches * @tracee: remove any relation where tracee task matches */ static void yama_ptracer_del(struct task_struct *tracer, struct task_struct *tracee) { struct ptrace_relation *relation; bool marked = false; rcu_read_lock(); list_for_each_entry_rcu(relation, &ptracer_relations, node) { if (relation->invalid) continue; if (relation->tracee == tracee || (tracer && relation->tracer == tracer)) { relation->invalid = true; marked = true; } } rcu_read_unlock(); if (marked) schedule_work(&yama_relation_work); } /** * yama_task_free - check for task_pid to remove from exception list * @task: task being removed */ static void yama_task_free(struct task_struct *task) { yama_ptracer_del(task, task); } /** * yama_task_prctl - check for Yama-specific prctl operations * @option: operation * @arg2: argument * @arg3: argument * @arg4: argument * @arg5: argument * * Return 0 on success, -ve on error. -ENOSYS is returned when Yama * does not handle the given option. */ static int yama_task_prctl(int option, unsigned long arg2, unsigned long arg3, unsigned long arg4, unsigned long arg5) { int rc = -ENOSYS; struct task_struct *myself; switch (option) { case PR_SET_PTRACER: /* Since a thread can call prctl(), find the group leader * before calling _add() or _del() on it, since we want * process-level granularity of control. The tracer group * leader checking is handled later when walking the ancestry * at the time of PTRACE_ATTACH check. */ myself = current->group_leader; if (arg2 == 0) { yama_ptracer_del(NULL, myself); rc = 0; } else if (arg2 == PR_SET_PTRACER_ANY || (int)arg2 == -1) { rc = yama_ptracer_add(NULL, myself); } else { struct task_struct *tracer; tracer = find_get_task_by_vpid(arg2); if (!tracer) { rc = -EINVAL; } else { rc = yama_ptracer_add(tracer, myself); put_task_struct(tracer); } } break; } return rc; } /** * task_is_descendant - walk up a process family tree looking for a match * @parent: the process to compare against while walking up from child * @child: the process to start from while looking upwards for parent * * Returns 1 if child is a descendant of parent, 0 if not. 
*/ static int task_is_descendant(struct task_struct *parent, struct task_struct *child) { int rc = 0; struct task_struct *walker = child; if (!parent || !child) return 0; rcu_read_lock(); if (!thread_group_leader(parent)) parent = rcu_dereference(parent->group_leader); while (walker->pid > 0) { if (!thread_group_leader(walker)) walker = rcu_dereference(walker->group_leader); if (walker == parent) { rc = 1; break; } walker = rcu_dereference(walker->real_parent); } rcu_read_unlock(); return rc; } /** * ptracer_exception_found - tracer registered as exception for this tracee * @tracer: the task_struct of the process attempting ptrace * @tracee: the task_struct of the process to be ptraced * * Returns 1 if tracer has a ptracer exception ancestor for tracee. */ static int ptracer_exception_found(struct task_struct *tracer, struct task_struct *tracee) { int rc = 0; struct ptrace_relation *relation; struct task_struct *parent = NULL; bool found = false; rcu_read_lock(); /* * If there's already an active tracing relationship, then make an * exception for the sake of other accesses, like process_vm_rw(). */ parent = ptrace_parent(tracee); if (parent != NULL && same_thread_group(parent, tracer)) { rc = 1; goto unlock; } /* Look for a PR_SET_PTRACER relationship. */ if (!thread_group_leader(tracee)) tracee = rcu_dereference(tracee->group_leader); list_for_each_entry_rcu(relation, &ptracer_relations, node) { if (relation->invalid) continue; if (relation->tracee == tracee) { parent = relation->tracer; found = true; break; } } if (found && (parent == NULL || task_is_descendant(parent, tracer))) rc = 1; unlock: rcu_read_unlock(); return rc; } /** * yama_ptrace_access_check - validate PTRACE_ATTACH calls * @child: task that current task is attempting to ptrace * @mode: ptrace attach mode * * Returns 0 if following the ptrace is allowed, -ve on error. */ static int yama_ptrace_access_check(struct task_struct *child, unsigned int mode) { int rc = 0; /* require ptrace target be a child of ptracer on attach */ if (mode & PTRACE_MODE_ATTACH) { switch (ptrace_scope) { case YAMA_SCOPE_DISABLED: /* No additional restrictions. */ break; case YAMA_SCOPE_RELATIONAL: rcu_read_lock(); if (!pid_alive(child)) rc = -EPERM; if (!rc && !task_is_descendant(current, child) && !ptracer_exception_found(current, child) && !ns_capable(__task_cred(child)->user_ns, CAP_SYS_PTRACE)) rc = -EPERM; rcu_read_unlock(); break; case YAMA_SCOPE_CAPABILITY: rcu_read_lock(); if (!ns_capable(__task_cred(child)->user_ns, CAP_SYS_PTRACE)) rc = -EPERM; rcu_read_unlock(); break; case YAMA_SCOPE_NO_ATTACH: default: rc = -EPERM; break; } } if (rc && (mode & PTRACE_MODE_NOAUDIT) == 0) report_access("attach", child, current); return rc; } /** * yama_ptrace_traceme - validate PTRACE_TRACEME calls * @parent: task that will become the ptracer of the current task * * Returns 0 if following the ptrace is allowed, -ve on error. */ static int yama_ptrace_traceme(struct task_struct *parent) { int rc = 0; /* Only disallow PTRACE_TRACEME on more aggressive settings. 
*/ switch (ptrace_scope) { case YAMA_SCOPE_CAPABILITY: if (!has_ns_capability(parent, current_user_ns(), CAP_SYS_PTRACE)) rc = -EPERM; break; case YAMA_SCOPE_NO_ATTACH: rc = -EPERM; break; } if (rc) { task_lock(current); report_access("traceme", current, parent); task_unlock(current); } return rc; } static const struct lsm_id yama_lsmid = { .name = "yama", .id = LSM_ID_YAMA, }; static struct security_hook_list yama_hooks[] __ro_after_init = { LSM_HOOK_INIT(ptrace_access_check, yama_ptrace_access_check), LSM_HOOK_INIT(ptrace_traceme, yama_ptrace_traceme), LSM_HOOK_INIT(task_prctl, yama_task_prctl), LSM_HOOK_INIT(task_free, yama_task_free), }; #ifdef CONFIG_SYSCTL static int yama_dointvec_minmax(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { struct ctl_table table_copy; if (write && !capable(CAP_SYS_PTRACE)) return -EPERM; /* Lock the max value if it ever gets set. */ table_copy = *table; if (*(int *)table_copy.data == *(int *)table_copy.extra2) table_copy.extra1 = table_copy.extra2; return proc_dointvec_minmax(&table_copy, write, buffer, lenp, ppos); } static int max_scope = YAMA_SCOPE_NO_ATTACH; static const struct ctl_table yama_sysctl_table[] = { { .procname = "ptrace_scope", .data = &ptrace_scope, .maxlen = sizeof(int), .mode = 0644, .proc_handler = yama_dointvec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = &max_scope, }, }; static void __init yama_init_sysctl(void) { if (!register_sysctl("kernel/yama", yama_sysctl_table)) panic("Yama: sysctl registration failed.\n"); } #else static inline void yama_init_sysctl(void) { } #endif /* CONFIG_SYSCTL */ static int __init yama_init(void) { pr_info("Yama: becoming mindful.\n"); security_add_hooks(yama_hooks, ARRAY_SIZE(yama_hooks), &yama_lsmid); yama_init_sysctl(); return 0; } DEFINE_LSM(yama) = { .id = &yama_lsmid, .init = yama_init, };
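The PR_SET_PTRACER handling above is driven from userspace by the process that wants to be traced. A minimal sketch of that side, under the assumption of a target that knows its debugger/crash-handler PID in advance (the helper functions and helper_pid name are illustrative; only the prctl() constants are real):

/* Illustrative userspace counterpart to yama_task_prctl(). */
#include <stdio.h>
#include <sys/prctl.h>
#include <sys/types.h>

#ifndef PR_SET_PTRACER
#define PR_SET_PTRACER 0x59616d61		/* "Yama" */
#endif
#ifndef PR_SET_PTRACER_ANY
#define PR_SET_PTRACER_ANY ((unsigned long)-1)
#endif

/* Allow one specific process (e.g. a crash handler) to PTRACE_ATTACH to us
 * under ptrace_scope=1; Yama records the exception for the whole thread
 * group via current->group_leader. */
static int allow_tracer(pid_t helper_pid)
{
	if (prctl(PR_SET_PTRACER, (unsigned long)helper_pid, 0, 0, 0) < 0) {
		perror("PR_SET_PTRACER");
		return -1;
	}
	return 0;
}

/* Equivalent to the arg2 == -1 case in the hook: any process may attach. */
static int allow_any_tracer(void)
{
	return prctl(PR_SET_PTRACER, PR_SET_PTRACER_ANY, 0, 0, 0);
}

Passing 0 as arg2 clears the exception again, which maps to the yama_ptracer_del() branch above.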
/* SPDX-License-Identifier: GPL-2.0 */ /* * Block data types and constants. Directly include this file only to * break include dependency loop. */ #ifndef __LINUX_BLK_TYPES_H #define __LINUX_BLK_TYPES_H #include <linux/types.h> #include <linux/bvec.h> #include <linux/device.h> #include <linux/ktime.h> #include <linux/rw_hint.h> struct bio_set; struct bio; struct bio_integrity_payload; struct page; struct io_context; struct cgroup_subsys_state; typedef void (bio_end_io_t) (struct bio *); struct bio_crypt_ctx; /* * The basic unit of block I/O is a sector. It is used in a number of contexts * in Linux (blk, bio, genhd). The size of one sector is 512 = 2**9 * bytes. Variables of type sector_t represent an offset or size that is a * multiple of 512 bytes. Hence these two constants.
*/ #ifndef SECTOR_SHIFT #define SECTOR_SHIFT 9 #endif #ifndef SECTOR_SIZE #define SECTOR_SIZE (1 << SECTOR_SHIFT) #endif #define PAGE_SECTORS_SHIFT (PAGE_SHIFT - SECTOR_SHIFT) #define PAGE_SECTORS (1 << PAGE_SECTORS_SHIFT) #define SECTOR_MASK (PAGE_SECTORS - 1) struct block_device { sector_t bd_start_sect; sector_t bd_nr_sectors; struct gendisk * bd_disk; struct request_queue * bd_queue; struct disk_stats __percpu *bd_stats; unsigned long bd_stamp; atomic_t __bd_flags; // partition number + flags #define BD_PARTNO 255 // lower 8 bits; assign-once #define BD_READ_ONLY (1u<<8) // read-only policy #define BD_WRITE_HOLDER (1u<<9) #define BD_HAS_SUBMIT_BIO (1u<<10) #define BD_RO_WARNED (1u<<11) #ifdef CONFIG_FAIL_MAKE_REQUEST #define BD_MAKE_IT_FAIL (1u<<12) #endif dev_t bd_dev; struct address_space *bd_mapping; /* page cache */ atomic_t bd_openers; spinlock_t bd_size_lock; /* for bd_inode->i_size updates */ void * bd_claiming; void * bd_holder; const struct blk_holder_ops *bd_holder_ops; struct mutex bd_holder_lock; int bd_holders; struct kobject *bd_holder_dir; atomic_t bd_fsfreeze_count; /* number of freeze requests */ struct mutex bd_fsfreeze_mutex; /* serialize freeze/thaw */ struct partition_meta_info *bd_meta_info; int bd_writers; #ifdef CONFIG_SECURITY void *bd_security; #endif /* * keep this out-of-line as it's both big and not needed in the fast * path */ struct device bd_device; } __randomize_layout; #define bdev_whole(_bdev) \ ((_bdev)->bd_disk->part0) #define dev_to_bdev(device) \ container_of((device), struct block_device, bd_device) #define bdev_kobj(_bdev) \ (&((_bdev)->bd_device.kobj)) /* * Block error status values. See block/blk-core:blk_errors for the details. */ typedef u8 __bitwise blk_status_t; typedef u16 blk_short_t; #define BLK_STS_OK 0 #define BLK_STS_NOTSUPP ((__force blk_status_t)1) #define BLK_STS_TIMEOUT ((__force blk_status_t)2) #define BLK_STS_NOSPC ((__force blk_status_t)3) #define BLK_STS_TRANSPORT ((__force blk_status_t)4) #define BLK_STS_TARGET ((__force blk_status_t)5) #define BLK_STS_RESV_CONFLICT ((__force blk_status_t)6) #define BLK_STS_MEDIUM ((__force blk_status_t)7) #define BLK_STS_PROTECTION ((__force blk_status_t)8) #define BLK_STS_RESOURCE ((__force blk_status_t)9) #define BLK_STS_IOERR ((__force blk_status_t)10) /* hack for device mapper, don't use elsewhere: */ #define BLK_STS_DM_REQUEUE ((__force blk_status_t)11) /* * BLK_STS_AGAIN should only be returned if RQF_NOWAIT is set * and the bio would block (cf bio_wouldblock_error()) */ #define BLK_STS_AGAIN ((__force blk_status_t)12) /* * BLK_STS_DEV_RESOURCE is returned from the driver to the block layer if * device related resources are unavailable, but the driver can guarantee * that the queue will be rerun in the future once resources become * available again. This is typically the case for device specific * resources that are consumed for IO. If the driver fails allocating these * resources, we know that inflight (or pending) IO will free these * resource upon completion. * * This is different from BLK_STS_RESOURCE in that it explicitly references * a device specific resource. For resources of wider scope, allocation * failure can happen without having pending IO. This means that we can't * rely on request completions freeing these resources, as IO may not be in * flight. Examples of that are kernel memory allocations, DMA mappings, or * any other system wide resources. 
*/ #define BLK_STS_DEV_RESOURCE ((__force blk_status_t)13) /* * BLK_STS_ZONE_OPEN_RESOURCE is returned from the driver in the completion * path if the device returns a status indicating that too many zone resources * are currently open. The same command should be successful if resubmitted * after the number of open zones decreases below the device's limits, which is * reported in the request_queue's max_open_zones. */ #define BLK_STS_ZONE_OPEN_RESOURCE ((__force blk_status_t)14) /* * BLK_STS_ZONE_ACTIVE_RESOURCE is returned from the driver in the completion * path if the device returns a status indicating that too many zone resources * are currently active. The same command should be successful if resubmitted * after the number of active zones decreases below the device's limits, which * is reported in the request_queue's max_active_zones. */ #define BLK_STS_ZONE_ACTIVE_RESOURCE ((__force blk_status_t)15) /* * BLK_STS_OFFLINE is returned from the driver when the target device is offline * or is being taken offline. This could help differentiate the case where a * device is intentionally being shut down from a real I/O error. */ #define BLK_STS_OFFLINE ((__force blk_status_t)16) /* * BLK_STS_DURATION_LIMIT is returned from the driver when the target device * aborted the command because it exceeded one of its Command Duration Limits. */ #define BLK_STS_DURATION_LIMIT ((__force blk_status_t)17) /* * Invalid size or alignment. */ #define BLK_STS_INVAL ((__force blk_status_t)19) /** * blk_path_error - returns true if error may be path related * @error: status the request was completed with * * Description: * This classifies block error status into non-retryable errors and ones * that may be successful if retried on a failover path. * * Return: * %false - retrying failover path will not help * %true - may succeed if retried */ static inline bool blk_path_error(blk_status_t error) { switch (error) { case BLK_STS_NOTSUPP: case BLK_STS_NOSPC: case BLK_STS_TARGET: case BLK_STS_RESV_CONFLICT: case BLK_STS_MEDIUM: case BLK_STS_PROTECTION: return false; } /* Anything else could be a path failure, so should be retried */ return true; } typedef __u32 __bitwise blk_opf_t; typedef unsigned int blk_qc_t; #define BLK_QC_T_NONE -1U /* * main unit of I/O for the block layer and lower layers (ie drivers and * stacking drivers) */ struct bio { struct bio *bi_next; /* request queue link */ struct block_device *bi_bdev; blk_opf_t bi_opf; /* bottom bits REQ_OP, top bits * req_flags. */ unsigned short bi_flags; /* BIO_* below */ unsigned short bi_ioprio; enum rw_hint bi_write_hint; u8 bi_write_stream; blk_status_t bi_status; /* * The bvec gap bit indicates the lowest set bit in any address offset * between all bi_io_vecs. This field is initialized only after the bio * is split to the hardware limits (see bio_split_io_at()). The value * may be used to consider DMA optimization when performing that * mapping. The value is compared to a power of two mask where the * result depends on any bit set within the mask, so saving the lowest * bit is sufficient to know if any segment gap collides with the mask. */ u8 bi_bvec_gap_bit; atomic_t __bi_remaining; struct bvec_iter bi_iter; union { /* for polled bios: */ blk_qc_t bi_cookie; /* for plugged zoned writes only: */ unsigned int __bi_nr_segments; }; bio_end_io_t *bi_end_io; void *bi_private; #ifdef CONFIG_BLK_CGROUP /* * Represents the association of the css and request_queue for the bio. 
* If a bio goes direct to device, it will not have a blkg as it will * not have a request_queue associated with it. The reference is put * on release of the bio. */ struct blkcg_gq *bi_blkg; /* Time that this bio was issued. */ u64 issue_time_ns; #ifdef CONFIG_BLK_CGROUP_IOCOST u64 bi_iocost_cost; #endif #endif #ifdef CONFIG_BLK_INLINE_ENCRYPTION struct bio_crypt_ctx *bi_crypt_context; #endif #if defined(CONFIG_BLK_DEV_INTEGRITY) struct bio_integrity_payload *bi_integrity; /* data integrity */ #endif unsigned short bi_vcnt; /* how many bio_vec's */ /* * Everything starting with bi_max_vecs will be preserved by bio_reset() */ unsigned short bi_max_vecs; /* max bvl_vecs we can hold */ atomic_t __bi_cnt; /* pin count */ struct bio_vec *bi_io_vec; /* the actual vec list */ struct bio_set *bi_pool; }; #define BIO_RESET_BYTES offsetof(struct bio, bi_max_vecs) #define BIO_MAX_SECTORS (UINT_MAX >> SECTOR_SHIFT) static inline struct bio_vec *bio_inline_vecs(struct bio *bio) { return (struct bio_vec *)(bio + 1); } /* * bio flags */ enum { BIO_PAGE_PINNED, /* Unpin pages in bio_release_pages() */ BIO_CLONED, /* doesn't own data */ BIO_QUIET, /* Make BIO Quiet */ BIO_CHAIN, /* chained bio, ->bi_remaining in effect */ BIO_REFFED, /* bio has elevated ->bi_cnt */ BIO_BPS_THROTTLED, /* This bio has already been subjected to * throttling rules. Don't do it again. */ BIO_TRACE_COMPLETION, /* bio_endio() should trace the final completion * of this bio. */ BIO_CGROUP_ACCT, /* has been accounted to a cgroup */ BIO_QOS_THROTTLED, /* bio went through rq_qos throttle path */ /* * This bio has completed bps throttling at the single tg granularity, * which is different from BIO_BPS_THROTTLED. When the bio is enqueued * into the sq->queued of the upper tg, or is about to be dispatched, * this flag needs to be cleared. Since blk-throttle and rq_qos are not * on the same hierarchical level, reuse the value. */ BIO_TG_BPS_THROTTLED = BIO_QOS_THROTTLED, BIO_QOS_MERGED, /* but went through rq_qos merge path */ BIO_REMAPPED, BIO_ZONE_WRITE_PLUGGING, /* bio handled through zone write plugging */ BIO_EMULATES_ZONE_APPEND, /* bio emulates a zone append operation */ BIO_FLAG_LAST }; typedef __u32 __bitwise blk_mq_req_flags_t; #define REQ_OP_BITS 8 #define REQ_OP_MASK (__force blk_opf_t)((1 << REQ_OP_BITS) - 1) #define REQ_FLAG_BITS 24 /** * enum req_op - Operations common to the bio and request structures. * We use 8 bits for encoding the operation, and the remaining 24 for flags. * * The least significant bit of the operation number indicates the data * transfer direction: * * - if the least significant bit is set transfers are TO the device * - if the least significant bit is not set transfers are FROM the device * * If a operation does not transfer data the least significant bit has no * meaning. 
*/ enum req_op { /* read sectors from the device */ REQ_OP_READ = (__force blk_opf_t)0, /* write sectors to the device */ REQ_OP_WRITE = (__force blk_opf_t)1, /* flush the volatile write cache */ REQ_OP_FLUSH = (__force blk_opf_t)2, /* discard sectors */ REQ_OP_DISCARD = (__force blk_opf_t)3, /* securely erase sectors */ REQ_OP_SECURE_ERASE = (__force blk_opf_t)5, /* write data at the current zone write pointer */ REQ_OP_ZONE_APPEND = (__force blk_opf_t)7, /* write the zero filled sector many times */ REQ_OP_WRITE_ZEROES = (__force blk_opf_t)9, /* Open a zone */ REQ_OP_ZONE_OPEN = (__force blk_opf_t)11, /* Close a zone */ REQ_OP_ZONE_CLOSE = (__force blk_opf_t)13, /* Transition a zone to full */ REQ_OP_ZONE_FINISH = (__force blk_opf_t)15, /* reset a zone write pointer */ REQ_OP_ZONE_RESET = (__force blk_opf_t)17, /* reset all the zone present on the device */ REQ_OP_ZONE_RESET_ALL = (__force blk_opf_t)19, /* Driver private requests */ REQ_OP_DRV_IN = (__force blk_opf_t)34, REQ_OP_DRV_OUT = (__force blk_opf_t)35, REQ_OP_LAST = (__force blk_opf_t)36, }; /* Keep cmd_flag_name[] in sync with the definitions below */ enum req_flag_bits { __REQ_FAILFAST_DEV = /* no driver retries of device errors */ REQ_OP_BITS, __REQ_FAILFAST_TRANSPORT, /* no driver retries of transport errors */ __REQ_FAILFAST_DRIVER, /* no driver retries of driver errors */ __REQ_SYNC, /* request is sync (sync write or read) */ __REQ_META, /* metadata io request */ __REQ_PRIO, /* boost priority in cfq */ __REQ_NOMERGE, /* don't touch this for merging */ __REQ_IDLE, /* anticipate more IO after this one */ __REQ_INTEGRITY, /* I/O includes block integrity payload */ __REQ_FUA, /* forced unit access */ __REQ_PREFLUSH, /* request for cache flush */ __REQ_RAHEAD, /* read ahead, can fail anytime */ __REQ_BACKGROUND, /* background IO */ __REQ_NOWAIT, /* Don't wait if request will block */ __REQ_POLLED, /* caller polls for completion using bio_poll */ __REQ_ALLOC_CACHE, /* allocate IO from cache if available */ __REQ_SWAP, /* swap I/O */ __REQ_DRV, /* for driver use */ __REQ_FS_PRIVATE, /* for file system (submitter) use */ __REQ_ATOMIC, /* for atomic write operations */ /* * Command specific flags, keep last: */ /* for REQ_OP_WRITE_ZEROES: */ __REQ_NOUNMAP, /* do not free blocks when zeroing */ __REQ_NR_BITS, /* stops here */ }; #define REQ_FAILFAST_DEV \ (__force blk_opf_t)(1ULL << __REQ_FAILFAST_DEV) #define REQ_FAILFAST_TRANSPORT \ (__force blk_opf_t)(1ULL << __REQ_FAILFAST_TRANSPORT) #define REQ_FAILFAST_DRIVER \ (__force blk_opf_t)(1ULL << __REQ_FAILFAST_DRIVER) #define REQ_SYNC (__force blk_opf_t)(1ULL << __REQ_SYNC) #define REQ_META (__force blk_opf_t)(1ULL << __REQ_META) #define REQ_PRIO (__force blk_opf_t)(1ULL << __REQ_PRIO) #define REQ_NOMERGE (__force blk_opf_t)(1ULL << __REQ_NOMERGE) #define REQ_IDLE (__force blk_opf_t)(1ULL << __REQ_IDLE) #define REQ_INTEGRITY (__force blk_opf_t)(1ULL << __REQ_INTEGRITY) #define REQ_FUA (__force blk_opf_t)(1ULL << __REQ_FUA) #define REQ_PREFLUSH (__force blk_opf_t)(1ULL << __REQ_PREFLUSH) #define REQ_RAHEAD (__force blk_opf_t)(1ULL << __REQ_RAHEAD) #define REQ_BACKGROUND (__force blk_opf_t)(1ULL << __REQ_BACKGROUND) #define REQ_NOWAIT (__force blk_opf_t)(1ULL << __REQ_NOWAIT) #define REQ_POLLED (__force blk_opf_t)(1ULL << __REQ_POLLED) #define REQ_ALLOC_CACHE (__force blk_opf_t)(1ULL << __REQ_ALLOC_CACHE) #define REQ_SWAP (__force blk_opf_t)(1ULL << __REQ_SWAP) #define REQ_DRV (__force blk_opf_t)(1ULL << __REQ_DRV) #define REQ_FS_PRIVATE (__force blk_opf_t)(1ULL << __REQ_FS_PRIVATE) 
#define REQ_ATOMIC (__force blk_opf_t)(1ULL << __REQ_ATOMIC) #define REQ_NOUNMAP (__force blk_opf_t)(1ULL << __REQ_NOUNMAP) #define REQ_FAILFAST_MASK \ (REQ_FAILFAST_DEV | REQ_FAILFAST_TRANSPORT | REQ_FAILFAST_DRIVER) #define REQ_NOMERGE_FLAGS \ (REQ_NOMERGE | REQ_PREFLUSH | REQ_FUA) enum stat_group { STAT_READ, STAT_WRITE, STAT_DISCARD, STAT_FLUSH, NR_STAT_GROUPS }; static inline enum req_op bio_op(const struct bio *bio) { return bio->bi_opf & REQ_OP_MASK; } static inline bool op_is_write(blk_opf_t op) { return !!(op & (__force blk_opf_t)1); } /* * Check if the bio or request is one that needs special treatment in the * flush state machine. */ static inline bool op_is_flush(blk_opf_t op) { return op & (REQ_FUA | REQ_PREFLUSH); } /* * Reads are always treated as synchronous, as are requests with the FUA or * PREFLUSH flag. Other operations may be marked as synchronous using the * REQ_SYNC flag. */ static inline bool op_is_sync(blk_opf_t op) { return (op & REQ_OP_MASK) == REQ_OP_READ || (op & (REQ_SYNC | REQ_FUA | REQ_PREFLUSH)); } static inline bool op_is_discard(blk_opf_t op) { return (op & REQ_OP_MASK) == REQ_OP_DISCARD; } /* * Check if a bio or request operation is a zone management operation. */ static inline bool op_is_zone_mgmt(enum req_op op) { switch (op & REQ_OP_MASK) { case REQ_OP_ZONE_RESET: case REQ_OP_ZONE_RESET_ALL: case REQ_OP_ZONE_OPEN: case REQ_OP_ZONE_CLOSE: case REQ_OP_ZONE_FINISH: return true; default: return false; } } static inline int op_stat_group(enum req_op op) { if (op_is_discard(op)) return STAT_DISCARD; return op_is_write(op); } struct blk_rq_stat { u64 mean; u64 min; u64 max; u32 nr_samples; u64 batch; }; #endif /* __LINUX_BLK_TYPES_H */
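The accessors above are the only sanctioned way for upper layers to pick apart a blk_opf_t or a completion status. A minimal sketch of how a multipath-style consumer might combine them, assuming only this header; the function name is hypothetical:

/* Hypothetical helper: decide whether a completed bio is worth retrying
 * on a different path, using only definitions from blk_types.h. */
static bool example_should_failover(struct bio *bio)
{
	/* Flush/FUA and zone management are control operations; blindly
	 * resubmitting them on another path is rarely useful. */
	if (op_is_flush(bio->bi_opf) || op_is_zone_mgmt(bio_op(bio)))
		return false;

	/* blk_path_error() filters out target-side errors (BLK_STS_NOSPC,
	 * BLK_STS_MEDIUM, ...) that a different path cannot fix. */
	return bio->bi_status != BLK_STS_OK && blk_path_error(bio->bi_status);
}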
// SPDX-License-Identifier: GPL-2.0-or-later /* * net/core/netprio_cgroup.c Priority Control Group * * Authors: Neil Horman <nhorman@tuxdriver.com> */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/module.h> #include <linux/slab.h> #include <linux/types.h> #include <linux/string.h> #include <linux/errno.h> #include <linux/skbuff.h> #include <linux/cgroup.h> #include <linux/rcupdate.h> #include <linux/atomic.h> #include <linux/sched/task.h> #include <net/rtnetlink.h> #include <net/pkt_cls.h> #include <net/sock.h> #include <net/netprio_cgroup.h> #include <linux/fdtable.h> /* * netprio allocates per-net_device priomap array which is indexed by * css->id. Limiting css ID to 16bits doesn't lose anything. */ #define NETPRIO_ID_MAX USHRT_MAX #define PRIOMAP_MIN_SZ 128 /* * Extend @dev->priomap so that it's large enough to accommodate * @target_idx. @dev->priomap.priomap_len > @target_idx after successful * return. Must be called under rtnl lock. */ static int extend_netdev_table(struct net_device *dev, u32 target_idx) { struct netprio_map *old, *new; size_t new_sz, new_len; /* is the existing priomap large enough? */ old = rtnl_dereference(dev->priomap); if (old && old->priomap_len > target_idx) return 0; /* * Determine the new size. Let's keep it power-of-two. We start * from PRIOMAP_MIN_SZ and double it until it's large enough to * accommodate @target_idx. */ new_sz = PRIOMAP_MIN_SZ; while (true) { new_len = (new_sz - offsetof(struct netprio_map, priomap)) / sizeof(new->priomap[0]); if (new_len > target_idx) break; new_sz *= 2; /* overflowed? */ if (WARN_ON(new_sz < PRIOMAP_MIN_SZ)) return -ENOSPC; } /* allocate & copy */ new = kzalloc(new_sz, GFP_KERNEL); if (!new) return -ENOMEM; if (old) memcpy(new->priomap, old->priomap, old->priomap_len * sizeof(old->priomap[0])); new->priomap_len = new_len; /* install the new priomap */ rcu_assign_pointer(dev->priomap, new); if (old) kfree_rcu(old, rcu); return 0; } /** * netprio_prio - return the effective netprio of a cgroup-net_device pair * @css: css part of the target pair * @dev: net_device part of the target pair * * Should be called under RCU read or rtnl lock.
*/ static u32 netprio_prio(struct cgroup_subsys_state *css, struct net_device *dev) { struct netprio_map *map = rcu_dereference_rtnl(dev->priomap); int id = css->id; if (map && id < map->priomap_len) return map->priomap[id]; return 0; } /** * netprio_set_prio - set netprio on a cgroup-net_device pair * @css: css part of the target pair * @dev: net_device part of the target pair * @prio: prio to set * * Set netprio to @prio on @css-@dev pair. Should be called under rtnl * lock and may fail under memory pressure for non-zero @prio. */ static int netprio_set_prio(struct cgroup_subsys_state *css, struct net_device *dev, u32 prio) { struct netprio_map *map; int id = css->id; int ret; /* avoid extending priomap for zero writes */ map = rtnl_dereference(dev->priomap); if (!prio && (!map || map->priomap_len <= id)) return 0; ret = extend_netdev_table(dev, id); if (ret) return ret; map = rtnl_dereference(dev->priomap); map->priomap[id] = prio; return 0; } static struct cgroup_subsys_state * cgrp_css_alloc(struct cgroup_subsys_state *parent_css) { struct cgroup_subsys_state *css; css = kzalloc(sizeof(*css), GFP_KERNEL); if (!css) return ERR_PTR(-ENOMEM); return css; } static int cgrp_css_online(struct cgroup_subsys_state *css) { struct cgroup_subsys_state *parent_css = css->parent; struct net_device *dev; int ret = 0; if (css->id > NETPRIO_ID_MAX) return -ENOSPC; if (!parent_css) return 0; rtnl_lock(); /* * Inherit prios from the parent. As all prios are set during * onlining, there is no need to clear them on offline. */ for_each_netdev(&init_net, dev) { u32 prio = netprio_prio(parent_css, dev); ret = netprio_set_prio(css, dev, prio); if (ret) break; } rtnl_unlock(); return ret; } static void cgrp_css_free(struct cgroup_subsys_state *css) { kfree(css); } static u64 read_prioidx(struct cgroup_subsys_state *css, struct cftype *cft) { return css->id; } static int read_priomap(struct seq_file *sf, void *v) { struct net_device *dev; rcu_read_lock(); for_each_netdev_rcu(&init_net, dev) seq_printf(sf, "%s %u\n", dev->name, netprio_prio(seq_css(sf), dev)); rcu_read_unlock(); return 0; } static ssize_t write_priomap(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { char devname[IFNAMSIZ + 1]; struct net_device *dev; u32 prio; int ret; if (sscanf(buf, "%"__stringify(IFNAMSIZ)"s %u", devname, &prio) != 2) return -EINVAL; dev = dev_get_by_name(&init_net, devname); if (!dev) return -ENODEV; rtnl_lock(); ret = netprio_set_prio(of_css(of), dev, prio); rtnl_unlock(); dev_put(dev); return ret ?: nbytes; } static int update_netprio(const void *v, struct file *file, unsigned n) { struct socket *sock = sock_from_file(file); if (sock) sock_cgroup_set_prioidx(&sock->sk->sk_cgrp_data, (unsigned long)v); return 0; } static void net_prio_attach(struct cgroup_taskset *tset) { struct task_struct *p; struct cgroup_subsys_state *css; cgroup_taskset_for_each(p, css, tset) { void *v = (void *)(unsigned long)css->id; task_lock(p); iterate_fd(p->files, 0, update_netprio, v); task_unlock(p); } } static struct cftype ss_files[] = { { .name = "prioidx", .read_u64 = read_prioidx, }, { .name = "ifpriomap", .seq_show = read_priomap, .write = write_priomap, }, { } /* terminate */ }; struct cgroup_subsys net_prio_cgrp_subsys = { .css_alloc = cgrp_css_alloc, .css_online = cgrp_css_online, .css_free = cgrp_css_free, .attach = net_prio_attach, .legacy_cftypes = ss_files, }; static int netprio_device_event(struct notifier_block *unused, unsigned long event, void *ptr) { struct net_device *dev = 
netdev_notifier_info_to_dev(ptr); struct netprio_map *old; /* * Note this is called with rtnl_lock held so we have update side * protection on our rcu assignments */ switch (event) { case NETDEV_UNREGISTER: old = rtnl_dereference(dev->priomap); RCU_INIT_POINTER(dev->priomap, NULL); if (old) kfree_rcu(old, rcu); break; } return NOTIFY_DONE; } static struct notifier_block netprio_device_notifier = { .notifier_call = netprio_device_event }; static int __init init_cgroup_netprio(void) { register_netdevice_notifier(&netprio_device_notifier); return 0; } subsys_initcall(init_cgroup_netprio);
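From userspace, all of the above is exercised through the cgroup control files registered in ss_files: prioidx is read-only, and each write to ifpriomap goes through write_priomap() as an "<ifname> <prio>" pair. A rough sketch, where the cgroup v1 mount point and the "app" cgroup name are assumptions:

/* Illustrative only: give sockets of tasks in the "app" net_prio cgroup
 * priority 5 on eth0.  The kernel side parses this in write_priomap(). */
#include <stdio.h>

int main(void)
{
	const char *path = "/sys/fs/cgroup/net_prio/app/net_prio.ifpriomap";
	FILE *f = fopen(path, "w");

	if (!f) {
		perror(path);
		return 1;
	}
	fprintf(f, "eth0 5\n");	/* matches the sscanf("%16s %u") above */
	return fclose(f) ? 1 : 0;
}

net_prio_attach() above is what keeps already-open file descriptors in sync when a task is migrated into the cgroup.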
/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_JUMP_LABEL_H #define _LINUX_JUMP_LABEL_H /* * Jump label support * * Copyright (C) 2009-2012 Jason Baron <jbaron@redhat.com> * Copyright (C) 2011-2012 Red Hat, Inc., Peter Zijlstra * * DEPRECATED API: * * The use of 'struct static_key' directly, is now DEPRECATED. In addition * static_key_{true,false}() is also DEPRECATED. IE DO NOT use the following: * * struct static_key false = STATIC_KEY_INIT_FALSE; * struct static_key true = STATIC_KEY_INIT_TRUE; * static_key_true() * static_key_false() * * The updated API replacements are: * * DEFINE_STATIC_KEY_TRUE(key); * DEFINE_STATIC_KEY_FALSE(key); * DEFINE_STATIC_KEY_ARRAY_TRUE(keys, count); * DEFINE_STATIC_KEY_ARRAY_FALSE(keys, count); * static_branch_likely() * static_branch_unlikely() * * Jump labels provide an interface to generate dynamic branches using * self-modifying code. Assuming toolchain and architecture support, if we * define a "key" that is initially false via "DEFINE_STATIC_KEY_FALSE(key)", * an "if (static_branch_unlikely(&key))" statement is an unconditional branch * (which defaults to false - and the true block is placed out of line). * Similarly, we can define an initially true key via * "DEFINE_STATIC_KEY_TRUE(key)", and use it in the same * "if (static_branch_unlikely(&key))", in which case we will generate an * unconditional branch to the out-of-line true branch.
Keys that are * initially true or false can be using in both static_branch_unlikely() * and static_branch_likely() statements. * * At runtime we can change the branch target by setting the key * to true via a call to static_branch_enable(), or false using * static_branch_disable(). If the direction of the branch is switched by * these calls then we run-time modify the branch target via a * no-op -> jump or jump -> no-op conversion. For example, for an * initially false key that is used in an "if (static_branch_unlikely(&key))" * statement, setting the key to true requires us to patch in a jump * to the out-of-line of true branch. * * In addition to static_branch_{enable,disable}, we can also reference count * the key or branch direction via static_branch_{inc,dec}. Thus, * static_branch_inc() can be thought of as a 'make more true' and * static_branch_dec() as a 'make more false'. * * Since this relies on modifying code, the branch modifying functions * must be considered absolute slow paths (machine wide synchronization etc.). * OTOH, since the affected branches are unconditional, their runtime overhead * will be absolutely minimal, esp. in the default (off) case where the total * effect is a single NOP of appropriate size. The on case will patch in a jump * to the out-of-line block. * * When the control is directly exposed to userspace, it is prudent to delay the * decrement to avoid high frequency code modifications which can (and do) * cause significant performance degradation. Struct static_key_deferred and * static_key_slow_dec_deferred() provide for this. * * Lacking toolchain and or architecture support, static keys fall back to a * simple conditional branch. * * Additional babbling in: Documentation/staging/static-keys.rst */ #ifndef __ASSEMBLY__ #include <linux/types.h> #include <linux/compiler.h> #include <linux/cleanup.h> extern bool static_key_initialized; #define STATIC_KEY_CHECK_USE(key) WARN(!static_key_initialized, \ "%s(): static key '%pS' used before call to jump_label_init()", \ __func__, (key)) struct static_key { atomic_t enabled; #ifdef CONFIG_JUMP_LABEL /* * Note: * To make anonymous unions work with old compilers, the static * initialization of them requires brackets. This creates a dependency * on the order of the struct with the initializers. If any fields * are added, STATIC_KEY_INIT_TRUE and STATIC_KEY_INIT_FALSE may need * to be modified. 
* * bit 0 => 1 if key is initially true * 0 if initially false * bit 1 => 1 if points to struct static_key_mod * 0 if points to struct jump_entry */ union { unsigned long type; struct jump_entry *entries; struct static_key_mod *next; }; #endif /* CONFIG_JUMP_LABEL */ }; #endif /* __ASSEMBLY__ */ #ifdef CONFIG_JUMP_LABEL #include <asm/jump_label.h> #ifndef __ASSEMBLY__ #ifdef CONFIG_HAVE_ARCH_JUMP_LABEL_RELATIVE struct jump_entry { s32 code; s32 target; long key; // key may be far away from the core kernel under KASLR }; static inline unsigned long jump_entry_code(const struct jump_entry *entry) { return (unsigned long)&entry->code + entry->code; } static inline unsigned long jump_entry_target(const struct jump_entry *entry) { return (unsigned long)&entry->target + entry->target; } static inline struct static_key *jump_entry_key(const struct jump_entry *entry) { long offset = entry->key & ~3L; return (struct static_key *)((unsigned long)&entry->key + offset); } #else static inline unsigned long jump_entry_code(const struct jump_entry *entry) { return entry->code; } static inline unsigned long jump_entry_target(const struct jump_entry *entry) { return entry->target; } static inline struct static_key *jump_entry_key(const struct jump_entry *entry) { return (struct static_key *)((unsigned long)entry->key & ~3UL); } #endif static inline bool jump_entry_is_branch(const struct jump_entry *entry) { return (unsigned long)entry->key & 1UL; } static inline bool jump_entry_is_init(const struct jump_entry *entry) { return (unsigned long)entry->key & 2UL; } static inline void jump_entry_set_init(struct jump_entry *entry, bool set) { if (set) entry->key |= 2; else entry->key &= ~2; } static inline int jump_entry_size(struct jump_entry *entry) { #ifdef JUMP_LABEL_NOP_SIZE return JUMP_LABEL_NOP_SIZE; #else return arch_jump_entry_size(entry); #endif } #endif #endif #ifndef __ASSEMBLY__ enum jump_label_type { JUMP_LABEL_NOP = 0, JUMP_LABEL_JMP, }; struct module; #ifdef CONFIG_JUMP_LABEL #define JUMP_TYPE_FALSE 0UL #define JUMP_TYPE_TRUE 1UL #define JUMP_TYPE_LINKED 2UL #define JUMP_TYPE_MASK 3UL static __always_inline bool static_key_false(struct static_key *key) { return arch_static_branch(key, false); } static __always_inline bool static_key_true(struct static_key *key) { return !arch_static_branch(key, true); } extern struct jump_entry __start___jump_table[]; extern struct jump_entry __stop___jump_table[]; extern void jump_label_init(void); extern void jump_label_init_ro(void); extern void jump_label_lock(void); extern void jump_label_unlock(void); extern void arch_jump_label_transform(struct jump_entry *entry, enum jump_label_type type); extern bool arch_jump_label_transform_queue(struct jump_entry *entry, enum jump_label_type type); extern void arch_jump_label_transform_apply(void); extern int jump_label_text_reserved(void *start, void *end); extern bool static_key_slow_inc(struct static_key *key); extern bool static_key_fast_inc_not_disabled(struct static_key *key); extern void static_key_slow_dec(struct static_key *key); extern bool static_key_slow_inc_cpuslocked(struct static_key *key); extern void static_key_slow_dec_cpuslocked(struct static_key *key); extern int static_key_count(struct static_key *key); extern void static_key_enable(struct static_key *key); extern void static_key_disable(struct static_key *key); extern void static_key_enable_cpuslocked(struct static_key *key); extern void static_key_disable_cpuslocked(struct static_key *key); extern enum jump_label_type jump_label_init_type(struct 
jump_entry *entry); /* * We should be using ATOMIC_INIT() for initializing .enabled, but * the inclusion of atomic.h is problematic for inclusion of jump_label.h * in 'low-level' headers. Thus, we are initializing .enabled with a * raw value, but have added a BUILD_BUG_ON() to catch any issues in * jump_label_init() see: kernel/jump_label.c. */ #define STATIC_KEY_INIT_TRUE \ { .enabled = { 1 }, \ { .type = JUMP_TYPE_TRUE } } #define STATIC_KEY_INIT_FALSE \ { .enabled = { 0 }, \ { .type = JUMP_TYPE_FALSE } } #else /* !CONFIG_JUMP_LABEL */ #include <linux/atomic.h> #include <linux/bug.h> static __always_inline int static_key_count(struct static_key *key) { return raw_atomic_read(&key->enabled); } static __always_inline void jump_label_init(void) { static_key_initialized = true; } static __always_inline void jump_label_init_ro(void) { } static __always_inline bool static_key_false(struct static_key *key) { if (unlikely_notrace(static_key_count(key) > 0)) return true; return false; } static __always_inline bool static_key_true(struct static_key *key) { if (likely_notrace(static_key_count(key) > 0)) return true; return false; } static inline bool static_key_fast_inc_not_disabled(struct static_key *key) { int v; STATIC_KEY_CHECK_USE(key); /* * Prevent key->enabled getting negative to follow the same semantics * as for CONFIG_JUMP_LABEL=y, see kernel/jump_label.c comment. */ v = atomic_read(&key->enabled); do { if (v < 0 || (v + 1) < 0) return false; } while (!likely(atomic_try_cmpxchg(&key->enabled, &v, v + 1))); return true; } #define static_key_slow_inc(key) static_key_fast_inc_not_disabled(key) static inline void static_key_slow_dec(struct static_key *key) { STATIC_KEY_CHECK_USE(key); atomic_dec(&key->enabled); } #define static_key_slow_inc_cpuslocked(key) static_key_slow_inc(key) #define static_key_slow_dec_cpuslocked(key) static_key_slow_dec(key) static inline int jump_label_text_reserved(void *start, void *end) { return 0; } static inline void jump_label_lock(void) {} static inline void jump_label_unlock(void) {} static inline void static_key_enable(struct static_key *key) { STATIC_KEY_CHECK_USE(key); if (atomic_read(&key->enabled) != 0) { WARN_ON_ONCE(atomic_read(&key->enabled) != 1); return; } atomic_set(&key->enabled, 1); } static inline void static_key_disable(struct static_key *key) { STATIC_KEY_CHECK_USE(key); if (atomic_read(&key->enabled) != 1) { WARN_ON_ONCE(atomic_read(&key->enabled) != 0); return; } atomic_set(&key->enabled, 0); } #define static_key_enable_cpuslocked(k) static_key_enable((k)) #define static_key_disable_cpuslocked(k) static_key_disable((k)) #define STATIC_KEY_INIT_TRUE { .enabled = ATOMIC_INIT(1) } #define STATIC_KEY_INIT_FALSE { .enabled = ATOMIC_INIT(0) } #endif /* CONFIG_JUMP_LABEL */ DEFINE_LOCK_GUARD_0(jump_label_lock, jump_label_lock(), jump_label_unlock()) #define STATIC_KEY_INIT STATIC_KEY_INIT_FALSE #define jump_label_enabled static_key_enabled /* -------------------------------------------------------------------------- */ /* * Two type wrappers around static_key, such that we can use compile time * type differentiation to emit the right code. * * All the below code is macros in order to play type games. 
*/ struct static_key_true { struct static_key key; }; struct static_key_false { struct static_key key; }; #define STATIC_KEY_TRUE_INIT (struct static_key_true) { .key = STATIC_KEY_INIT_TRUE, } #define STATIC_KEY_FALSE_INIT (struct static_key_false){ .key = STATIC_KEY_INIT_FALSE, } #define DEFINE_STATIC_KEY_TRUE(name) \ struct static_key_true name = STATIC_KEY_TRUE_INIT #define DEFINE_STATIC_KEY_TRUE_RO(name) \ struct static_key_true name __ro_after_init = STATIC_KEY_TRUE_INIT #define DECLARE_STATIC_KEY_TRUE(name) \ extern struct static_key_true name #define DEFINE_STATIC_KEY_FALSE(name) \ struct static_key_false name = STATIC_KEY_FALSE_INIT #define DEFINE_STATIC_KEY_FALSE_RO(name) \ struct static_key_false name __ro_after_init = STATIC_KEY_FALSE_INIT #define DECLARE_STATIC_KEY_FALSE(name) \ extern struct static_key_false name #define DEFINE_STATIC_KEY_ARRAY_TRUE(name, count) \ struct static_key_true name[count] = { \ [0 ... (count) - 1] = STATIC_KEY_TRUE_INIT, \ } #define DEFINE_STATIC_KEY_ARRAY_FALSE(name, count) \ struct static_key_false name[count] = { \ [0 ... (count) - 1] = STATIC_KEY_FALSE_INIT, \ } #define _DEFINE_STATIC_KEY_1(name) DEFINE_STATIC_KEY_TRUE(name) #define _DEFINE_STATIC_KEY_0(name) DEFINE_STATIC_KEY_FALSE(name) #define DEFINE_STATIC_KEY_MAYBE(cfg, name) \ __PASTE(_DEFINE_STATIC_KEY_, IS_ENABLED(cfg))(name) #define _DEFINE_STATIC_KEY_RO_1(name) DEFINE_STATIC_KEY_TRUE_RO(name) #define _DEFINE_STATIC_KEY_RO_0(name) DEFINE_STATIC_KEY_FALSE_RO(name) #define DEFINE_STATIC_KEY_MAYBE_RO(cfg, name) \ __PASTE(_DEFINE_STATIC_KEY_RO_, IS_ENABLED(cfg))(name) #define _DECLARE_STATIC_KEY_1(name) DECLARE_STATIC_KEY_TRUE(name) #define _DECLARE_STATIC_KEY_0(name) DECLARE_STATIC_KEY_FALSE(name) #define DECLARE_STATIC_KEY_MAYBE(cfg, name) \ __PASTE(_DECLARE_STATIC_KEY_, IS_ENABLED(cfg))(name) extern bool ____wrong_branch_error(void); #define static_key_enabled(x) \ ({ \ if (!__builtin_types_compatible_p(typeof(*x), struct static_key) && \ !__builtin_types_compatible_p(typeof(*x), struct static_key_true) &&\ !__builtin_types_compatible_p(typeof(*x), struct static_key_false)) \ ____wrong_branch_error(); \ static_key_count((struct static_key *)x) > 0; \ }) #ifdef CONFIG_JUMP_LABEL /* * Combine the right initial value (type) with the right branch order * to generate the desired result. * * * type\branch| likely (1) | unlikely (0) * -----------+-----------------------+------------------ * | | * true (1) | ... | ... * | NOP | JMP L * | <br-stmts> | 1: ... * | L: ... | * | | * | | L: <br-stmts> * | | jmp 1b * | | * -----------+-----------------------+------------------ * | | * false (0) | ... | ... * | JMP L | NOP * | <br-stmts> | 1: ... * | L: ... | * | | * | | L: <br-stmts> * | | jmp 1b * | | * -----------+-----------------------+------------------ * * The initial value is encoded in the LSB of static_key::entries, * type: 0 = false, 1 = true. * * The branch type is encoded in the LSB of jump_entry::key, * branch: 0 = unlikely, 1 = likely. * * This gives the following logic table: * * enabled type branch instuction * -----------------------------+----------- * 0 0 0 | NOP * 0 0 1 | JMP * 0 1 0 | NOP * 0 1 1 | JMP * * 1 0 0 | JMP * 1 0 1 | NOP * 1 1 0 | JMP * 1 1 1 | NOP * * Which gives the following functions: * * dynamic: instruction = enabled ^ branch * static: instruction = type ^ branch * * See jump_label_type() / jump_label_init_type(). 
*/ #define static_branch_likely(x) \ ({ \ bool branch; \ if (__builtin_types_compatible_p(typeof(*x), struct static_key_true)) \ branch = !arch_static_branch(&(x)->key, true); \ else if (__builtin_types_compatible_p(typeof(*x), struct static_key_false)) \ branch = !arch_static_branch_jump(&(x)->key, true); \ else \ branch = ____wrong_branch_error(); \ likely_notrace(branch); \ }) #define static_branch_unlikely(x) \ ({ \ bool branch; \ if (__builtin_types_compatible_p(typeof(*x), struct static_key_true)) \ branch = arch_static_branch_jump(&(x)->key, false); \ else if (__builtin_types_compatible_p(typeof(*x), struct static_key_false)) \ branch = arch_static_branch(&(x)->key, false); \ else \ branch = ____wrong_branch_error(); \ unlikely_notrace(branch); \ }) #else /* !CONFIG_JUMP_LABEL */ #define static_branch_likely(x) likely_notrace(static_key_enabled(&(x)->key)) #define static_branch_unlikely(x) unlikely_notrace(static_key_enabled(&(x)->key)) #endif /* CONFIG_JUMP_LABEL */ #define static_branch_maybe(config, x) \ (IS_ENABLED(config) ? static_branch_likely(x) \ : static_branch_unlikely(x)) /* * Advanced usage; refcount, branch is enabled when: count != 0 */ #define static_branch_inc(x) static_key_slow_inc(&(x)->key) #define static_branch_dec(x) static_key_slow_dec(&(x)->key) #define static_branch_inc_cpuslocked(x) static_key_slow_inc_cpuslocked(&(x)->key) #define static_branch_dec_cpuslocked(x) static_key_slow_dec_cpuslocked(&(x)->key) /* * Normal usage; boolean enable/disable. */ #define static_branch_enable(x) static_key_enable(&(x)->key) #define static_branch_disable(x) static_key_disable(&(x)->key) #define static_branch_enable_cpuslocked(x) static_key_enable_cpuslocked(&(x)->key) #define static_branch_disable_cpuslocked(x) static_key_disable_cpuslocked(&(x)->key) #endif /* __ASSEMBLY__ */ #endif /* _LINUX_JUMP_LABEL_H */
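Tying the pieces together, typical (non-deprecated) usage looks like the sketch below; the feature name and the slow-path function are made up, only the static-key macros come from this header:

/* Illustrative kernel-style consumer of the static key API. */
#include <linux/jump_label.h>

/* Defaults to "off": the fast path costs a single NOP while disabled. */
static DEFINE_STATIC_KEY_FALSE(example_feature_key);

void example_hot_path(void)
{
	if (static_branch_unlikely(&example_feature_key))
		example_expensive_debug_hook();	/* hypothetical out-of-line path */
}

/* Slow path, e.g. from a sysctl handler: patching the branch is expensive
 * (machine-wide sync), so only flip it on actual configuration changes. */
void example_feature_set(bool on)
{
	if (on)
		static_branch_enable(&example_feature_key);
	else
		static_branch_disable(&example_feature_key);
}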
// SPDX-License-Identifier: GPL-2.0-only /* * Copyright (c) 2013 Red Hat, Inc. and Parallels Inc. All rights reserved.
* Authors: David Chinner and Glauber Costa * * Generic LRU infrastructure */ #include <linux/kernel.h> #include <linux/module.h> #include <linux/mm.h> #include <linux/list_lru.h> #include <linux/slab.h> #include <linux/mutex.h> #include <linux/memcontrol.h> #include "slab.h" #include "internal.h" #ifdef CONFIG_MEMCG static LIST_HEAD(memcg_list_lrus); static DEFINE_MUTEX(list_lrus_mutex); static inline bool list_lru_memcg_aware(struct list_lru *lru) { return lru->memcg_aware; } static void list_lru_register(struct list_lru *lru) { if (!list_lru_memcg_aware(lru)) return; mutex_lock(&list_lrus_mutex); list_add(&lru->list, &memcg_list_lrus); mutex_unlock(&list_lrus_mutex); } static void list_lru_unregister(struct list_lru *lru) { if (!list_lru_memcg_aware(lru)) return; mutex_lock(&list_lrus_mutex); list_del(&lru->list); mutex_unlock(&list_lrus_mutex); } static int lru_shrinker_id(struct list_lru *lru) { return lru->shrinker_id; } static inline struct list_lru_one * list_lru_from_memcg_idx(struct list_lru *lru, int nid, int idx) { if (list_lru_memcg_aware(lru) && idx >= 0) { struct list_lru_memcg *mlru = xa_load(&lru->xa, idx); return mlru ? &mlru->node[nid] : NULL; } return &lru->node[nid].lru; } static inline bool lock_list_lru(struct list_lru_one *l, bool irq) { if (irq) spin_lock_irq(&l->lock); else spin_lock(&l->lock); if (unlikely(READ_ONCE(l->nr_items) == LONG_MIN)) { if (irq) spin_unlock_irq(&l->lock); else spin_unlock(&l->lock); return false; } return true; } static inline struct list_lru_one * lock_list_lru_of_memcg(struct list_lru *lru, int nid, struct mem_cgroup *memcg, bool irq, bool skip_empty) { struct list_lru_one *l; rcu_read_lock(); again: l = list_lru_from_memcg_idx(lru, nid, memcg_kmem_id(memcg)); if (likely(l) && lock_list_lru(l, irq)) { rcu_read_unlock(); return l; } /* * Caller may simply bail out if raced with reparenting or * may iterate through the list_lru and expect empty slots. */ if (skip_empty) { rcu_read_unlock(); return NULL; } VM_WARN_ON(!css_is_dying(&memcg->css)); memcg = parent_mem_cgroup(memcg); goto again; } static inline void unlock_list_lru(struct list_lru_one *l, bool irq_off) { if (irq_off) spin_unlock_irq(&l->lock); else spin_unlock(&l->lock); } #else static void list_lru_register(struct list_lru *lru) { } static void list_lru_unregister(struct list_lru *lru) { } static int lru_shrinker_id(struct list_lru *lru) { return -1; } static inline bool list_lru_memcg_aware(struct list_lru *lru) { return false; } static inline struct list_lru_one * list_lru_from_memcg_idx(struct list_lru *lru, int nid, int idx) { return &lru->node[nid].lru; } static inline struct list_lru_one * lock_list_lru_of_memcg(struct list_lru *lru, int nid, struct mem_cgroup *memcg, bool irq, bool skip_empty) { struct list_lru_one *l = &lru->node[nid].lru; if (irq) spin_lock_irq(&l->lock); else spin_lock(&l->lock); return l; } static inline void unlock_list_lru(struct list_lru_one *l, bool irq_off) { if (irq_off) spin_unlock_irq(&l->lock); else spin_unlock(&l->lock); } #endif /* CONFIG_MEMCG */ /* The caller must ensure the memcg lifetime. 
*/ bool list_lru_add(struct list_lru *lru, struct list_head *item, int nid, struct mem_cgroup *memcg) { struct list_lru_node *nlru = &lru->node[nid]; struct list_lru_one *l; l = lock_list_lru_of_memcg(lru, nid, memcg, false, false); if (!l) return false; if (list_empty(item)) { list_add_tail(item, &l->list); /* Set shrinker bit if the first element was added */ if (!l->nr_items++) set_shrinker_bit(memcg, nid, lru_shrinker_id(lru)); unlock_list_lru(l, false); atomic_long_inc(&nlru->nr_items); return true; } unlock_list_lru(l, false); return false; } bool list_lru_add_obj(struct list_lru *lru, struct list_head *item) { bool ret; int nid = page_to_nid(virt_to_page(item)); if (list_lru_memcg_aware(lru)) { rcu_read_lock(); ret = list_lru_add(lru, item, nid, mem_cgroup_from_slab_obj(item)); rcu_read_unlock(); } else { ret = list_lru_add(lru, item, nid, NULL); } return ret; } EXPORT_SYMBOL_GPL(list_lru_add_obj); /* The caller must ensure the memcg lifetime. */ bool list_lru_del(struct list_lru *lru, struct list_head *item, int nid, struct mem_cgroup *memcg) { struct list_lru_node *nlru = &lru->node[nid]; struct list_lru_one *l; l = lock_list_lru_of_memcg(lru, nid, memcg, false, false); if (!l) return false; if (!list_empty(item)) { list_del_init(item); l->nr_items--; unlock_list_lru(l, false); atomic_long_dec(&nlru->nr_items); return true; } unlock_list_lru(l, false); return false; } bool list_lru_del_obj(struct list_lru *lru, struct list_head *item) { bool ret; int nid = page_to_nid(virt_to_page(item)); if (list_lru_memcg_aware(lru)) { rcu_read_lock(); ret = list_lru_del(lru, item, nid, mem_cgroup_from_slab_obj(item)); rcu_read_unlock(); } else { ret = list_lru_del(lru, item, nid, NULL); } return ret; } EXPORT_SYMBOL_GPL(list_lru_del_obj); void list_lru_isolate(struct list_lru_one *list, struct list_head *item) { list_del_init(item); list->nr_items--; } EXPORT_SYMBOL_GPL(list_lru_isolate); void list_lru_isolate_move(struct list_lru_one *list, struct list_head *item, struct list_head *head) { list_move(item, head); list->nr_items--; } EXPORT_SYMBOL_GPL(list_lru_isolate_move); unsigned long list_lru_count_one(struct list_lru *lru, int nid, struct mem_cgroup *memcg) { struct list_lru_one *l; long count; rcu_read_lock(); l = list_lru_from_memcg_idx(lru, nid, memcg_kmem_id(memcg)); count = l ? READ_ONCE(l->nr_items) : 0; rcu_read_unlock(); if (unlikely(count < 0)) count = 0; return count; } EXPORT_SYMBOL_GPL(list_lru_count_one); unsigned long list_lru_count_node(struct list_lru *lru, int nid) { struct list_lru_node *nlru; nlru = &lru->node[nid]; return atomic_long_read(&nlru->nr_items); } EXPORT_SYMBOL_GPL(list_lru_count_node); static unsigned long __list_lru_walk_one(struct list_lru *lru, int nid, struct mem_cgroup *memcg, list_lru_walk_cb isolate, void *cb_arg, unsigned long *nr_to_walk, bool irq_off) { struct list_lru_node *nlru = &lru->node[nid]; struct list_lru_one *l = NULL; struct list_head *item, *n; unsigned long isolated = 0; restart: l = lock_list_lru_of_memcg(lru, nid, memcg, irq_off, true); if (!l) return isolated; list_for_each_safe(item, n, &l->list) { enum lru_status ret; /* * decrement nr_to_walk first so that we don't livelock if we * get stuck on large numbers of LRU_RETRY items */ if (!*nr_to_walk) break; --*nr_to_walk; ret = isolate(item, l, cb_arg); switch (ret) { /* * LRU_RETRY, LRU_REMOVED_RETRY and LRU_STOP will drop the lru * lock. List traversal will have to restart from scratch. 
*/ case LRU_RETRY: goto restart; case LRU_REMOVED_RETRY: fallthrough; case LRU_REMOVED: isolated++; atomic_long_dec(&nlru->nr_items); if (ret == LRU_REMOVED_RETRY) goto restart; break; case LRU_ROTATE: list_move_tail(item, &l->list); break; case LRU_SKIP: break; case LRU_STOP: goto out; default: BUG(); } } unlock_list_lru(l, irq_off); out: return isolated; } unsigned long list_lru_walk_one(struct list_lru *lru, int nid, struct mem_cgroup *memcg, list_lru_walk_cb isolate, void *cb_arg, unsigned long *nr_to_walk) { return __list_lru_walk_one(lru, nid, memcg, isolate, cb_arg, nr_to_walk, false); } EXPORT_SYMBOL_GPL(list_lru_walk_one); unsigned long list_lru_walk_one_irq(struct list_lru *lru, int nid, struct mem_cgroup *memcg, list_lru_walk_cb isolate, void *cb_arg, unsigned long *nr_to_walk) { return __list_lru_walk_one(lru, nid, memcg, isolate, cb_arg, nr_to_walk, true); } unsigned long list_lru_walk_node(struct list_lru *lru, int nid, list_lru_walk_cb isolate, void *cb_arg, unsigned long *nr_to_walk) { long isolated = 0; isolated += list_lru_walk_one(lru, nid, NULL, isolate, cb_arg, nr_to_walk); #ifdef CONFIG_MEMCG if (*nr_to_walk > 0 && list_lru_memcg_aware(lru)) { struct list_lru_memcg *mlru; struct mem_cgroup *memcg; unsigned long index; xa_for_each(&lru->xa, index, mlru) { rcu_read_lock(); memcg = mem_cgroup_from_id(index); if (!mem_cgroup_tryget(memcg)) { rcu_read_unlock(); continue; } rcu_read_unlock(); isolated += __list_lru_walk_one(lru, nid, memcg, isolate, cb_arg, nr_to_walk, false); mem_cgroup_put(memcg); if (*nr_to_walk <= 0) break; } } #endif return isolated; } EXPORT_SYMBOL_GPL(list_lru_walk_node); static void init_one_lru(struct list_lru *lru, struct list_lru_one *l) { INIT_LIST_HEAD(&l->list); spin_lock_init(&l->lock); l->nr_items = 0; #ifdef CONFIG_LOCKDEP if (lru->key) lockdep_set_class(&l->lock, lru->key); #endif } #ifdef CONFIG_MEMCG static struct list_lru_memcg *memcg_init_list_lru_one(struct list_lru *lru, gfp_t gfp) { int nid; struct list_lru_memcg *mlru; mlru = kmalloc(struct_size(mlru, node, nr_node_ids), gfp); if (!mlru) return NULL; for_each_node(nid) init_one_lru(lru, &mlru->node[nid]); return mlru; } static inline void memcg_init_list_lru(struct list_lru *lru, bool memcg_aware) { if (memcg_aware) xa_init_flags(&lru->xa, XA_FLAGS_LOCK_IRQ); lru->memcg_aware = memcg_aware; } static void memcg_destroy_list_lru(struct list_lru *lru) { XA_STATE(xas, &lru->xa, 0); struct list_lru_memcg *mlru; if (!list_lru_memcg_aware(lru)) return; xas_lock_irq(&xas); xas_for_each(&xas, mlru, ULONG_MAX) { kfree(mlru); xas_store(&xas, NULL); } xas_unlock_irq(&xas); } static void memcg_reparent_list_lru_one(struct list_lru *lru, int nid, struct list_lru_one *src, struct mem_cgroup *dst_memcg) { int dst_idx = dst_memcg->kmemcg_id; struct list_lru_one *dst; spin_lock_irq(&src->lock); dst = list_lru_from_memcg_idx(lru, nid, dst_idx); spin_lock_nested(&dst->lock, SINGLE_DEPTH_NESTING); list_splice_init(&src->list, &dst->list); if (src->nr_items) { WARN_ON(src->nr_items < 0); dst->nr_items += src->nr_items; set_shrinker_bit(dst_memcg, nid, lru_shrinker_id(lru)); } /* Mark the list_lru_one dead */ src->nr_items = LONG_MIN; spin_unlock(&dst->lock); spin_unlock_irq(&src->lock); } void memcg_reparent_list_lrus(struct mem_cgroup *memcg, struct mem_cgroup *parent) { struct list_lru *lru; int i; mutex_lock(&list_lrus_mutex); list_for_each_entry(lru, &memcg_list_lrus, list) { struct list_lru_memcg *mlru; XA_STATE(xas, &lru->xa, memcg->kmemcg_id); /* * Lock the Xarray to ensure no on going 
list_lru_memcg * allocation and further allocation will see css_is_dying(). */ xas_lock_irq(&xas); mlru = xas_store(&xas, NULL); xas_unlock_irq(&xas); if (!mlru) continue; /* * With Xarray value set to NULL, holding the lru lock below * prevents list_lru_{add,del,isolate} from touching the lru, * safe to reparent. */ for_each_node(i) memcg_reparent_list_lru_one(lru, i, &mlru->node[i], parent); /* * Here all list_lrus corresponding to the cgroup are guaranteed * to remain empty, we can safely free this lru, any further * memcg_list_lru_alloc() call will simply bail out. */ kvfree_rcu(mlru, rcu); } mutex_unlock(&list_lrus_mutex); } static inline bool memcg_list_lru_allocated(struct mem_cgroup *memcg, struct list_lru *lru) { int idx = memcg->kmemcg_id; return idx < 0 || xa_load(&lru->xa, idx); } int memcg_list_lru_alloc(struct mem_cgroup *memcg, struct list_lru *lru, gfp_t gfp) { unsigned long flags; struct list_lru_memcg *mlru = NULL; struct mem_cgroup *pos, *parent; XA_STATE(xas, &lru->xa, 0); if (!list_lru_memcg_aware(lru) || memcg_list_lru_allocated(memcg, lru)) return 0; gfp &= GFP_RECLAIM_MASK; /* * Because the list_lru can be reparented to the parent cgroup's * list_lru, we should make sure that this cgroup and all its * ancestors have allocated list_lru_memcg. */ do { /* * Keep finding the farest parent that wasn't populated * until found memcg itself. */ pos = memcg; parent = parent_mem_cgroup(pos); while (!memcg_list_lru_allocated(parent, lru)) { pos = parent; parent = parent_mem_cgroup(pos); } if (!mlru) { mlru = memcg_init_list_lru_one(lru, gfp); if (!mlru) return -ENOMEM; } xas_set(&xas, pos->kmemcg_id); do { xas_lock_irqsave(&xas, flags); if (!xas_load(&xas) && !css_is_dying(&pos->css)) { xas_store(&xas, mlru); if (!xas_error(&xas)) mlru = NULL; } xas_unlock_irqrestore(&xas, flags); } while (xas_nomem(&xas, gfp)); } while (pos != memcg && !css_is_dying(&pos->css)); if (unlikely(mlru)) kfree(mlru); return xas_error(&xas); } #else static inline void memcg_init_list_lru(struct list_lru *lru, bool memcg_aware) { } static void memcg_destroy_list_lru(struct list_lru *lru) { } #endif /* CONFIG_MEMCG */ int __list_lru_init(struct list_lru *lru, bool memcg_aware, struct shrinker *shrinker) { int i; #ifdef CONFIG_MEMCG if (shrinker) lru->shrinker_id = shrinker->id; else lru->shrinker_id = -1; if (mem_cgroup_kmem_disabled()) memcg_aware = false; #endif lru->node = kcalloc(nr_node_ids, sizeof(*lru->node), GFP_KERNEL); if (!lru->node) return -ENOMEM; for_each_node(i) init_one_lru(lru, &lru->node[i].lru); memcg_init_list_lru(lru, memcg_aware); list_lru_register(lru); return 0; } EXPORT_SYMBOL_GPL(__list_lru_init); void list_lru_destroy(struct list_lru *lru) { /* Already destroyed or not yet initialized? */ if (!lru->node) return; list_lru_unregister(lru); memcg_destroy_list_lru(lru); kfree(lru->node); lru->node = NULL; #ifdef CONFIG_MEMCG lru->shrinker_id = -1; #endif } EXPORT_SYMBOL_GPL(list_lru_destroy);
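As a usage illustration of the list_lru API implemented above, here is a minimal sketch (not taken from the kernel tree) of a cache owner that initializes a non-memcg-aware list_lru, publishes objects to it, and reclaims them through a walk callback. The names my_obj, my_lru, my_cache_init, my_obj_make_reclaimable, my_isolate and my_shrink are hypothetical, and shrinker registration and error handling are omitted.

#include <linux/list.h>
#include <linux/list_lru.h>
#include <linux/slab.h>

struct my_obj {
	struct list_head lru;	/* linkage used by the list_lru */
	/* ... payload ... */
};

static struct list_lru my_lru;

/* One-time setup: false => not memcg aware, no shrinker id attached. */
static int my_cache_init(void)
{
	return __list_lru_init(&my_lru, false, NULL);
}

/* Make a clean, reclaimable object visible to the LRU walker. */
static void my_obj_make_reclaimable(struct my_obj *obj)
{
	list_lru_add_obj(&my_lru, &obj->lru);
}

/* Walk callback: detach the item under the lru lock, free it later. */
static enum lru_status my_isolate(struct list_head *item,
				  struct list_lru_one *list, void *cb_arg)
{
	struct list_head *dispose = cb_arg;

	list_lru_isolate_move(list, item, dispose);
	return LRU_REMOVED;
}

/* Reclaim up to @nr objects from node @nid; returns how many were isolated. */
static unsigned long my_shrink(int nid, unsigned long nr)
{
	LIST_HEAD(dispose);
	struct my_obj *obj, *next;
	unsigned long freed;

	freed = list_lru_walk_one(&my_lru, nid, NULL, my_isolate,
				  &dispose, &nr);
	list_for_each_entry_safe(obj, next, &dispose, lru)
		kfree(obj);
	return freed;
}

The callback returns LRU_REMOVED after moving the item to a private dispose list rather than freeing it in place; the walker holds the per-list spinlock while the callback runs, so the actual freeing is deferred until after list_lru_walk_one() returns, which is the pattern real callers such as the dentry cache follow.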
// SPDX-License-Identifier:
GPL-2.0-or-later /* SCTP kernel implementation * Copyright (c) 1999-2000 Cisco, Inc. * Copyright (c) 1999-2001 Motorola, Inc. * Copyright (c) 2001-2003 International Business Machines Corp. * Copyright (c) 2001 Intel Corp. * Copyright (c) 2001 La Monte H.P. Yarroll * * This file is part of the SCTP kernel implementation * * This module provides the abstraction for an SCTP transport representing * a remote transport address. For local transport addresses, we just use * union sctp_addr. * * Please send any bug reports or fixes you make to the * email address(es): * lksctp developers <linux-sctp@vger.kernel.org> * * Written or modified by: * La Monte H.P. Yarroll <piggy@acm.org> * Karl Knutson <karl@athena.chicago.il.us> * Jon Grimm <jgrimm@us.ibm.com> * Xingang Guo <xingang.guo@intel.com> * Hui Huang <hui.huang@nokia.com> * Sridhar Samudrala <sri@us.ibm.com> * Ardelle Fan <ardelle.fan@intel.com> */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/slab.h> #include <linux/types.h> #include <linux/random.h> #include <net/sctp/sctp.h> #include <net/sctp/sm.h> /* 1st Level Abstractions. */ /* Initialize a new transport from provided memory. */ static void sctp_transport_init(struct net *net, struct sctp_transport *peer, const union sctp_addr *addr, gfp_t gfp) { /* Copy in the address. */ peer->af_specific = sctp_get_af_specific(addr->sa.sa_family); memcpy(&peer->ipaddr, addr, peer->af_specific->sockaddr_len); memset(&peer->saddr, 0, sizeof(union sctp_addr)); peer->sack_generation = 0; /* From 6.3.1 RTO Calculation: * * C1) Until an RTT measurement has been made for a packet sent to the * given destination transport address, set RTO to the protocol * parameter 'RTO.Initial'. */ peer->rto = msecs_to_jiffies(net->sctp.rto_initial); peer->last_time_heard = 0; peer->last_time_ecne_reduced = jiffies; peer->param_flags = SPP_HB_DISABLE | SPP_PMTUD_ENABLE | SPP_SACKDELAY_ENABLE; /* Initialize the default path max_retrans. */ peer->pathmaxrxt = net->sctp.max_retrans_path; peer->pf_retrans = net->sctp.pf_retrans; INIT_LIST_HEAD(&peer->transmitted); INIT_LIST_HEAD(&peer->send_ready); INIT_LIST_HEAD(&peer->transports); timer_setup(&peer->T3_rtx_timer, sctp_generate_t3_rtx_event, 0); timer_setup(&peer->hb_timer, sctp_generate_heartbeat_event, 0); timer_setup(&peer->reconf_timer, sctp_generate_reconf_event, 0); timer_setup(&peer->probe_timer, sctp_generate_probe_event, 0); timer_setup(&peer->proto_unreach_timer, sctp_generate_proto_unreach_event, 0); /* Initialize the 64-bit random nonce sent with heartbeat. */ get_random_bytes(&peer->hb_nonce, sizeof(peer->hb_nonce)); refcount_set(&peer->refcnt, 1); } /* Allocate and initialize a new transport. */ struct sctp_transport *sctp_transport_new(struct net *net, const union sctp_addr *addr, gfp_t gfp) { struct sctp_transport *transport; transport = kzalloc(sizeof(*transport), gfp); if (!transport) return NULL; sctp_transport_init(net, transport, addr, gfp); SCTP_DBG_OBJCNT_INC(transport); return transport; } /* This transport is no longer needed. Free up if possible, or * delay until it last reference count. */ void sctp_transport_free(struct sctp_transport *transport) { transport->dead = 1; /* Try to delete the heartbeat timer. */ if (timer_delete(&transport->hb_timer)) sctp_transport_put(transport); /* Delete the T3_rtx timer if it's active. * There is no point in not doing this now and letting * structure hang around in memory since we know * the transport is going away. 
*/ if (timer_delete(&transport->T3_rtx_timer)) sctp_transport_put(transport); if (timer_delete(&transport->reconf_timer)) sctp_transport_put(transport); if (timer_delete(&transport->probe_timer)) sctp_transport_put(transport); /* Delete the ICMP proto unreachable timer if it's active. */ if (timer_delete(&transport->proto_unreach_timer)) sctp_transport_put(transport); sctp_transport_put(transport); } static void sctp_transport_destroy_rcu(struct rcu_head *head) { struct sctp_transport *transport; transport = container_of(head, struct sctp_transport, rcu); dst_release(transport->dst); kfree(transport); SCTP_DBG_OBJCNT_DEC(transport); } /* Destroy the transport data structure. * Assumes there are no more users of this structure. */ static void sctp_transport_destroy(struct sctp_transport *transport) { if (unlikely(refcount_read(&transport->refcnt))) { WARN(1, "Attempt to destroy undead transport %p!\n", transport); return; } sctp_packet_free(&transport->packet); if (transport->asoc) sctp_association_put(transport->asoc); call_rcu(&transport->rcu, sctp_transport_destroy_rcu); } /* Start T3_rtx timer if it is not already running and update the heartbeat * timer. This routine is called every time a DATA chunk is sent. */ void sctp_transport_reset_t3_rtx(struct sctp_transport *transport) { /* RFC 2960 6.3.2 Retransmission Timer Rules * * R1) Every time a DATA chunk is sent to any address(including a * retransmission), if the T3-rtx timer of that address is not running * start it running so that it will expire after the RTO of that * address. */ if (!timer_pending(&transport->T3_rtx_timer)) if (!mod_timer(&transport->T3_rtx_timer, jiffies + transport->rto)) sctp_transport_hold(transport); } void sctp_transport_reset_hb_timer(struct sctp_transport *transport) { unsigned long expires; /* When a data chunk is sent, reset the heartbeat interval. */ expires = jiffies + sctp_transport_timeout(transport); if (!mod_timer(&transport->hb_timer, expires + get_random_u32_below(transport->rto))) sctp_transport_hold(transport); } void sctp_transport_reset_reconf_timer(struct sctp_transport *transport) { if (!timer_pending(&transport->reconf_timer)) if (!mod_timer(&transport->reconf_timer, jiffies + transport->rto)) sctp_transport_hold(transport); } void sctp_transport_reset_probe_timer(struct sctp_transport *transport) { if (!mod_timer(&transport->probe_timer, jiffies + transport->probe_interval)) sctp_transport_hold(transport); } void sctp_transport_reset_raise_timer(struct sctp_transport *transport) { if (!mod_timer(&transport->probe_timer, jiffies + transport->probe_interval * 30)) sctp_transport_hold(transport); } /* This transport has been assigned to an association. * Initialize fields from the association or from the sock itself. * Register the reference count in the association. */ void sctp_transport_set_owner(struct sctp_transport *transport, struct sctp_association *asoc) { transport->asoc = asoc; sctp_association_hold(asoc); } /* Initialize the pmtu of a transport. 
*/ void sctp_transport_pmtu(struct sctp_transport *transport, struct sock *sk) { /* If we don't have a fresh route, look one up */ if (!transport->dst || READ_ONCE(transport->dst->obsolete)) { sctp_transport_dst_release(transport); transport->af_specific->get_dst(transport, &transport->saddr, &transport->fl, sk); } if (transport->param_flags & SPP_PMTUD_DISABLE) { struct sctp_association *asoc = transport->asoc; if (!transport->pathmtu && asoc && asoc->pathmtu) transport->pathmtu = asoc->pathmtu; if (transport->pathmtu) return; } if (transport->dst) transport->pathmtu = sctp_dst_mtu(transport->dst); else transport->pathmtu = SCTP_DEFAULT_MAXSEGMENT; sctp_transport_pl_update(transport); } void sctp_transport_pl_send(struct sctp_transport *t) { if (t->pl.probe_count < SCTP_MAX_PROBES) goto out; t->pl.probe_count = 0; if (t->pl.state == SCTP_PL_BASE) { if (t->pl.probe_size == SCTP_BASE_PLPMTU) { /* BASE_PLPMTU Confirmation Failed */ t->pl.state = SCTP_PL_ERROR; /* Base -> Error */ t->pl.pmtu = SCTP_BASE_PLPMTU; t->pathmtu = t->pl.pmtu + sctp_transport_pl_hlen(t); sctp_assoc_sync_pmtu(t->asoc); } } else if (t->pl.state == SCTP_PL_SEARCH) { if (t->pl.pmtu == t->pl.probe_size) { /* Black Hole Detected */ t->pl.state = SCTP_PL_BASE; /* Search -> Base */ t->pl.probe_size = SCTP_BASE_PLPMTU; t->pl.probe_high = 0; t->pl.pmtu = SCTP_BASE_PLPMTU; t->pathmtu = t->pl.pmtu + sctp_transport_pl_hlen(t); sctp_assoc_sync_pmtu(t->asoc); } else { /* Normal probe failure. */ t->pl.probe_high = t->pl.probe_size; t->pl.probe_size = t->pl.pmtu; } } else if (t->pl.state == SCTP_PL_COMPLETE) { if (t->pl.pmtu == t->pl.probe_size) { /* Black Hole Detected */ t->pl.state = SCTP_PL_BASE; /* Search Complete -> Base */ t->pl.probe_size = SCTP_BASE_PLPMTU; t->pl.pmtu = SCTP_BASE_PLPMTU; t->pathmtu = t->pl.pmtu + sctp_transport_pl_hlen(t); sctp_assoc_sync_pmtu(t->asoc); } } out: pr_debug("%s: PLPMTUD: transport: %p, state: %d, pmtu: %d, size: %d, high: %d\n", __func__, t, t->pl.state, t->pl.pmtu, t->pl.probe_size, t->pl.probe_high); t->pl.probe_count++; } bool sctp_transport_pl_recv(struct sctp_transport *t) { pr_debug("%s: PLPMTUD: transport: %p, state: %d, pmtu: %d, size: %d, high: %d\n", __func__, t, t->pl.state, t->pl.pmtu, t->pl.probe_size, t->pl.probe_high); t->pl.pmtu = t->pl.probe_size; t->pl.probe_count = 0; if (t->pl.state == SCTP_PL_BASE) { t->pl.state = SCTP_PL_SEARCH; /* Base -> Search */ t->pl.probe_size += SCTP_PL_BIG_STEP; } else if (t->pl.state == SCTP_PL_ERROR) { t->pl.state = SCTP_PL_SEARCH; /* Error -> Search */ t->pl.pmtu = t->pl.probe_size; t->pathmtu = t->pl.pmtu + sctp_transport_pl_hlen(t); sctp_assoc_sync_pmtu(t->asoc); t->pl.probe_size += SCTP_PL_BIG_STEP; } else if (t->pl.state == SCTP_PL_SEARCH) { if (!t->pl.probe_high) { if (t->pl.probe_size < SCTP_MAX_PLPMTU) { t->pl.probe_size = min(t->pl.probe_size + SCTP_PL_BIG_STEP, SCTP_MAX_PLPMTU); return false; } t->pl.probe_high = SCTP_MAX_PLPMTU; } t->pl.probe_size += SCTP_PL_MIN_STEP; if (t->pl.probe_size >= t->pl.probe_high) { t->pl.probe_high = 0; t->pl.state = SCTP_PL_COMPLETE; /* Search -> Search Complete */ t->pl.probe_size = t->pl.pmtu; t->pathmtu = t->pl.pmtu + sctp_transport_pl_hlen(t); sctp_assoc_sync_pmtu(t->asoc); sctp_transport_reset_raise_timer(t); } } else if (t->pl.state == SCTP_PL_COMPLETE) { /* Raise probe_size again after 30 * interval in Search Complete */ t->pl.state = SCTP_PL_SEARCH; /* Search Complete -> Search */ t->pl.probe_size = min(t->pl.probe_size + SCTP_PL_MIN_STEP, SCTP_MAX_PLPMTU); } return t->pl.state == 
SCTP_PL_COMPLETE; } static bool sctp_transport_pl_toobig(struct sctp_transport *t, u32 pmtu) { pr_debug("%s: PLPMTUD: transport: %p, state: %d, pmtu: %d, size: %d, ptb: %d\n", __func__, t, t->pl.state, t->pl.pmtu, t->pl.probe_size, pmtu); if (pmtu < SCTP_MIN_PLPMTU || pmtu >= t->pl.probe_size) return false; if (t->pl.state == SCTP_PL_BASE) { if (pmtu >= SCTP_MIN_PLPMTU && pmtu < SCTP_BASE_PLPMTU) { t->pl.state = SCTP_PL_ERROR; /* Base -> Error */ t->pl.pmtu = SCTP_BASE_PLPMTU; t->pathmtu = t->pl.pmtu + sctp_transport_pl_hlen(t); return true; } } else if (t->pl.state == SCTP_PL_SEARCH) { if (pmtu >= SCTP_BASE_PLPMTU && pmtu < t->pl.pmtu) { t->pl.state = SCTP_PL_BASE; /* Search -> Base */ t->pl.probe_size = SCTP_BASE_PLPMTU; t->pl.probe_count = 0; t->pl.probe_high = 0; t->pl.pmtu = SCTP_BASE_PLPMTU; t->pathmtu = t->pl.pmtu + sctp_transport_pl_hlen(t); return true; } else if (pmtu > t->pl.pmtu && pmtu < t->pl.probe_size) { t->pl.probe_size = pmtu; t->pl.probe_count = 0; } } else if (t->pl.state == SCTP_PL_COMPLETE) { if (pmtu >= SCTP_BASE_PLPMTU && pmtu < t->pl.pmtu) { t->pl.state = SCTP_PL_BASE; /* Complete -> Base */ t->pl.probe_size = SCTP_BASE_PLPMTU; t->pl.probe_count = 0; t->pl.probe_high = 0; t->pl.pmtu = SCTP_BASE_PLPMTU; t->pathmtu = t->pl.pmtu + sctp_transport_pl_hlen(t); sctp_transport_reset_probe_timer(t); return true; } } return false; } bool sctp_transport_update_pmtu(struct sctp_transport *t, u32 pmtu) { struct sock *sk = t->asoc->base.sk; struct dst_entry *dst; bool change = true; if (unlikely(pmtu < SCTP_DEFAULT_MINSEGMENT)) { pr_warn_ratelimited("%s: Reported pmtu %d too low, using default minimum of %d\n", __func__, pmtu, SCTP_DEFAULT_MINSEGMENT); /* Use default minimum segment instead */ pmtu = SCTP_DEFAULT_MINSEGMENT; } pmtu = SCTP_TRUNC4(pmtu); if (sctp_transport_pl_enabled(t)) return sctp_transport_pl_toobig(t, pmtu - sctp_transport_pl_hlen(t)); dst = sctp_transport_dst_check(t); if (dst) { struct sctp_pf *pf = sctp_get_pf_specific(dst->ops->family); union sctp_addr addr; pf->af->from_sk(&addr, sk); pf->to_sk_daddr(&t->ipaddr, sk); dst->ops->update_pmtu(dst, sk, NULL, pmtu, true); pf->to_sk_daddr(&addr, sk); dst = sctp_transport_dst_check(t); } if (!dst) { t->af_specific->get_dst(t, &t->saddr, &t->fl, sk); dst = t->dst; } if (dst) { /* Re-fetch, as under layers may have a higher minimum size */ pmtu = sctp_dst_mtu(dst); change = t->pathmtu != pmtu; } t->pathmtu = pmtu; return change; } /* Caches the dst entry and source address for a transport's destination * address. */ void sctp_transport_route(struct sctp_transport *transport, union sctp_addr *saddr, struct sctp_sock *opt) { struct sctp_association *asoc = transport->asoc; struct sctp_af *af = transport->af_specific; sctp_transport_dst_release(transport); af->get_dst(transport, saddr, &transport->fl, sctp_opt2sk(opt)); if (saddr) memcpy(&transport->saddr, saddr, sizeof(union sctp_addr)); else af->get_saddr(opt, transport, &transport->fl); sctp_transport_pmtu(transport, sctp_opt2sk(opt)); /* Initialize sk->sk_rcv_saddr, if the transport is the * association's active path for getsockname(). */ if (transport->dst && asoc && (!asoc->peer.primary_path || transport == asoc->peer.active_path)) opt->pf->to_sk_saddr(&transport->saddr, asoc->base.sk); } /* Hold a reference to a transport. */ int sctp_transport_hold(struct sctp_transport *transport) { return refcount_inc_not_zero(&transport->refcnt); } /* Release a reference to a transport and clean up * if there are no more references. 
*/ void sctp_transport_put(struct sctp_transport *transport) { if (refcount_dec_and_test(&transport->refcnt)) sctp_transport_destroy(transport); } /* Update transport's RTO based on the newly calculated RTT. */ void sctp_transport_update_rto(struct sctp_transport *tp, __u32 rtt) { if (unlikely(!tp->rto_pending)) /* We should not be doing any RTO updates unless rto_pending is set. */ pr_debug("%s: rto_pending not set on transport %p!\n", __func__, tp); if (tp->rttvar || tp->srtt) { struct net *net = tp->asoc->base.net; unsigned int rto_beta, rto_alpha; /* 6.3.1 C3) When a new RTT measurement R' is made, set * RTTVAR <- (1 - RTO.Beta) * RTTVAR + RTO.Beta * |SRTT - R'| * SRTT <- (1 - RTO.Alpha) * SRTT + RTO.Alpha * R' */ /* Note: The above algorithm has been rewritten to * express rto_beta and rto_alpha as inverse powers * of two. * For example, assuming the default value of RTO.Alpha of * 1/8, rto_alpha would be expressed as 3. */ rto_beta = READ_ONCE(net->sctp.rto_beta); if (rto_beta < 32) tp->rttvar = tp->rttvar - (tp->rttvar >> rto_beta) + (((__u32)abs((__s64)tp->srtt - (__s64)rtt)) >> rto_beta); rto_alpha = READ_ONCE(net->sctp.rto_alpha); if (rto_alpha < 32) tp->srtt = tp->srtt - (tp->srtt >> rto_alpha) + (rtt >> rto_alpha); } else { /* 6.3.1 C2) When the first RTT measurement R is made, set * SRTT <- R, RTTVAR <- R/2. */ tp->srtt = rtt; tp->rttvar = rtt >> 1; } /* 6.3.1 G1) Whenever RTTVAR is computed, if RTTVAR = 0, then * adjust RTTVAR <- G, where G is the CLOCK GRANULARITY. */ if (tp->rttvar == 0) tp->rttvar = SCTP_CLOCK_GRANULARITY; /* 6.3.1 C3) After the computation, update RTO <- SRTT + 4 * RTTVAR. */ tp->rto = tp->srtt + (tp->rttvar << 2); /* 6.3.1 C6) Whenever RTO is computed, if it is less than RTO.Min * seconds then it is rounded up to RTO.Min seconds. */ if (tp->rto < tp->asoc->rto_min) tp->rto = tp->asoc->rto_min; /* 6.3.1 C7) A maximum value may be placed on RTO provided it is * at least RTO.max seconds. */ if (tp->rto > tp->asoc->rto_max) tp->rto = tp->asoc->rto_max; sctp_max_rto(tp->asoc, tp); tp->rtt = rtt; /* Reset rto_pending so that a new RTT measurement is started when a * new data chunk is sent. */ tp->rto_pending = 0; pr_debug("%s: transport:%p, rtt:%d, srtt:%d rttvar:%d, rto:%ld\n", __func__, tp, rtt, tp->srtt, tp->rttvar, tp->rto); } /* This routine updates the transport's cwnd and partial_bytes_acked * parameters based on the bytes acked in the received SACK. */ void sctp_transport_raise_cwnd(struct sctp_transport *transport, __u32 sack_ctsn, __u32 bytes_acked) { struct sctp_association *asoc = transport->asoc; __u32 cwnd, ssthresh, flight_size, pba, pmtu; cwnd = transport->cwnd; flight_size = transport->flight_size; /* See if we need to exit Fast Recovery first */ if (asoc->fast_recovery && TSN_lte(asoc->fast_recovery_exit, sack_ctsn)) asoc->fast_recovery = 0; ssthresh = transport->ssthresh; pba = transport->partial_bytes_acked; pmtu = transport->asoc->pathmtu; if (cwnd <= ssthresh) { /* RFC 4960 7.2.1 * o When cwnd is less than or equal to ssthresh, an SCTP * endpoint MUST use the slow-start algorithm to increase * cwnd only if the current congestion window is being fully * utilized, an incoming SACK advances the Cumulative TSN * Ack Point, and the data sender is not in Fast Recovery. * Only when these three conditions are met can the cwnd be * increased; otherwise, the cwnd MUST not be increased. 
* If these conditions are met, then cwnd MUST be increased * by, at most, the lesser of 1) the total size of the * previously outstanding DATA chunk(s) acknowledged, and * 2) the destination's path MTU. This upper bound protects * against the ACK-Splitting attack outlined in [SAVAGE99]. */ if (asoc->fast_recovery) return; /* The appropriate cwnd increase algorithm is performed * if, and only if the congestion window is being fully * utilized. Note that RFC4960 Errata 3.22 removed the * other condition on ctsn moving. */ if (flight_size < cwnd) return; if (bytes_acked > pmtu) cwnd += pmtu; else cwnd += bytes_acked; pr_debug("%s: slow start: transport:%p, bytes_acked:%d, " "cwnd:%d, ssthresh:%d, flight_size:%d, pba:%d\n", __func__, transport, bytes_acked, cwnd, ssthresh, flight_size, pba); } else { /* RFC 2960 7.2.2 Whenever cwnd is greater than ssthresh, * upon each SACK arrival, increase partial_bytes_acked * by the total number of bytes of all new chunks * acknowledged in that SACK including chunks * acknowledged by the new Cumulative TSN Ack and by Gap * Ack Blocks. (updated by RFC4960 Errata 3.22) * * When partial_bytes_acked is greater than cwnd and * before the arrival of the SACK the sender had less * bytes of data outstanding than cwnd (i.e., before * arrival of the SACK, flightsize was less than cwnd), * reset partial_bytes_acked to cwnd. (RFC 4960 Errata * 3.26) * * When partial_bytes_acked is equal to or greater than * cwnd and before the arrival of the SACK the sender * had cwnd or more bytes of data outstanding (i.e., * before arrival of the SACK, flightsize was greater * than or equal to cwnd), partial_bytes_acked is reset * to (partial_bytes_acked - cwnd). Next, cwnd is * increased by MTU. (RFC 4960 Errata 3.12) */ pba += bytes_acked; if (pba > cwnd && flight_size < cwnd) pba = cwnd; if (pba >= cwnd && flight_size >= cwnd) { pba = pba - cwnd; cwnd += pmtu; } pr_debug("%s: congestion avoidance: transport:%p, " "bytes_acked:%d, cwnd:%d, ssthresh:%d, " "flight_size:%d, pba:%d\n", __func__, transport, bytes_acked, cwnd, ssthresh, flight_size, pba); } transport->cwnd = cwnd; transport->partial_bytes_acked = pba; } /* This routine is used to lower the transport's cwnd when congestion is * detected. */ void sctp_transport_lower_cwnd(struct sctp_transport *transport, enum sctp_lower_cwnd reason) { struct sctp_association *asoc = transport->asoc; switch (reason) { case SCTP_LOWER_CWND_T3_RTX: /* RFC 2960 Section 7.2.3, sctpimpguide * When the T3-rtx timer expires on an address, SCTP should * perform slow start by: * ssthresh = max(cwnd/2, 4*MTU) * cwnd = 1*MTU * partial_bytes_acked = 0 */ transport->ssthresh = max(transport->cwnd/2, 4*asoc->pathmtu); transport->cwnd = asoc->pathmtu; /* T3-rtx also clears fast recovery */ asoc->fast_recovery = 0; break; case SCTP_LOWER_CWND_FAST_RTX: /* RFC 2960 7.2.4 Adjust the ssthresh and cwnd of the * destination address(es) to which the missing DATA chunks * were last sent, according to the formula described in * Section 7.2.3. * * RFC 2960 7.2.3, sctpimpguide Upon detection of packet * losses from SACK (see Section 7.2.4), An endpoint * should do the following: * ssthresh = max(cwnd/2, 4*MTU) * cwnd = ssthresh * partial_bytes_acked = 0 */ if (asoc->fast_recovery) return; /* Mark Fast recovery */ asoc->fast_recovery = 1; asoc->fast_recovery_exit = asoc->next_tsn - 1; transport->ssthresh = max(transport->cwnd/2, 4*asoc->pathmtu); transport->cwnd = transport->ssthresh; break; case SCTP_LOWER_CWND_ECNE: /* RFC 2481 Section 6.1.2. 
* If the sender receives an ECN-Echo ACK packet * then the sender knows that congestion was encountered in the * network on the path from the sender to the receiver. The * indication of congestion should be treated just as a * congestion loss in non-ECN Capable TCP. That is, the TCP * source halves the congestion window "cwnd" and reduces the * slow start threshold "ssthresh". * A critical condition is that TCP does not react to * congestion indications more than once every window of * data (or more loosely more than once every round-trip time). */ if (time_after(jiffies, transport->last_time_ecne_reduced + transport->rtt)) { transport->ssthresh = max(transport->cwnd/2, 4*asoc->pathmtu); transport->cwnd = transport->ssthresh; transport->last_time_ecne_reduced = jiffies; } break; case SCTP_LOWER_CWND_INACTIVE: /* RFC 2960 Section 7.2.1, sctpimpguide * When the endpoint does not transmit data on a given * transport address, the cwnd of the transport address * should be adjusted to max(cwnd/2, 4*MTU) per RTO. * NOTE: Although the draft recommends that this check needs * to be done every RTO interval, we do it every hearbeat * interval. */ transport->cwnd = max(transport->cwnd/2, 4*asoc->pathmtu); /* RFC 4960 Errata 3.27.2: also adjust sshthresh */ transport->ssthresh = transport->cwnd; break; } transport->partial_bytes_acked = 0; pr_debug("%s: transport:%p, reason:%d, cwnd:%d, ssthresh:%d\n", __func__, transport, reason, transport->cwnd, transport->ssthresh); } /* Apply Max.Burst limit to the congestion window: * sctpimpguide-05 2.14.2 * D) When the time comes for the sender to * transmit new DATA chunks, the protocol parameter Max.Burst MUST * first be applied to limit how many new DATA chunks may be sent. * The limit is applied by adjusting cwnd as follows: * if ((flightsize+ Max.Burst * MTU) < cwnd) * cwnd = flightsize + Max.Burst * MTU */ void sctp_transport_burst_limited(struct sctp_transport *t) { struct sctp_association *asoc = t->asoc; u32 old_cwnd = t->cwnd; u32 max_burst_bytes; if (t->burst_limited || asoc->max_burst == 0) return; max_burst_bytes = t->flight_size + (asoc->max_burst * asoc->pathmtu); if (max_burst_bytes < old_cwnd) { t->cwnd = max_burst_bytes; t->burst_limited = old_cwnd; } } /* Restore the old cwnd congestion window, after the burst had it's * desired effect. */ void sctp_transport_burst_reset(struct sctp_transport *t) { if (t->burst_limited) { t->cwnd = t->burst_limited; t->burst_limited = 0; } } /* What is the next timeout value for this transport? */ unsigned long sctp_transport_timeout(struct sctp_transport *trans) { /* RTO + timer slack +/- 50% of RTO */ unsigned long timeout = trans->rto >> 1; if (trans->state != SCTP_UNCONFIRMED && trans->state != SCTP_PF) timeout += trans->hbinterval; return max_t(unsigned long, timeout, HZ / 5); } /* Reset transport variables to their initial values */ void sctp_transport_reset(struct sctp_transport *t) { struct sctp_association *asoc = t->asoc; /* RFC 2960 (bis), Section 5.2.4 * All the congestion control parameters (e.g., cwnd, ssthresh) * related to this peer MUST be reset to their initial values * (see Section 6.2.1) */ t->cwnd = min(4*asoc->pathmtu, max_t(__u32, 2*asoc->pathmtu, 4380)); t->burst_limited = 0; t->ssthresh = asoc->peer.i.a_rwnd; t->rto = asoc->rto_initial; sctp_max_rto(asoc, t); t->rtt = 0; t->srtt = 0; t->rttvar = 0; /* Reset these additional variables so that we have a clean slate. 
*/ t->partial_bytes_acked = 0; t->flight_size = 0; t->error_count = 0; t->rto_pending = 0; t->hb_sent = 0; /* Initialize the state information for SFR-CACC */ t->cacc.changeover_active = 0; t->cacc.cycling_changeover = 0; t->cacc.next_tsn_at_change = 0; t->cacc.cacc_saw_newack = 0; } /* Schedule retransmission on the given transport */ void sctp_transport_immediate_rtx(struct sctp_transport *t) { /* Stop pending T3_rtx_timer */ if (timer_delete(&t->T3_rtx_timer)) sctp_transport_put(t); sctp_retransmit(&t->asoc->outqueue, t, SCTP_RTXR_T3_RTX); if (!timer_pending(&t->T3_rtx_timer)) { if (!mod_timer(&t->T3_rtx_timer, jiffies + t->rto)) sctp_transport_hold(t); } } /* Drop dst */ void sctp_transport_dst_release(struct sctp_transport *t) { dst_release(t->dst); t->dst = NULL; t->dst_pending_confirm = 0; } /* Schedule neighbour confirm */ void sctp_transport_dst_confirm(struct sctp_transport *t) { t->dst_pending_confirm = 1; }
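As a worked illustration of the shift arithmetic in sctp_transport_update_rto() above, the following stand-alone user-space sketch (not kernel code) applies the RFC 2960 6.3.1 smoothing with the default RTO.Alpha = 1/8 and RTO.Beta = 1/4 expressed as shifts of 3 and 2; the RTT samples are invented, and the RTO.Min/RTO.Max clamping and clock-granularity adjustment are omitted.

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	/* alpha = 1/8 -> shift 3, beta = 1/4 -> shift 2 (protocol defaults) */
	const unsigned int rto_alpha = 3, rto_beta = 2;
	const uint32_t samples[] = { 100, 120, 80, 300 };	/* RTTs in ms */
	uint32_t srtt = 0, rttvar = 0, rto;
	size_t i;

	for (i = 0; i < sizeof(samples) / sizeof(samples[0]); i++) {
		uint32_t rtt = samples[i];

		if (!srtt && !rttvar) {
			/* C2: first measurement: SRTT <- R, RTTVAR <- R/2 */
			srtt = rtt;
			rttvar = rtt >> 1;
		} else {
			/* C3: RTTVAR <- (1 - beta)*RTTVAR + beta*|SRTT - R'| */
			uint32_t diff = srtt > rtt ? srtt - rtt : rtt - srtt;

			rttvar = rttvar - (rttvar >> rto_beta) + (diff >> rto_beta);
			/* C3: SRTT <- (1 - alpha)*SRTT + alpha*R' */
			srtt = srtt - (srtt >> rto_alpha) + (rtt >> rto_alpha);
		}
		/* C3: RTO <- SRTT + 4*RTTVAR */
		rto = srtt + (rttvar << 2);
		printf("rtt=%3u  srtt=%3u  rttvar=%3u  rto=%3u\n",
		       (unsigned)rtt, (unsigned)srtt, (unsigned)rttvar, (unsigned)rto);
	}
	return 0;
}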
// SPDX-License-Identifier: GPL-2.0 #include <linux/skbuff.h> #include <linux/netfilter.h> #include <linux/netfilter_ipv4.h> #include <linux/netfilter_ipv6.h> #include <linux/netfilter/nfnetlink.h> #include <linux/netfilter/nf_tables.h> #include <net/netfilter/nf_tables.h> #include <net/netfilter/nf_tables_ipv4.h> #include <net/netfilter/nf_tables_ipv6.h> #include <net/route.h> #include <net/ip.h> #ifdef CONFIG_NF_TABLES_IPV4 static unsigned int nf_route_table_hook4(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { const struct iphdr *iph; struct nft_pktinfo pkt; __be32 saddr, daddr; unsigned int ret; u32 mark; int err; u8 tos; nft_set_pktinfo(&pkt, skb, state); nft_set_pktinfo_ipv4(&pkt); mark = skb->mark; iph = ip_hdr(skb); saddr = iph->saddr; daddr = iph->daddr; tos = iph->tos; ret = nft_do_chain(&pkt, priv); if (ret == NF_ACCEPT) { iph = ip_hdr(skb); if (iph->saddr != saddr || iph->daddr != daddr || skb->mark != mark || iph->tos != tos) { err = ip_route_me_harder(state->net, state->sk, skb, RTN_UNSPEC); if (err < 0) ret = NF_DROP_ERR(err); } } return ret; } static const struct nft_chain_type nft_chain_route_ipv4 = { .name = "route", .type = NFT_CHAIN_T_ROUTE, .family = NFPROTO_IPV4, .hook_mask = (1 << NF_INET_LOCAL_OUT), .hooks = { [NF_INET_LOCAL_OUT] = nf_route_table_hook4, }, }; #endif #ifdef CONFIG_NF_TABLES_IPV6 static unsigned int nf_route_table_hook6(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { struct in6_addr saddr, daddr; struct nft_pktinfo pkt; u32 mark, flowlabel; unsigned int ret; u8 hop_limit; int err; nft_set_pktinfo(&pkt, skb, state); nft_set_pktinfo_ipv6(&pkt); /* save source/dest address, mark, hoplimit, flowlabel, priority */ memcpy(&saddr, &ipv6_hdr(skb)->saddr, sizeof(saddr)); memcpy(&daddr, &ipv6_hdr(skb)->daddr, sizeof(daddr)); mark = skb->mark; hop_limit = ipv6_hdr(skb)->hop_limit; /* flowlabel and prio (includes version, which shouldn't change either)*/ flowlabel = *((u32 *)ipv6_hdr(skb)); ret = nft_do_chain(&pkt, priv); if (ret == NF_ACCEPT && (memcmp(&ipv6_hdr(skb)->saddr, &saddr, sizeof(saddr)) || memcmp(&ipv6_hdr(skb)->daddr, &daddr, sizeof(daddr)) || skb->mark != mark || ipv6_hdr(skb)->hop_limit != hop_limit || flowlabel != *((u32 *)ipv6_hdr(skb)))) { err = nf_ip6_route_me_harder(state->net, state->sk, skb); if (err < 0) ret = NF_DROP_ERR(err); } return ret; } static const struct nft_chain_type nft_chain_route_ipv6 = { .name = "route", .type = NFT_CHAIN_T_ROUTE, .family = NFPROTO_IPV6, .hook_mask = (1 << NF_INET_LOCAL_OUT), .hooks = { [NF_INET_LOCAL_OUT] = nf_route_table_hook6, }, }; #endif #ifdef CONFIG_NF_TABLES_INET static unsigned int nf_route_table_inet(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { struct nft_pktinfo pkt; switch (state->pf) { case NFPROTO_IPV4: return nf_route_table_hook4(priv, skb, state); case NFPROTO_IPV6: return nf_route_table_hook6(priv, skb, state); default: nft_set_pktinfo(&pkt, skb, state);
break; } return nft_do_chain(&pkt, priv); } static const struct nft_chain_type nft_chain_route_inet = { .name = "route", .type = NFT_CHAIN_T_ROUTE, .family = NFPROTO_INET, .hook_mask = (1 << NF_INET_LOCAL_OUT), .hooks = { [NF_INET_LOCAL_OUT] = nf_route_table_inet, }, }; #endif void __init nft_chain_route_init(void) { #ifdef CONFIG_NF_TABLES_IPV6 nft_register_chain_type(&nft_chain_route_ipv6); #endif #ifdef CONFIG_NF_TABLES_IPV4 nft_register_chain_type(&nft_chain_route_ipv4); #endif #ifdef CONFIG_NF_TABLES_INET nft_register_chain_type(&nft_chain_route_inet); #endif } void __exit nft_chain_route_fini(void) { #ifdef CONFIG_NF_TABLES_IPV6 nft_unregister_chain_type(&nft_chain_route_ipv6); #endif #ifdef CONFIG_NF_TABLES_IPV4 nft_unregister_chain_type(&nft_chain_route_ipv4); #endif #ifdef CONFIG_NF_TABLES_INET nft_unregister_chain_type(&nft_chain_route_inet); #endif }
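The IPv6 hook above snapshots the first 32 bits of the header (flowlabel = *(u32 *)ipv6_hdr(skb)) because that single word packs the version, traffic class and flow label, so one comparison after the chain runs catches a change to any of the three. A small stand-alone sketch (not kernel code) of that RFC 8200 layout, with invented field values:

#include <stdio.h>
#include <stdint.h>
#include <arpa/inet.h>

int main(void)
{
	/* version 6, traffic class 0x2e (EF), flow label 0x12345 */
	uint32_t host_word = (6u << 28) | (0x2eu << 20) | 0x12345u;
	uint32_t wire_word = htonl(host_word);	/* byte order used on the wire */
	uint32_t w = ntohl(wire_word);

	printf("version      : %u\n", (unsigned)((w >> 28) & 0xf));
	printf("traffic class: 0x%02x\n", (unsigned)((w >> 20) & 0xff));
	printf("flow label   : 0x%05x\n", (unsigned)(w & 0xfffff));
	return 0;
}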
// SPDX-License-Identifier: GPL-2.0-only /* * This is the linux wireless configuration interface.
* * Copyright 2006-2010 Johannes Berg <johannes@sipsolutions.net> * Copyright 2013-2014 Intel Mobile Communications GmbH * Copyright 2015-2017 Intel Deutschland GmbH * Copyright (C) 2018-2025 Intel Corporation */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/if.h> #include <linux/module.h> #include <linux/err.h> #include <linux/list.h> #include <linux/slab.h> #include <linux/nl80211.h> #include <linux/debugfs.h> #include <linux/notifier.h> #include <linux/device.h> #include <linux/etherdevice.h> #include <linux/rtnetlink.h> #include <linux/sched.h> #include <net/genetlink.h> #include <net/cfg80211.h> #include "nl80211.h" #include "core.h" #include "sysfs.h" #include "debugfs.h" #include "wext-compat.h" #include "rdev-ops.h" /* name for sysfs, %d is appended */ #define PHY_NAME "phy" /* maximum length of radio debugfs directory name */ #define RADIO_DEBUGFSDIR_MAX_LEN 8 MODULE_AUTHOR("Johannes Berg"); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("wireless configuration support"); MODULE_ALIAS_GENL_FAMILY(NL80211_GENL_NAME); /* RCU-protected (and RTNL for writers) */ LIST_HEAD(cfg80211_rdev_list); int cfg80211_rdev_list_generation; /* for debugfs */ static struct dentry *ieee80211_debugfs_dir; /* for the cleanup, scan and event works */ struct workqueue_struct *cfg80211_wq; static bool cfg80211_disable_40mhz_24ghz; module_param(cfg80211_disable_40mhz_24ghz, bool, 0644); MODULE_PARM_DESC(cfg80211_disable_40mhz_24ghz, "Disable 40MHz support in the 2.4GHz band"); struct cfg80211_registered_device *cfg80211_rdev_by_wiphy_idx(int wiphy_idx) { struct cfg80211_registered_device *result = NULL, *rdev; ASSERT_RTNL(); for_each_rdev(rdev) { if (rdev->wiphy_idx == wiphy_idx) { result = rdev; break; } } return result; } int get_wiphy_idx(struct wiphy *wiphy) { struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); return rdev->wiphy_idx; } struct wiphy *wiphy_idx_to_wiphy(int wiphy_idx) { struct cfg80211_registered_device *rdev; ASSERT_RTNL(); rdev = cfg80211_rdev_by_wiphy_idx(wiphy_idx); if (!rdev) return NULL; return &rdev->wiphy; } static int cfg80211_dev_check_name(struct cfg80211_registered_device *rdev, const char *newname) { struct cfg80211_registered_device *rdev2; int wiphy_idx, taken = -1, digits; ASSERT_RTNL(); if (strlen(newname) > NL80211_WIPHY_NAME_MAXLEN) return -EINVAL; /* prohibit calling the thing phy%d when %d is not its number */ sscanf(newname, PHY_NAME "%d%n", &wiphy_idx, &taken); if (taken == strlen(newname) && wiphy_idx != rdev->wiphy_idx) { /* count number of places needed to print wiphy_idx */ digits = 1; while (wiphy_idx /= 10) digits++; /* * deny the name if it is phy<idx> where <idx> is printed * without leading zeroes. taken == strlen(newname) here */ if (taken == strlen(PHY_NAME) + digits) return -EINVAL; } /* Ensure another device does not already have this name. 
*/ for_each_rdev(rdev2) if (strcmp(newname, wiphy_name(&rdev2->wiphy)) == 0) return -EINVAL; return 0; } int cfg80211_dev_rename(struct cfg80211_registered_device *rdev, char *newname) { int result; ASSERT_RTNL(); lockdep_assert_wiphy(&rdev->wiphy); /* Ignore nop renames */ if (strcmp(newname, wiphy_name(&rdev->wiphy)) == 0) return 0; result = cfg80211_dev_check_name(rdev, newname); if (result < 0) return result; result = device_rename(&rdev->wiphy.dev, newname); if (result) return result; debugfs_change_name(rdev->wiphy.debugfsdir, "%s", newname); nl80211_notify_wiphy(rdev, NL80211_CMD_NEW_WIPHY); return 0; } int cfg80211_switch_netns(struct cfg80211_registered_device *rdev, struct net *net) { struct wireless_dev *wdev; int err = 0; if (!(rdev->wiphy.flags & WIPHY_FLAG_NETNS_OK)) return -EOPNOTSUPP; list_for_each_entry(wdev, &rdev->wiphy.wdev_list, list) { if (!wdev->netdev) continue; wdev->netdev->netns_immutable = false; err = dev_change_net_namespace(wdev->netdev, net, "wlan%d"); if (err) break; wdev->netdev->netns_immutable = true; } if (err) { /* failed -- clean up to old netns */ net = wiphy_net(&rdev->wiphy); list_for_each_entry_continue_reverse(wdev, &rdev->wiphy.wdev_list, list) { if (!wdev->netdev) continue; wdev->netdev->netns_immutable = false; err = dev_change_net_namespace(wdev->netdev, net, "wlan%d"); WARN_ON(err); wdev->netdev->netns_immutable = true; } return err; } guard(wiphy)(&rdev->wiphy); list_for_each_entry(wdev, &rdev->wiphy.wdev_list, list) { if (!wdev->netdev) continue; nl80211_notify_iface(rdev, wdev, NL80211_CMD_DEL_INTERFACE); } nl80211_notify_wiphy(rdev, NL80211_CMD_DEL_WIPHY); wiphy_net_set(&rdev->wiphy, net); err = device_rename(&rdev->wiphy.dev, dev_name(&rdev->wiphy.dev)); WARN_ON(err); nl80211_notify_wiphy(rdev, NL80211_CMD_NEW_WIPHY); list_for_each_entry(wdev, &rdev->wiphy.wdev_list, list) { if (!wdev->netdev) continue; nl80211_notify_iface(rdev, wdev, NL80211_CMD_NEW_INTERFACE); } return 0; } static void cfg80211_rfkill_poll(struct rfkill *rfkill, void *data) { struct cfg80211_registered_device *rdev = data; guard(wiphy)(&rdev->wiphy); rdev_rfkill_poll(rdev); } void cfg80211_stop_p2p_device(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev) { lockdep_assert_held(&rdev->wiphy.mtx); if (WARN_ON(wdev->iftype != NL80211_IFTYPE_P2P_DEVICE)) return; if (!wdev_running(wdev)) return; rdev_stop_p2p_device(rdev, wdev); wdev->is_running = false; rdev->opencount--; if (rdev->scan_req && rdev->scan_req->req.wdev == wdev) { if (WARN_ON(!rdev->scan_req->notified && (!rdev->int_scan_req || !rdev->int_scan_req->notified))) rdev->scan_req->info.aborted = true; ___cfg80211_scan_done(rdev, false); } } void cfg80211_stop_nan(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev) { lockdep_assert_held(&rdev->wiphy.mtx); if (WARN_ON(wdev->iftype != NL80211_IFTYPE_NAN)) return; if (!wdev_running(wdev)) return; rdev_stop_nan(rdev, wdev); wdev->is_running = false; rdev->opencount--; } void cfg80211_shutdown_all_interfaces(struct wiphy *wiphy) { struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); struct wireless_dev *wdev; ASSERT_RTNL(); list_for_each_entry(wdev, &rdev->wiphy.wdev_list, list) { if (wdev->netdev) { dev_close(wdev->netdev); continue; } /* otherwise, check iftype */ guard(wiphy)(wiphy); switch (wdev->iftype) { case NL80211_IFTYPE_P2P_DEVICE: cfg80211_stop_p2p_device(rdev, wdev); break; case NL80211_IFTYPE_NAN: cfg80211_stop_nan(rdev, wdev); break; default: break; } } } EXPORT_SYMBOL_GPL(cfg80211_shutdown_all_interfaces); 
static int cfg80211_rfkill_set_block(void *data, bool blocked) { struct cfg80211_registered_device *rdev = data; if (!blocked) return 0; rtnl_lock(); cfg80211_shutdown_all_interfaces(&rdev->wiphy); rtnl_unlock(); return 0; } static void cfg80211_rfkill_block_work(struct work_struct *work) { struct cfg80211_registered_device *rdev; rdev = container_of(work, struct cfg80211_registered_device, rfkill_block); cfg80211_rfkill_set_block(rdev, true); } static void cfg80211_event_work(struct work_struct *work) { struct cfg80211_registered_device *rdev; rdev = container_of(work, struct cfg80211_registered_device, event_work); guard(wiphy)(&rdev->wiphy); cfg80211_process_rdev_events(rdev); } void cfg80211_destroy_ifaces(struct cfg80211_registered_device *rdev) { struct wireless_dev *wdev, *tmp; ASSERT_RTNL(); list_for_each_entry_safe(wdev, tmp, &rdev->wiphy.wdev_list, list) { if (wdev->nl_owner_dead) { if (wdev->netdev) dev_close(wdev->netdev); guard(wiphy)(&rdev->wiphy); cfg80211_leave(rdev, wdev); cfg80211_remove_virtual_intf(rdev, wdev); } } } static void cfg80211_destroy_iface_wk(struct work_struct *work) { struct cfg80211_registered_device *rdev; rdev = container_of(work, struct cfg80211_registered_device, destroy_work); rtnl_lock(); cfg80211_destroy_ifaces(rdev); rtnl_unlock(); } static void cfg80211_sched_scan_stop_wk(struct wiphy *wiphy, struct wiphy_work *work) { struct cfg80211_registered_device *rdev; struct cfg80211_sched_scan_request *req, *tmp; rdev = container_of(work, struct cfg80211_registered_device, sched_scan_stop_wk); list_for_each_entry_safe(req, tmp, &rdev->sched_scan_req_list, list) { if (req->nl_owner_dead) cfg80211_stop_sched_scan_req(rdev, req, false); } } static void cfg80211_propagate_radar_detect_wk(struct work_struct *work) { struct cfg80211_registered_device *rdev; rdev = container_of(work, struct cfg80211_registered_device, propagate_radar_detect_wk); rtnl_lock(); regulatory_propagate_dfs_state(&rdev->wiphy, &rdev->radar_chandef, NL80211_DFS_UNAVAILABLE, NL80211_RADAR_DETECTED); rtnl_unlock(); } static void cfg80211_propagate_cac_done_wk(struct work_struct *work) { struct cfg80211_registered_device *rdev; rdev = container_of(work, struct cfg80211_registered_device, propagate_cac_done_wk); rtnl_lock(); regulatory_propagate_dfs_state(&rdev->wiphy, &rdev->cac_done_chandef, NL80211_DFS_AVAILABLE, NL80211_RADAR_CAC_FINISHED); rtnl_unlock(); } static void cfg80211_wiphy_work(struct work_struct *work) { struct cfg80211_registered_device *rdev; struct wiphy_work *wk; rdev = container_of(work, struct cfg80211_registered_device, wiphy_work); trace_wiphy_work_worker_start(&rdev->wiphy); guard(wiphy)(&rdev->wiphy); if (rdev->suspended) return; spin_lock_irq(&rdev->wiphy_work_lock); wk = list_first_entry_or_null(&rdev->wiphy_work_list, struct wiphy_work, entry); if (wk) { list_del_init(&wk->entry); if (!list_empty(&rdev->wiphy_work_list)) queue_work(system_dfl_wq, work); spin_unlock_irq(&rdev->wiphy_work_lock); trace_wiphy_work_run(&rdev->wiphy, wk); wk->func(&rdev->wiphy, wk); } else { spin_unlock_irq(&rdev->wiphy_work_lock); } } /* exported functions */ struct wiphy *wiphy_new_nm(const struct cfg80211_ops *ops, int sizeof_priv, const char *requested_name) { static atomic_t wiphy_counter = ATOMIC_INIT(0); struct cfg80211_registered_device *rdev; int alloc_size; WARN_ON(ops->add_key && (!ops->del_key || !ops->set_default_key)); WARN_ON(ops->auth && (!ops->assoc || !ops->deauth || !ops->disassoc)); WARN_ON(ops->connect && !ops->disconnect); WARN_ON(ops->join_ibss && 
!ops->leave_ibss); WARN_ON(ops->add_virtual_intf && !ops->del_virtual_intf); WARN_ON(ops->add_station && !ops->del_station); WARN_ON(ops->add_mpath && !ops->del_mpath); WARN_ON(ops->join_mesh && !ops->leave_mesh); WARN_ON(ops->start_p2p_device && !ops->stop_p2p_device); WARN_ON(ops->start_ap && !ops->stop_ap); WARN_ON(ops->join_ocb && !ops->leave_ocb); WARN_ON(ops->suspend && !ops->resume); WARN_ON(ops->sched_scan_start && !ops->sched_scan_stop); WARN_ON(ops->remain_on_channel && !ops->cancel_remain_on_channel); WARN_ON(ops->tdls_channel_switch && !ops->tdls_cancel_channel_switch); WARN_ON(ops->add_tx_ts && !ops->del_tx_ts); alloc_size = sizeof(*rdev) + sizeof_priv; rdev = kzalloc(alloc_size, GFP_KERNEL); if (!rdev) return NULL; rdev->ops = ops; rdev->wiphy_idx = atomic_inc_return(&wiphy_counter); if (unlikely(rdev->wiphy_idx < 0)) { /* ugh, wrapped! */ atomic_dec(&wiphy_counter); kfree(rdev); return NULL; } /* atomic_inc_return makes it start at 1, make it start at 0 */ rdev->wiphy_idx--; /* give it a proper name */ if (requested_name && requested_name[0]) { int rv; rtnl_lock(); rv = cfg80211_dev_check_name(rdev, requested_name); if (rv < 0) { rtnl_unlock(); goto use_default_name; } rv = dev_set_name(&rdev->wiphy.dev, "%s", requested_name); rtnl_unlock(); if (rv) goto use_default_name; } else { int rv; use_default_name: /* NOTE: This is *probably* safe w/out holding rtnl because of * the restrictions on phy names. Probably this call could * fail if some other part of the kernel (re)named a device * phyX. But, might should add some locking and check return * value, and use a different name if this one exists? */ rv = dev_set_name(&rdev->wiphy.dev, PHY_NAME "%d", rdev->wiphy_idx); if (rv < 0) { kfree(rdev); return NULL; } } mutex_init(&rdev->wiphy.mtx); INIT_LIST_HEAD(&rdev->wiphy.wdev_list); INIT_LIST_HEAD(&rdev->beacon_registrations); spin_lock_init(&rdev->beacon_registrations_lock); spin_lock_init(&rdev->bss_lock); INIT_LIST_HEAD(&rdev->bss_list); INIT_LIST_HEAD(&rdev->sched_scan_req_list); wiphy_work_init(&rdev->scan_done_wk, __cfg80211_scan_done); INIT_DELAYED_WORK(&rdev->dfs_update_channels_wk, cfg80211_dfs_channels_update_work); #ifdef CONFIG_CFG80211_WEXT rdev->wiphy.wext = &cfg80211_wext_handler; #endif device_initialize(&rdev->wiphy.dev); rdev->wiphy.dev.class = &ieee80211_class; rdev->wiphy.dev.platform_data = rdev; device_enable_async_suspend(&rdev->wiphy.dev); INIT_WORK(&rdev->destroy_work, cfg80211_destroy_iface_wk); wiphy_work_init(&rdev->sched_scan_stop_wk, cfg80211_sched_scan_stop_wk); INIT_WORK(&rdev->sched_scan_res_wk, cfg80211_sched_scan_results_wk); INIT_WORK(&rdev->propagate_radar_detect_wk, cfg80211_propagate_radar_detect_wk); INIT_WORK(&rdev->propagate_cac_done_wk, cfg80211_propagate_cac_done_wk); INIT_WORK(&rdev->mgmt_registrations_update_wk, cfg80211_mgmt_registrations_update_wk); spin_lock_init(&rdev->mgmt_registrations_lock); INIT_WORK(&rdev->wiphy_work, cfg80211_wiphy_work); INIT_LIST_HEAD(&rdev->wiphy_work_list); spin_lock_init(&rdev->wiphy_work_lock); #ifdef CONFIG_CFG80211_DEFAULT_PS rdev->wiphy.flags |= WIPHY_FLAG_PS_ON_BY_DEFAULT; #endif wiphy_net_set(&rdev->wiphy, &init_net); rdev->rfkill_ops.set_block = cfg80211_rfkill_set_block; rdev->wiphy.rfkill = rfkill_alloc(dev_name(&rdev->wiphy.dev), &rdev->wiphy.dev, RFKILL_TYPE_WLAN, &rdev->rfkill_ops, rdev); if (!rdev->wiphy.rfkill) { wiphy_free(&rdev->wiphy); return NULL; } INIT_WORK(&rdev->rfkill_block, cfg80211_rfkill_block_work); INIT_WORK(&rdev->conn_work, cfg80211_conn_work); INIT_WORK(&rdev->event_work, 
cfg80211_event_work); INIT_WORK(&rdev->background_cac_abort_wk, cfg80211_background_cac_abort_wk); INIT_DELAYED_WORK(&rdev->background_cac_done_wk, cfg80211_background_cac_done_wk); init_waitqueue_head(&rdev->dev_wait); /* * Initialize wiphy parameters to IEEE 802.11 MIB default values. * Fragmentation and RTS threshold are disabled by default with the * special -1 value. */ rdev->wiphy.retry_short = 7; rdev->wiphy.retry_long = 4; rdev->wiphy.frag_threshold = (u32) -1; rdev->wiphy.rts_threshold = (u32) -1; rdev->wiphy.coverage_class = 0; rdev->wiphy.max_num_csa_counters = 1; rdev->wiphy.max_sched_scan_plans = 1; rdev->wiphy.max_sched_scan_plan_interval = U32_MAX; return &rdev->wiphy; } EXPORT_SYMBOL(wiphy_new_nm); static int wiphy_verify_iface_combinations(struct wiphy *wiphy, const struct ieee80211_iface_combination *iface_comb, int n_iface_comb, bool combined_radio) { const struct ieee80211_iface_combination *c; int i, j; for (i = 0; i < n_iface_comb; i++) { u32 cnt = 0; u16 all_iftypes = 0; c = &iface_comb[i]; /* * Combinations with just one interface aren't real, * however we make an exception for DFS. */ if (WARN_ON((c->max_interfaces < 2) && !c->radar_detect_widths)) return -EINVAL; /* Need at least one channel */ if (WARN_ON(!c->num_different_channels)) return -EINVAL; /* DFS only works on one channel. Avoid this check * for multi-radio global combination, since it hold * the capabilities of all radio combinations. */ if (!combined_radio && WARN_ON(c->radar_detect_widths && c->num_different_channels > 1)) return -EINVAL; if (WARN_ON(!c->n_limits)) return -EINVAL; for (j = 0; j < c->n_limits; j++) { u16 types = c->limits[j].types; /* interface types shouldn't overlap */ if (WARN_ON(types & all_iftypes)) return -EINVAL; all_iftypes |= types; if (WARN_ON(!c->limits[j].max)) return -EINVAL; /* Shouldn't list software iftypes in combinations! */ if (WARN_ON(wiphy->software_iftypes & types)) return -EINVAL; /* Only a single P2P_DEVICE can be allowed, avoid this * check for multi-radio global combination, since it * hold the capabilities of all radio combinations. */ if (!combined_radio && WARN_ON(types & BIT(NL80211_IFTYPE_P2P_DEVICE) && c->limits[j].max > 1)) return -EINVAL; /* Only a single NAN can be allowed, avoid this * check for multi-radio global combination, since it * hold the capabilities of all radio combinations. */ if (!combined_radio && WARN_ON(types & BIT(NL80211_IFTYPE_NAN) && c->limits[j].max > 1)) return -EINVAL; /* * This isn't well-defined right now. If you have an * IBSS interface, then its beacon interval may change * by joining other networks, and nothing prevents it * from doing that. * So technically we probably shouldn't even allow AP * and IBSS in the same interface, but it seems that * some drivers support that, possibly only with fixed * beacon intervals for IBSS. */ if (WARN_ON(types & BIT(NL80211_IFTYPE_ADHOC) && c->beacon_int_min_gcd)) { return -EINVAL; } cnt += c->limits[j].max; /* * Don't advertise an unsupported type * in a combination. */ if (WARN_ON((wiphy->interface_modes & types) != types)) return -EINVAL; } if (WARN_ON(all_iftypes & BIT(NL80211_IFTYPE_WDS))) return -EINVAL; /* You can't even choose that many! 
*/ if (WARN_ON(cnt < c->max_interfaces)) return -EINVAL; } return 0; } static int wiphy_verify_combinations(struct wiphy *wiphy) { int i, ret; bool combined_radio = false; if (wiphy->n_radio) { for (i = 0; i < wiphy->n_radio; i++) { const struct wiphy_radio *radio = &wiphy->radio[i]; ret = wiphy_verify_iface_combinations(wiphy, radio->iface_combinations, radio->n_iface_combinations, false); if (ret) return ret; } combined_radio = true; } ret = wiphy_verify_iface_combinations(wiphy, wiphy->iface_combinations, wiphy->n_iface_combinations, combined_radio); return ret; } int wiphy_register(struct wiphy *wiphy) { struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); int res; enum nl80211_band band; struct ieee80211_supported_band *sband; bool have_band = false; int i; u16 ifmodes = wiphy->interface_modes; #ifdef CONFIG_PM if (WARN_ON(wiphy->wowlan && (wiphy->wowlan->flags & WIPHY_WOWLAN_GTK_REKEY_FAILURE) && !(wiphy->wowlan->flags & WIPHY_WOWLAN_SUPPORTS_GTK_REKEY))) return -EINVAL; if (WARN_ON(wiphy->wowlan && !wiphy->wowlan->flags && !wiphy->wowlan->n_patterns && !wiphy->wowlan->tcp)) return -EINVAL; #endif if (WARN_ON((wiphy->features & NL80211_FEATURE_TDLS_CHANNEL_SWITCH) && (!rdev->ops->tdls_channel_switch || !rdev->ops->tdls_cancel_channel_switch))) return -EINVAL; if (WARN_ON((wiphy->interface_modes & BIT(NL80211_IFTYPE_NAN)) && (!rdev->ops->start_nan || !rdev->ops->stop_nan || !rdev->ops->add_nan_func || !rdev->ops->del_nan_func || !(wiphy->nan_supported_bands & BIT(NL80211_BAND_2GHZ))))) return -EINVAL; if (WARN_ON(wiphy->interface_modes & BIT(NL80211_IFTYPE_WDS))) return -EINVAL; if (WARN_ON(wiphy->pmsr_capa && !wiphy->pmsr_capa->ftm.supported)) return -EINVAL; if (wiphy->pmsr_capa && wiphy->pmsr_capa->ftm.supported) { if (WARN_ON(!wiphy->pmsr_capa->ftm.asap && !wiphy->pmsr_capa->ftm.non_asap)) return -EINVAL; if (WARN_ON(!wiphy->pmsr_capa->ftm.preambles || !wiphy->pmsr_capa->ftm.bandwidths)) return -EINVAL; if (WARN_ON(wiphy->pmsr_capa->ftm.preambles & ~(BIT(NL80211_PREAMBLE_LEGACY) | BIT(NL80211_PREAMBLE_HT) | BIT(NL80211_PREAMBLE_VHT) | BIT(NL80211_PREAMBLE_HE) | BIT(NL80211_PREAMBLE_DMG)))) return -EINVAL; if (WARN_ON((wiphy->pmsr_capa->ftm.trigger_based || wiphy->pmsr_capa->ftm.non_trigger_based) && !(wiphy->pmsr_capa->ftm.preambles & BIT(NL80211_PREAMBLE_HE)))) return -EINVAL; if (WARN_ON(wiphy->pmsr_capa->ftm.bandwidths & ~(BIT(NL80211_CHAN_WIDTH_20_NOHT) | BIT(NL80211_CHAN_WIDTH_20) | BIT(NL80211_CHAN_WIDTH_40) | BIT(NL80211_CHAN_WIDTH_80) | BIT(NL80211_CHAN_WIDTH_80P80) | BIT(NL80211_CHAN_WIDTH_160) | BIT(NL80211_CHAN_WIDTH_320) | BIT(NL80211_CHAN_WIDTH_5) | BIT(NL80211_CHAN_WIDTH_10)))) return -EINVAL; } if (WARN_ON((wiphy->regulatory_flags & REGULATORY_WIPHY_SELF_MANAGED) && (wiphy->regulatory_flags & (REGULATORY_CUSTOM_REG | REGULATORY_STRICT_REG | REGULATORY_COUNTRY_IE_FOLLOW_POWER | REGULATORY_COUNTRY_IE_IGNORE)))) return -EINVAL; if (WARN_ON(wiphy->coalesce && (!wiphy->coalesce->n_rules || !wiphy->coalesce->n_patterns) && (!wiphy->coalesce->pattern_min_len || wiphy->coalesce->pattern_min_len > wiphy->coalesce->pattern_max_len))) return -EINVAL; if (WARN_ON(wiphy->ap_sme_capa && !(wiphy->flags & WIPHY_FLAG_HAVE_AP_SME))) return -EINVAL; if (WARN_ON(wiphy->addresses && !wiphy->n_addresses)) return -EINVAL; if (WARN_ON(wiphy->addresses && !is_zero_ether_addr(wiphy->perm_addr) && memcmp(wiphy->perm_addr, wiphy->addresses[0].addr, ETH_ALEN))) return -EINVAL; if (WARN_ON(wiphy->max_acl_mac_addrs && (!(wiphy->flags & WIPHY_FLAG_HAVE_AP_SME) || 
!rdev->ops->set_mac_acl))) return -EINVAL; /* assure only valid behaviours are flagged by driver * hence subtract 2 as bit 0 is invalid. */ if (WARN_ON(wiphy->bss_select_support && (wiphy->bss_select_support & ~(BIT(__NL80211_BSS_SELECT_ATTR_AFTER_LAST) - 2)))) return -EINVAL; if (WARN_ON(wiphy_ext_feature_isset(&rdev->wiphy, NL80211_EXT_FEATURE_4WAY_HANDSHAKE_STA_1X) && (!rdev->ops->set_pmk || !rdev->ops->del_pmk))) return -EINVAL; if (WARN_ON(!(rdev->wiphy.flags & WIPHY_FLAG_SUPPORTS_FW_ROAM) && rdev->ops->update_connect_params)) return -EINVAL; if (wiphy->addresses) memcpy(wiphy->perm_addr, wiphy->addresses[0].addr, ETH_ALEN); /* sanity check ifmodes */ WARN_ON(!ifmodes); ifmodes &= ((1 << NUM_NL80211_IFTYPES) - 1) & ~1; if (WARN_ON(ifmodes != wiphy->interface_modes)) wiphy->interface_modes = ifmodes; res = wiphy_verify_combinations(wiphy); if (res) return res; /* sanity check supported bands/channels */ for (band = 0; band < NUM_NL80211_BANDS; band++) { const struct ieee80211_sband_iftype_data *iftd; u16 types = 0; bool have_he = false; sband = wiphy->bands[band]; if (!sband) continue; sband->band = band; if (WARN_ON(!sband->n_channels)) return -EINVAL; /* * on 60GHz or sub-1Ghz band, there are no legacy rates, so * n_bitrates is 0 */ if (WARN_ON((band != NL80211_BAND_60GHZ && band != NL80211_BAND_S1GHZ) && !sband->n_bitrates)) return -EINVAL; if (WARN_ON(band == NL80211_BAND_6GHZ && (sband->ht_cap.ht_supported || sband->vht_cap.vht_supported))) return -EINVAL; /* * Since cfg80211_disable_40mhz_24ghz is global, we can * modify the sband's ht data even if the driver uses a * global structure for that. */ if (cfg80211_disable_40mhz_24ghz && band == NL80211_BAND_2GHZ && sband->ht_cap.ht_supported) { sband->ht_cap.cap &= ~IEEE80211_HT_CAP_SUP_WIDTH_20_40; sband->ht_cap.cap &= ~IEEE80211_HT_CAP_SGI_40; } /* * Since we use a u32 for rate bitmaps in * ieee80211_get_response_rate, we cannot * have more than 32 legacy rates. */ if (WARN_ON(sband->n_bitrates > 32)) return -EINVAL; for (i = 0; i < sband->n_channels; i++) { sband->channels[i].orig_flags = sband->channels[i].flags; sband->channels[i].orig_mag = INT_MAX; sband->channels[i].orig_mpwr = sband->channels[i].max_power; sband->channels[i].band = band; if (WARN_ON(sband->channels[i].freq_offset >= 1000)) return -EINVAL; } for_each_sband_iftype_data(sband, i, iftd) { bool has_ap, has_non_ap; u32 ap_bits = BIT(NL80211_IFTYPE_AP) | BIT(NL80211_IFTYPE_P2P_GO); if (WARN_ON(!iftd->types_mask)) return -EINVAL; if (WARN_ON(types & iftd->types_mask)) return -EINVAL; /* at least one piece of information must be present */ if (WARN_ON(!iftd->he_cap.has_he)) return -EINVAL; types |= iftd->types_mask; if (i == 0) have_he = iftd->he_cap.has_he; else have_he = have_he && iftd->he_cap.has_he; has_ap = iftd->types_mask & ap_bits; has_non_ap = iftd->types_mask & ~ap_bits; /* * For EHT 20 MHz STA, the capabilities format differs * but to simplify, don't check 20 MHz but rather check * only if AP and non-AP were mentioned at the same time, * reject if so. */ if (WARN_ON(iftd->eht_cap.has_eht && has_ap && has_non_ap)) return -EINVAL; } if (WARN_ON(!have_he && band == NL80211_BAND_6GHZ)) return -EINVAL; have_band = true; } if (!have_band) { WARN_ON(1); return -EINVAL; } for (i = 0; i < rdev->wiphy.n_vendor_commands; i++) { /* * Validate we have a policy (can be explicitly set to * VENDOR_CMD_RAW_DATA which is non-NULL) and also that * we have at least one of doit/dumpit. 
*/ if (WARN_ON(!rdev->wiphy.vendor_commands[i].policy)) return -EINVAL; if (WARN_ON(!rdev->wiphy.vendor_commands[i].doit && !rdev->wiphy.vendor_commands[i].dumpit)) return -EINVAL; } #ifdef CONFIG_PM if (WARN_ON(rdev->wiphy.wowlan && rdev->wiphy.wowlan->n_patterns && (!rdev->wiphy.wowlan->pattern_min_len || rdev->wiphy.wowlan->pattern_min_len > rdev->wiphy.wowlan->pattern_max_len))) return -EINVAL; #endif if (!wiphy->max_num_akm_suites) wiphy->max_num_akm_suites = NL80211_MAX_NR_AKM_SUITES; else if (wiphy->max_num_akm_suites < NL80211_MAX_NR_AKM_SUITES || wiphy->max_num_akm_suites > CFG80211_MAX_NUM_AKM_SUITES) return -EINVAL; /* Allocate radio configuration space for multi-radio wiphy */ if (wiphy->n_radio > 0) { int idx; wiphy->radio_cfg = kcalloc(wiphy->n_radio, sizeof(*wiphy->radio_cfg), GFP_KERNEL); if (!wiphy->radio_cfg) return -ENOMEM; /* * Initialize wiphy radio parameters to IEEE 802.11 * MIB default values. RTS threshold is disabled by * default with the special -1 value. */ for (idx = 0; idx < wiphy->n_radio; idx++) wiphy->radio_cfg[idx].rts_threshold = (u32)-1; } /* check and set up bitrates */ ieee80211_set_bitrate_flags(wiphy); rdev->wiphy.features |= NL80211_FEATURE_SCAN_FLUSH; if (rdev->wiphy.bss_param_support & WIPHY_BSS_PARAM_P2P_CTWINDOW) rdev->wiphy.features |= NL80211_FEATURE_P2P_GO_CTWIN; else if (rdev->wiphy.features & NL80211_FEATURE_P2P_GO_CTWIN) rdev->wiphy.bss_param_support |= WIPHY_BSS_PARAM_P2P_CTWINDOW; if (rdev->wiphy.bss_param_support & WIPHY_BSS_PARAM_P2P_OPPPS) rdev->wiphy.features |= NL80211_FEATURE_P2P_GO_OPPPS; else if (rdev->wiphy.features & NL80211_FEATURE_P2P_GO_OPPPS) rdev->wiphy.bss_param_support |= WIPHY_BSS_PARAM_P2P_OPPPS; rtnl_lock(); wiphy_lock(&rdev->wiphy); res = device_add(&rdev->wiphy.dev); if (res) { wiphy_unlock(&rdev->wiphy); rtnl_unlock(); return res; } list_add_rcu(&rdev->list, &cfg80211_rdev_list); cfg80211_rdev_list_generation++; /* add to debugfs */ rdev->wiphy.debugfsdir = debugfs_create_dir(wiphy_name(&rdev->wiphy), ieee80211_debugfs_dir); if (wiphy->n_radio > 0) { int idx; char radio_name[RADIO_DEBUGFSDIR_MAX_LEN]; for (idx = 0; idx < wiphy->n_radio; idx++) { scnprintf(radio_name, sizeof(radio_name), "radio%d", idx); wiphy->radio_cfg[idx].radio_debugfsdir = debugfs_create_dir(radio_name, rdev->wiphy.debugfsdir); } } cfg80211_debugfs_rdev_add(rdev); nl80211_notify_wiphy(rdev, NL80211_CMD_NEW_WIPHY); wiphy_unlock(&rdev->wiphy); /* set up regulatory info */ wiphy_regulatory_register(wiphy); if (wiphy->regulatory_flags & REGULATORY_CUSTOM_REG) { struct regulatory_request request = { .wiphy_idx = get_wiphy_idx(wiphy), .initiator = NL80211_REGDOM_SET_BY_DRIVER, .alpha2[0] = '9', .alpha2[1] = '9', }; nl80211_send_reg_change_event(&request); } /* Check that nobody globally advertises any capabilities they do not * advertise on all possible interface types. 
*/ if (wiphy->extended_capabilities_len && wiphy->num_iftype_ext_capab && wiphy->iftype_ext_capab) { u8 supported_on_all, j; const struct wiphy_iftype_ext_capab *capab; capab = wiphy->iftype_ext_capab; for (j = 0; j < wiphy->extended_capabilities_len; j++) { if (capab[0].extended_capabilities_len > j) supported_on_all = capab[0].extended_capabilities[j]; else supported_on_all = 0x00; for (i = 1; i < wiphy->num_iftype_ext_capab; i++) { if (j >= capab[i].extended_capabilities_len) { supported_on_all = 0x00; break; } supported_on_all &= capab[i].extended_capabilities[j]; } if (WARN_ON(wiphy->extended_capabilities[j] & ~supported_on_all)) break; } } rdev->wiphy.registered = true; rtnl_unlock(); res = rfkill_register(rdev->wiphy.rfkill); if (res) { rfkill_destroy(rdev->wiphy.rfkill); rdev->wiphy.rfkill = NULL; wiphy_unregister(&rdev->wiphy); return res; } return 0; } EXPORT_SYMBOL(wiphy_register); void wiphy_rfkill_start_polling(struct wiphy *wiphy) { struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); if (!rdev->ops->rfkill_poll) return; rdev->rfkill_ops.poll = cfg80211_rfkill_poll; rfkill_resume_polling(wiphy->rfkill); } EXPORT_SYMBOL(wiphy_rfkill_start_polling); void cfg80211_process_wiphy_works(struct cfg80211_registered_device *rdev, struct wiphy_work *end) { unsigned int runaway_limit = 100; unsigned long flags; lockdep_assert_held(&rdev->wiphy.mtx); spin_lock_irqsave(&rdev->wiphy_work_lock, flags); while (!list_empty(&rdev->wiphy_work_list)) { struct wiphy_work *wk; wk = list_first_entry(&rdev->wiphy_work_list, struct wiphy_work, entry); list_del_init(&wk->entry); spin_unlock_irqrestore(&rdev->wiphy_work_lock, flags); trace_wiphy_work_run(&rdev->wiphy, wk); wk->func(&rdev->wiphy, wk); spin_lock_irqsave(&rdev->wiphy_work_lock, flags); if (wk == end) break; if (WARN_ON(--runaway_limit == 0)) INIT_LIST_HEAD(&rdev->wiphy_work_list); } spin_unlock_irqrestore(&rdev->wiphy_work_lock, flags); } void wiphy_unregister(struct wiphy *wiphy) { struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); wait_event(rdev->dev_wait, ({ int __count; wiphy_lock(&rdev->wiphy); __count = rdev->opencount; wiphy_unlock(&rdev->wiphy); __count == 0; })); if (rdev->wiphy.rfkill) rfkill_unregister(rdev->wiphy.rfkill); rtnl_lock(); wiphy_lock(&rdev->wiphy); nl80211_notify_wiphy(rdev, NL80211_CMD_DEL_WIPHY); rdev->wiphy.registered = false; WARN_ON(!list_empty(&rdev->wiphy.wdev_list)); /* * First remove the hardware from everywhere, this makes * it impossible to find from userspace. 
*/ debugfs_remove_recursive(rdev->wiphy.debugfsdir); list_del_rcu(&rdev->list); synchronize_rcu(); /* * If this device got a regulatory hint tell core its * free to listen now to a new shiny device regulatory hint */ wiphy_regulatory_deregister(wiphy); cfg80211_rdev_list_generation++; device_del(&rdev->wiphy.dev); #ifdef CONFIG_PM if (rdev->wiphy.wowlan_config && rdev->ops->set_wakeup) rdev_set_wakeup(rdev, false); #endif /* surely nothing is reachable now, clean up work */ cfg80211_process_wiphy_works(rdev, NULL); wiphy_unlock(&rdev->wiphy); rtnl_unlock(); /* this has nothing to do now but make sure it's gone */ cancel_work_sync(&rdev->wiphy_work); cancel_work_sync(&rdev->conn_work); flush_work(&rdev->event_work); cancel_delayed_work_sync(&rdev->dfs_update_channels_wk); cancel_delayed_work_sync(&rdev->background_cac_done_wk); flush_work(&rdev->destroy_work); flush_work(&rdev->propagate_radar_detect_wk); flush_work(&rdev->propagate_cac_done_wk); flush_work(&rdev->mgmt_registrations_update_wk); flush_work(&rdev->background_cac_abort_wk); cfg80211_rdev_free_wowlan(rdev); cfg80211_free_coalesce(rdev->coalesce); rdev->coalesce = NULL; } EXPORT_SYMBOL(wiphy_unregister); void cfg80211_dev_free(struct cfg80211_registered_device *rdev) { struct cfg80211_internal_bss *scan, *tmp; struct cfg80211_beacon_registration *reg, *treg; unsigned long flags; spin_lock_irqsave(&rdev->wiphy_work_lock, flags); WARN_ON(!list_empty(&rdev->wiphy_work_list)); spin_unlock_irqrestore(&rdev->wiphy_work_lock, flags); cancel_work_sync(&rdev->wiphy_work); rfkill_destroy(rdev->wiphy.rfkill); list_for_each_entry_safe(reg, treg, &rdev->beacon_registrations, list) { list_del(&reg->list); kfree(reg); } list_for_each_entry_safe(scan, tmp, &rdev->bss_list, list) cfg80211_put_bss(&rdev->wiphy, &scan->pub); mutex_destroy(&rdev->wiphy.mtx); /* * The 'regd' can only be non-NULL if we never finished * initializing the wiphy and thus never went through the * unregister path - e.g. in failure scenarios. Thus, it * cannot have been visible to anyone if non-NULL, so we * can just free it here. 
*/ kfree(rcu_dereference_raw(rdev->wiphy.regd)); kfree(rdev); } void wiphy_free(struct wiphy *wiphy) { kfree(wiphy->radio_cfg); put_device(&wiphy->dev); } EXPORT_SYMBOL(wiphy_free); void wiphy_rfkill_set_hw_state_reason(struct wiphy *wiphy, bool blocked, enum rfkill_hard_block_reasons reason) { struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); if (rfkill_set_hw_state_reason(wiphy->rfkill, blocked, reason)) schedule_work(&rdev->rfkill_block); } EXPORT_SYMBOL(wiphy_rfkill_set_hw_state_reason); static void _cfg80211_unregister_wdev(struct wireless_dev *wdev, bool unregister_netdev) { struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); struct cfg80211_cqm_config *cqm_config; unsigned int link_id; ASSERT_RTNL(); lockdep_assert_held(&rdev->wiphy.mtx); nl80211_notify_iface(rdev, wdev, NL80211_CMD_DEL_INTERFACE); wdev->registered = false; if (wdev->netdev) { sysfs_remove_link(&wdev->netdev->dev.kobj, "phy80211"); if (unregister_netdev) unregister_netdevice(wdev->netdev); } list_del_rcu(&wdev->list); synchronize_net(); rdev->devlist_generation++; cfg80211_mlme_purge_registrations(wdev); switch (wdev->iftype) { case NL80211_IFTYPE_P2P_DEVICE: cfg80211_stop_p2p_device(rdev, wdev); break; case NL80211_IFTYPE_NAN: cfg80211_stop_nan(rdev, wdev); break; default: break; } #ifdef CONFIG_CFG80211_WEXT kfree_sensitive(wdev->wext.keys); wdev->wext.keys = NULL; #endif wiphy_work_cancel(wdev->wiphy, &wdev->cqm_rssi_work); /* deleted from the list, so can't be found from nl80211 any more */ cqm_config = rcu_access_pointer(wdev->cqm_config); kfree_rcu(cqm_config, rcu_head); RCU_INIT_POINTER(wdev->cqm_config, NULL); /* * Ensure that all events have been processed and * freed. */ cfg80211_process_wdev_events(wdev); if (wdev->iftype == NL80211_IFTYPE_STATION || wdev->iftype == NL80211_IFTYPE_P2P_CLIENT) { for (link_id = 0; link_id < ARRAY_SIZE(wdev->links); link_id++) { struct cfg80211_internal_bss *curbss; curbss = wdev->links[link_id].client.current_bss; if (WARN_ON(curbss)) { cfg80211_unhold_bss(curbss); cfg80211_put_bss(wdev->wiphy, &curbss->pub); wdev->links[link_id].client.current_bss = NULL; } } } wdev->connected = false; } void cfg80211_unregister_wdev(struct wireless_dev *wdev) { _cfg80211_unregister_wdev(wdev, true); } EXPORT_SYMBOL(cfg80211_unregister_wdev); static const struct device_type wiphy_type = { .name = "wlan", }; void cfg80211_update_iface_num(struct cfg80211_registered_device *rdev, enum nl80211_iftype iftype, int num) { lockdep_assert_held(&rdev->wiphy.mtx); rdev->num_running_ifaces += num; if (iftype == NL80211_IFTYPE_MONITOR) rdev->num_running_monitor_ifaces += num; } void cfg80211_leave(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev) { struct net_device *dev = wdev->netdev; struct cfg80211_sched_scan_request *pos, *tmp; lockdep_assert_held(&rdev->wiphy.mtx); cfg80211_pmsr_wdev_down(wdev); cfg80211_stop_radar_detection(wdev); cfg80211_stop_background_radar_detection(wdev); switch (wdev->iftype) { case NL80211_IFTYPE_ADHOC: cfg80211_leave_ibss(rdev, dev, true); break; case NL80211_IFTYPE_P2P_CLIENT: case NL80211_IFTYPE_STATION: list_for_each_entry_safe(pos, tmp, &rdev->sched_scan_req_list, list) { if (dev == pos->dev) cfg80211_stop_sched_scan_req(rdev, pos, false); } #ifdef CONFIG_CFG80211_WEXT kfree(wdev->wext.ie); wdev->wext.ie = NULL; wdev->wext.ie_len = 0; wdev->wext.connect.auth_type = NL80211_AUTHTYPE_AUTOMATIC; #endif cfg80211_disconnect(rdev, dev, WLAN_REASON_DEAUTH_LEAVING, true); break; case NL80211_IFTYPE_MESH_POINT: 
cfg80211_leave_mesh(rdev, dev); break; case NL80211_IFTYPE_AP: case NL80211_IFTYPE_P2P_GO: cfg80211_stop_ap(rdev, dev, -1, true); break; case NL80211_IFTYPE_OCB: cfg80211_leave_ocb(rdev, dev); break; case NL80211_IFTYPE_P2P_DEVICE: case NL80211_IFTYPE_NAN: /* cannot happen, has no netdev */ break; case NL80211_IFTYPE_AP_VLAN: case NL80211_IFTYPE_MONITOR: /* nothing to do */ break; case NL80211_IFTYPE_UNSPECIFIED: case NL80211_IFTYPE_WDS: case NUM_NL80211_IFTYPES: /* invalid */ break; } } void cfg80211_stop_iface(struct wiphy *wiphy, struct wireless_dev *wdev, gfp_t gfp) { struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); struct cfg80211_event *ev; unsigned long flags; trace_cfg80211_stop_iface(wiphy, wdev); ev = kzalloc(sizeof(*ev), gfp); if (!ev) return; ev->type = EVENT_STOPPED; spin_lock_irqsave(&wdev->event_lock, flags); list_add_tail(&ev->list, &wdev->event_list); spin_unlock_irqrestore(&wdev->event_lock, flags); queue_work(cfg80211_wq, &rdev->event_work); } EXPORT_SYMBOL(cfg80211_stop_iface); void cfg80211_init_wdev(struct wireless_dev *wdev) { INIT_LIST_HEAD(&wdev->event_list); spin_lock_init(&wdev->event_lock); INIT_LIST_HEAD(&wdev->mgmt_registrations); INIT_LIST_HEAD(&wdev->pmsr_list); spin_lock_init(&wdev->pmsr_lock); INIT_WORK(&wdev->pmsr_free_wk, cfg80211_pmsr_free_wk); #ifdef CONFIG_CFG80211_WEXT wdev->wext.default_key = -1; wdev->wext.default_mgmt_key = -1; wdev->wext.connect.auth_type = NL80211_AUTHTYPE_AUTOMATIC; #endif wiphy_work_init(&wdev->cqm_rssi_work, cfg80211_cqm_rssi_notify_work); if (wdev->wiphy->flags & WIPHY_FLAG_PS_ON_BY_DEFAULT) wdev->ps = true; else wdev->ps = false; /* allow mac80211 to determine the timeout */ wdev->ps_timeout = -1; wdev->radio_mask = BIT(wdev->wiphy->n_radio) - 1; if ((wdev->iftype == NL80211_IFTYPE_STATION || wdev->iftype == NL80211_IFTYPE_P2P_CLIENT || wdev->iftype == NL80211_IFTYPE_ADHOC) && !wdev->use_4addr) wdev->netdev->priv_flags |= IFF_DONT_BRIDGE; INIT_WORK(&wdev->disconnect_wk, cfg80211_autodisconnect_wk); } void cfg80211_register_wdev(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev) { ASSERT_RTNL(); lockdep_assert_held(&rdev->wiphy.mtx); /* * We get here also when the interface changes network namespaces, * as it's registered into the new one, but we don't want it to * change ID in that case. Checking if the ID is already assigned * works, because 0 isn't considered a valid ID and the memory is * 0-initialized. 
*/ if (!wdev->identifier) wdev->identifier = ++rdev->wdev_id; list_add_rcu(&wdev->list, &rdev->wiphy.wdev_list); rdev->devlist_generation++; wdev->registered = true; if (wdev->netdev && sysfs_create_link(&wdev->netdev->dev.kobj, &rdev->wiphy.dev.kobj, "phy80211")) pr_err("failed to add phy80211 symlink to netdev!\n"); nl80211_notify_iface(rdev, wdev, NL80211_CMD_NEW_INTERFACE); } int cfg80211_register_netdevice(struct net_device *dev) { struct wireless_dev *wdev = dev->ieee80211_ptr; struct cfg80211_registered_device *rdev; int ret; ASSERT_RTNL(); if (WARN_ON(!wdev)) return -EINVAL; rdev = wiphy_to_rdev(wdev->wiphy); lockdep_assert_held(&rdev->wiphy.mtx); /* we'll take care of this */ wdev->registered = true; wdev->registering = true; ret = register_netdevice(dev); if (ret) goto out; cfg80211_register_wdev(rdev, wdev); ret = 0; out: wdev->registering = false; if (ret) wdev->registered = false; return ret; } EXPORT_SYMBOL(cfg80211_register_netdevice); static int cfg80211_netdev_notifier_call(struct notifier_block *nb, unsigned long state, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct wireless_dev *wdev = dev->ieee80211_ptr; struct cfg80211_registered_device *rdev; struct cfg80211_sched_scan_request *pos, *tmp; if (!wdev) return NOTIFY_DONE; rdev = wiphy_to_rdev(wdev->wiphy); WARN_ON(wdev->iftype == NL80211_IFTYPE_UNSPECIFIED); switch (state) { case NETDEV_POST_INIT: SET_NETDEV_DEVTYPE(dev, &wiphy_type); wdev->netdev = dev; /* can only change netns with wiphy */ dev->netns_immutable = true; cfg80211_init_wdev(wdev); break; case NETDEV_REGISTER: if (!wdev->registered) { guard(wiphy)(&rdev->wiphy); cfg80211_register_wdev(rdev, wdev); } break; case NETDEV_UNREGISTER: /* * It is possible to get NETDEV_UNREGISTER multiple times, * so check wdev->registered. */ if (wdev->registered && !wdev->registering) { guard(wiphy)(&rdev->wiphy); _cfg80211_unregister_wdev(wdev, false); } break; case NETDEV_GOING_DOWN: scoped_guard(wiphy, &rdev->wiphy) { cfg80211_leave(rdev, wdev); cfg80211_remove_links(wdev); } /* since we just did cfg80211_leave() nothing to do there */ cancel_work_sync(&wdev->disconnect_wk); cancel_work_sync(&wdev->pmsr_free_wk); break; case NETDEV_DOWN: wiphy_lock(&rdev->wiphy); cfg80211_update_iface_num(rdev, wdev->iftype, -1); if (rdev->scan_req && rdev->scan_req->req.wdev == wdev) { if (WARN_ON(!rdev->scan_req->notified && (!rdev->int_scan_req || !rdev->int_scan_req->notified))) rdev->scan_req->info.aborted = true; ___cfg80211_scan_done(rdev, false); } list_for_each_entry_safe(pos, tmp, &rdev->sched_scan_req_list, list) { if (WARN_ON(pos->dev == wdev->netdev)) cfg80211_stop_sched_scan_req(rdev, pos, false); } rdev->opencount--; wiphy_unlock(&rdev->wiphy); wake_up(&rdev->dev_wait); break; case NETDEV_UP: wiphy_lock(&rdev->wiphy); cfg80211_update_iface_num(rdev, wdev->iftype, 1); switch (wdev->iftype) { #ifdef CONFIG_CFG80211_WEXT case NL80211_IFTYPE_ADHOC: cfg80211_ibss_wext_join(rdev, wdev); break; case NL80211_IFTYPE_STATION: cfg80211_mgd_wext_connect(rdev, wdev); break; #endif #ifdef CONFIG_MAC80211_MESH case NL80211_IFTYPE_MESH_POINT: { /* backward compat code... 
*/ struct mesh_setup setup; memcpy(&setup, &default_mesh_setup, sizeof(setup)); /* back compat only needed for mesh_id */ setup.mesh_id = wdev->u.mesh.id; setup.mesh_id_len = wdev->u.mesh.id_up_len; if (wdev->u.mesh.id_up_len) __cfg80211_join_mesh(rdev, dev, &setup, &default_mesh_config); break; } #endif default: break; } rdev->opencount++; /* * Configure power management to the driver here so that its * correctly set also after interface type changes etc. */ if ((wdev->iftype == NL80211_IFTYPE_STATION || wdev->iftype == NL80211_IFTYPE_P2P_CLIENT) && rdev->ops->set_power_mgmt && rdev_set_power_mgmt(rdev, dev, wdev->ps, wdev->ps_timeout)) { /* assume this means it's off */ wdev->ps = false; } wiphy_unlock(&rdev->wiphy); break; case NETDEV_PRE_UP: if (!cfg80211_iftype_allowed(wdev->wiphy, wdev->iftype, wdev->use_4addr, 0)) return notifier_from_errno(-EOPNOTSUPP); if (rfkill_blocked(rdev->wiphy.rfkill)) return notifier_from_errno(-ERFKILL); break; default: return NOTIFY_DONE; } wireless_nlevent_flush(); return NOTIFY_OK; } static struct notifier_block cfg80211_netdev_notifier = { .notifier_call = cfg80211_netdev_notifier_call, }; static void __net_exit cfg80211_pernet_exit(struct net *net) { struct cfg80211_registered_device *rdev; rtnl_lock(); for_each_rdev(rdev) { if (net_eq(wiphy_net(&rdev->wiphy), net)) WARN_ON(cfg80211_switch_netns(rdev, &init_net)); } rtnl_unlock(); } static struct pernet_operations cfg80211_pernet_ops = { .exit = cfg80211_pernet_exit, }; void wiphy_work_queue(struct wiphy *wiphy, struct wiphy_work *work) { struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); unsigned long flags; trace_wiphy_work_queue(wiphy, work); spin_lock_irqsave(&rdev->wiphy_work_lock, flags); if (list_empty(&work->entry)) list_add_tail(&work->entry, &rdev->wiphy_work_list); spin_unlock_irqrestore(&rdev->wiphy_work_lock, flags); queue_work(system_dfl_wq, &rdev->wiphy_work); } EXPORT_SYMBOL_GPL(wiphy_work_queue); void wiphy_work_cancel(struct wiphy *wiphy, struct wiphy_work *work) { struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); unsigned long flags; lockdep_assert_held(&wiphy->mtx); trace_wiphy_work_cancel(wiphy, work); spin_lock_irqsave(&rdev->wiphy_work_lock, flags); if (!list_empty(&work->entry)) list_del_init(&work->entry); spin_unlock_irqrestore(&rdev->wiphy_work_lock, flags); } EXPORT_SYMBOL_GPL(wiphy_work_cancel); void wiphy_work_flush(struct wiphy *wiphy, struct wiphy_work *work) { struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); unsigned long flags; bool run; trace_wiphy_work_flush(wiphy, work); spin_lock_irqsave(&rdev->wiphy_work_lock, flags); run = !work || !list_empty(&work->entry); spin_unlock_irqrestore(&rdev->wiphy_work_lock, flags); if (run) cfg80211_process_wiphy_works(rdev, work); } EXPORT_SYMBOL_GPL(wiphy_work_flush); void wiphy_delayed_work_timer(struct timer_list *t) { struct wiphy_delayed_work *dwork = timer_container_of(dwork, t, timer); wiphy_work_queue(dwork->wiphy, &dwork->work); } EXPORT_SYMBOL(wiphy_delayed_work_timer); void wiphy_delayed_work_queue(struct wiphy *wiphy, struct wiphy_delayed_work *dwork, unsigned long delay) { trace_wiphy_delayed_work_queue(wiphy, &dwork->work, delay); if (!delay) { timer_delete(&dwork->timer); wiphy_work_queue(wiphy, &dwork->work); return; } dwork->wiphy = wiphy; mod_timer(&dwork->timer, jiffies + delay); } EXPORT_SYMBOL_GPL(wiphy_delayed_work_queue); void wiphy_delayed_work_cancel(struct wiphy *wiphy, struct wiphy_delayed_work *dwork) { lockdep_assert_held(&wiphy->mtx); 
timer_delete_sync(&dwork->timer); wiphy_work_cancel(wiphy, &dwork->work); } EXPORT_SYMBOL_GPL(wiphy_delayed_work_cancel); void wiphy_delayed_work_flush(struct wiphy *wiphy, struct wiphy_delayed_work *dwork) { lockdep_assert_held(&wiphy->mtx); timer_delete_sync(&dwork->timer); wiphy_work_flush(wiphy, &dwork->work); } EXPORT_SYMBOL_GPL(wiphy_delayed_work_flush); bool wiphy_delayed_work_pending(struct wiphy *wiphy, struct wiphy_delayed_work *dwork) { return timer_pending(&dwork->timer); } EXPORT_SYMBOL_GPL(wiphy_delayed_work_pending); enum hrtimer_restart wiphy_hrtimer_work_timer(struct hrtimer *t) { struct wiphy_hrtimer_work *hrwork = container_of(t, struct wiphy_hrtimer_work, timer); wiphy_work_queue(hrwork->wiphy, &hrwork->work); return HRTIMER_NORESTART; } EXPORT_SYMBOL_GPL(wiphy_hrtimer_work_timer); void wiphy_hrtimer_work_queue(struct wiphy *wiphy, struct wiphy_hrtimer_work *hrwork, ktime_t delay) { trace_wiphy_hrtimer_work_queue(wiphy, &hrwork->work, delay); if (!delay) { hrtimer_cancel(&hrwork->timer); wiphy_work_queue(wiphy, &hrwork->work); return; } hrwork->wiphy = wiphy; hrtimer_start_range_ns(&hrwork->timer, delay, 1000 * NSEC_PER_USEC, HRTIMER_MODE_REL); } EXPORT_SYMBOL_GPL(wiphy_hrtimer_work_queue); void wiphy_hrtimer_work_cancel(struct wiphy *wiphy, struct wiphy_hrtimer_work *hrwork) { lockdep_assert_held(&wiphy->mtx); hrtimer_cancel(&hrwork->timer); wiphy_work_cancel(wiphy, &hrwork->work); } EXPORT_SYMBOL_GPL(wiphy_hrtimer_work_cancel); void wiphy_hrtimer_work_flush(struct wiphy *wiphy, struct wiphy_hrtimer_work *hrwork) { lockdep_assert_held(&wiphy->mtx); hrtimer_cancel(&hrwork->timer); wiphy_work_flush(wiphy, &hrwork->work); } EXPORT_SYMBOL_GPL(wiphy_hrtimer_work_flush); bool wiphy_hrtimer_work_pending(struct wiphy *wiphy, struct wiphy_hrtimer_work *hrwork) { return hrtimer_is_queued(&hrwork->timer); } EXPORT_SYMBOL_GPL(wiphy_hrtimer_work_pending); static int __init cfg80211_init(void) { int err; err = register_pernet_device(&cfg80211_pernet_ops); if (err) goto out_fail_pernet; err = wiphy_sysfs_init(); if (err) goto out_fail_sysfs; err = register_netdevice_notifier(&cfg80211_netdev_notifier); if (err) goto out_fail_notifier; err = nl80211_init(); if (err) goto out_fail_nl80211; ieee80211_debugfs_dir = debugfs_create_dir("ieee80211", NULL); err = regulatory_init(); if (err) goto out_fail_reg; cfg80211_wq = alloc_ordered_workqueue("cfg80211", WQ_MEM_RECLAIM); if (!cfg80211_wq) { err = -ENOMEM; goto out_fail_wq; } return 0; out_fail_wq: regulatory_exit(); out_fail_reg: debugfs_remove(ieee80211_debugfs_dir); nl80211_exit(); out_fail_nl80211: unregister_netdevice_notifier(&cfg80211_netdev_notifier); out_fail_notifier: wiphy_sysfs_exit(); out_fail_sysfs: unregister_pernet_device(&cfg80211_pernet_ops); out_fail_pernet: return err; } fs_initcall(cfg80211_init); static void __exit cfg80211_exit(void) { debugfs_remove(ieee80211_debugfs_dir); nl80211_exit(); unregister_netdevice_notifier(&cfg80211_netdev_notifier); wiphy_sysfs_exit(); regulatory_exit(); unregister_pernet_device(&cfg80211_pernet_ops); destroy_workqueue(cfg80211_wq); } module_exit(cfg80211_exit);
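/*
 * Illustrative sketch (not part of core.c): how a driver might consume the
 * wiphy allocation/registration API shown above.  The cfg80211_ops table
 * "my_cfg_ops", the pre-filled 2 GHz sband "my_band_2ghz" and the private
 * struct "my_priv" are hypothetical driver-side objects, and error handling
 * is abbreviated; this is a sketch of the intended call sequence, not a
 * complete driver.
 */
#include <net/cfg80211.h>

/* hypothetical, assumed to be provided elsewhere by the driver */
extern const struct cfg80211_ops my_cfg_ops;
extern struct ieee80211_supported_band my_band_2ghz;

struct my_priv {
	struct ieee80211_channel *home_chan;
};

static struct wiphy *my_wiphy_setup(struct device *dev)
{
	struct wiphy *wiphy;

	/* wiphy_new() is wiphy_new_nm() with the default "phy%d" naming */
	wiphy = wiphy_new(&my_cfg_ops, sizeof(struct my_priv));
	if (!wiphy)
		return NULL;

	set_wiphy_dev(wiphy, dev);
	wiphy->interface_modes = BIT(NL80211_IFTYPE_STATION);
	wiphy->bands[NL80211_BAND_2GHZ] = &my_band_2ghz;

	/* wiphy_register() runs the sanity checks implemented above */
	if (wiphy_register(wiphy)) {
		wiphy_free(wiphy);
		return NULL;
	}
	return wiphy;
}

static void my_wiphy_teardown(struct wiphy *wiphy)
{
	wiphy_unregister(wiphy);
	wiphy_free(wiphy);
}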
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Aug 8, 2011 Bob Pearson with help from Joakim Tjernlund and George Spelvin
 * cleaned up code to current version of sparse and added the slicing-by-8
 * algorithm to the closely similar existing slicing-by-4 algorithm.
 *
 * Oct 15, 2000 Matt Domsch <Matt_Domsch@dell.com>
 * Nicer crc32 functions/docs submitted by linux@horizon.com. Thanks!
 * Code was from the public domain, copyright abandoned. Code was
 * subsequently included in the kernel, thus was re-licensed under the
 * GNU GPL v2.
 *
 * Oct 12, 2000 Matt Domsch <Matt_Domsch@dell.com>
 * Same crc32 function was used in 5 other places in the kernel.
 * I made one version, and deleted the others.
 * There are various incantations of crc32(). Some use a seed of 0 or ~0.
 * Some xor at the end with ~0. The generic crc32() function takes
 * seed as an argument, and doesn't xor at the end. Then individual
 * users can do whatever they need.
 * drivers/net/smc9194.c uses seed ~0, doesn't xor with ~0.
 * fs/jffs2 uses seed 0, doesn't xor with ~0.
 * fs/partitions/efi.c uses seed ~0, xor's with ~0.
 */

/* see: Documentation/staging/crc32.rst for a description of algorithms */

#include <linux/crc32.h>
#include <linux/export.h>
#include <linux/module.h>
#include <linux/types.h>

#include "crc32table.h"

static inline u32 __maybe_unused crc32_le_base(u32 crc, const u8 *p, size_t len)
{
	while (len--)
		crc = (crc >> 8) ^ crc32table_le[(crc & 255) ^ *p++];
	return crc;
}

static inline u32 __maybe_unused crc32_be_base(u32 crc, const u8 *p, size_t len)
{
	while (len--)
		crc = (crc << 8) ^ crc32table_be[(crc >> 24) ^ *p++];
	return crc;
}

static inline u32 __maybe_unused crc32c_base(u32 crc, const u8 *p, size_t len)
{
	while (len--)
		crc = (crc >> 8) ^ crc32ctable_le[(crc & 255) ^ *p++];
	return crc;
}

#ifdef CONFIG_CRC32_ARCH
#include "crc32.h" /* $(SRCARCH)/crc32.h */

u32 crc32_optimizations(void)
{
	return crc32_optimizations_arch();
}
EXPORT_SYMBOL(crc32_optimizations);
#else
#define crc32_le_arch crc32_le_base
#define crc32_be_arch crc32_be_base
#define crc32c_arch crc32c_base
#endif

u32 crc32_le(u32 crc, const void *p, size_t len)
{
	return crc32_le_arch(crc, p, len);
}
EXPORT_SYMBOL(crc32_le);

u32 crc32_be(u32 crc, const void *p, size_t len)
{
	return crc32_be_arch(crc, p, len);
}
EXPORT_SYMBOL(crc32_be);

u32 crc32c(u32 crc, const void *p, size_t len)
{
	return crc32c_arch(crc, p, len);
}
EXPORT_SYMBOL(crc32c);

#ifdef crc32_mod_init_arch
static int __init crc32_mod_init(void)
{
	crc32_mod_init_arch();
	return 0;
}
subsys_initcall(crc32_mod_init);

static void __exit crc32_mod_exit(void)
{
}
module_exit(crc32_mod_exit);
#endif

MODULE_DESCRIPTION("CRC32 library functions");
MODULE_LICENSE("GPL");
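/*
 * Illustrative sketch (not part of crc32.c): the seed/final-xor conventions
 * described in the header comment above, applied through the exported
 * helpers defined in this file.  The wrappers "my_crc32" and "my_crc32c"
 * are hypothetical names chosen for the example; the seed-~0 / xor-~0
 * convention matches the efi.c usage called out in the comment.
 */
#include <linux/crc32.h>
#include <linux/types.h>

static u32 my_crc32(const void *buf, size_t len)
{
	/* seed with ~0 and invert the result, as efi.c does per the comment */
	return crc32_le(~0U, buf, len) ^ ~0U;
}

static u32 my_crc32c(const void *buf, size_t len)
{
	/* same seed/xor convention applied to the Castagnoli polynomial */
	return crc32c(~0U, buf, len) ^ ~0U;
}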
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_SCHED_SIGNAL_H
#define _LINUX_SCHED_SIGNAL_H

#include <linux/rculist.h>
#include <linux/signal.h>
#include <linux/sched.h>
#include <linux/sched/jobctl.h>
#include <linux/sched/task.h>
#include <linux/cred.h>
#include <linux/refcount.h>
#include <linux/pid.h>
#include <linux/posix-timers.h>
#include <linux/mm_types.h>
#include <asm/ptrace.h>

/*
 * Types defining task->signal and task->sighand
and APIs using them: */ struct sighand_struct { spinlock_t siglock; refcount_t count; wait_queue_head_t signalfd_wqh; struct k_sigaction action[_NSIG]; }; /* * Per-process accounting stats: */ struct pacct_struct { int ac_flag; long ac_exitcode; unsigned long ac_mem; u64 ac_utime, ac_stime; unsigned long ac_minflt, ac_majflt; }; struct cpu_itimer { u64 expires; u64 incr; }; /* * This is the atomic variant of task_cputime, which can be used for * storing and updating task_cputime statistics without locking. */ struct task_cputime_atomic { atomic64_t utime; atomic64_t stime; atomic64_t sum_exec_runtime; }; #define INIT_CPUTIME_ATOMIC \ (struct task_cputime_atomic) { \ .utime = ATOMIC64_INIT(0), \ .stime = ATOMIC64_INIT(0), \ .sum_exec_runtime = ATOMIC64_INIT(0), \ } /** * struct thread_group_cputimer - thread group interval timer counts * @cputime_atomic: atomic thread group interval timers. * * This structure contains the version of task_cputime, above, that is * used for thread group CPU timer calculations. */ struct thread_group_cputimer { struct task_cputime_atomic cputime_atomic; }; struct multiprocess_signals { sigset_t signal; struct hlist_node node; }; struct core_thread { struct task_struct *task; struct core_thread *next; }; struct core_state { atomic_t nr_threads; struct core_thread dumper; struct completion startup; }; /* * NOTE! "signal_struct" does not have its own * locking, because a shared signal_struct always * implies a shared sighand_struct, so locking * sighand_struct is always a proper superset of * the locking of signal_struct. */ struct signal_struct { refcount_t sigcnt; atomic_t live; int nr_threads; int quick_threads; struct list_head thread_head; wait_queue_head_t wait_chldexit; /* for wait4() */ /* current thread group signal load-balancing target: */ struct task_struct *curr_target; /* shared signal handling: */ struct sigpending shared_pending; /* For collecting multiprocess signals during fork */ struct hlist_head multiprocess; /* thread group exit support */ int group_exit_code; /* notify group_exec_task when notify_count is less or equal to 0 */ int notify_count; struct task_struct *group_exec_task; /* thread group stop support, overloads group_exit_code too */ int group_stop_count; unsigned int flags; /* see SIGNAL_* flags below */ struct core_state *core_state; /* coredumping support */ /* * PR_SET_CHILD_SUBREAPER marks a process, like a service * manager, to re-parent orphan (double-forking) child processes * to this process instead of 'init'. The service manager is * able to receive SIGCHLD signals and is able to investigate * the process until it calls wait(). All children of this * process will inherit a flag if they should look for a * child_subreaper process at exit. */ unsigned int is_child_subreaper:1; unsigned int has_child_subreaper:1; #ifdef CONFIG_POSIX_TIMERS /* POSIX.1b Interval Timers */ unsigned int timer_create_restore_ids:1; atomic_t next_posix_timer_id; struct hlist_head posix_timers; struct hlist_head ignored_posix_timers; /* ITIMER_REAL timer for the process */ struct hrtimer real_timer; ktime_t it_real_incr; /* * ITIMER_PROF and ITIMER_VIRTUAL timers for the process, we use * CPUCLOCK_PROF and CPUCLOCK_VIRT for indexing array as these * values are defined to 0 and 1 respectively */ struct cpu_itimer it[2]; /* * Thread group totals for process CPU timers. * See thread_group_cputimer(), et al, for details. 
*/ struct thread_group_cputimer cputimer; #endif /* Empty if CONFIG_POSIX_TIMERS=n */ struct posix_cputimers posix_cputimers; /* PID/PID hash table linkage. */ struct pid *pids[PIDTYPE_MAX]; #ifdef CONFIG_NO_HZ_FULL atomic_t tick_dep_mask; #endif struct pid *tty_old_pgrp; /* boolean value for session group leader */ int leader; struct tty_struct *tty; /* NULL if no tty */ #ifdef CONFIG_SCHED_AUTOGROUP struct autogroup *autogroup; #endif /* * Cumulative resource counters for dead threads in the group, * and for reaped dead child processes forked by this group. * Live threads maintain their own counters and add to these * in __exit_signal, except for the group leader. */ seqlock_t stats_lock; u64 utime, stime, cutime, cstime; u64 gtime; u64 cgtime; struct prev_cputime prev_cputime; unsigned long nvcsw, nivcsw, cnvcsw, cnivcsw; unsigned long min_flt, maj_flt, cmin_flt, cmaj_flt; unsigned long inblock, oublock, cinblock, coublock; unsigned long maxrss, cmaxrss; struct task_io_accounting ioac; /* * Cumulative ns of schedule CPU time fo dead threads in the * group, not including a zombie group leader, (This only differs * from jiffies_to_ns(utime + stime) if sched_clock uses something * other than jiffies.) */ unsigned long long sum_sched_runtime; /* * We don't bother to synchronize most readers of this at all, * because there is no reader checking a limit that actually needs * to get both rlim_cur and rlim_max atomically, and either one * alone is a single word that can safely be read normally. * getrlimit/setrlimit use task_lock(current->group_leader) to * protect this instead of the siglock, because they really * have no need to disable irqs. */ struct rlimit rlim[RLIM_NLIMITS]; #ifdef CONFIG_BSD_PROCESS_ACCT struct pacct_struct pacct; /* per-process accounting information */ #endif #ifdef CONFIG_TASKSTATS struct taskstats *stats; #endif #ifdef CONFIG_AUDIT unsigned audit_tty; struct tty_audit_buf *tty_audit_buf; #endif #ifdef CONFIG_CGROUPS struct rw_semaphore cgroup_threadgroup_rwsem; #endif /* * Thread is the potential origin of an oom condition; kill first on * oom */ bool oom_flag_origin; short oom_score_adj; /* OOM kill score adjustment */ short oom_score_adj_min; /* OOM kill score adjustment min value. * Only settable by CAP_SYS_RESOURCE. */ struct mm_struct *oom_mm; /* recorded mm when the thread group got * killed by the oom killer */ struct mutex cred_guard_mutex; /* guard against foreign influences on * credential calculations * (notably. ptrace) * Deprecated do not use in new code. * Use exec_update_lock instead. */ struct rw_semaphore exec_update_lock; /* Held while task_struct is * being updated during exec, * and may have inconsistent * permissions. */ } __randomize_layout; /* * Bits in flags field of signal_struct. */ #define SIGNAL_STOP_STOPPED 0x00000001 /* job control stop in effect */ #define SIGNAL_STOP_CONTINUED 0x00000002 /* SIGCONT since WCONTINUED reap */ #define SIGNAL_GROUP_EXIT 0x00000004 /* group exit in progress */ /* * Pending notifications to parent. 
*/ #define SIGNAL_CLD_STOPPED 0x00000010 #define SIGNAL_CLD_CONTINUED 0x00000020 #define SIGNAL_CLD_MASK (SIGNAL_CLD_STOPPED|SIGNAL_CLD_CONTINUED) #define SIGNAL_UNKILLABLE 0x00000040 /* for init: ignore fatal signals */ #define SIGNAL_STOP_MASK (SIGNAL_CLD_MASK | SIGNAL_STOP_STOPPED | \ SIGNAL_STOP_CONTINUED) static inline void signal_set_stop_flags(struct signal_struct *sig, unsigned int flags) { WARN_ON(sig->flags & SIGNAL_GROUP_EXIT); sig->flags = (sig->flags & ~SIGNAL_STOP_MASK) | flags; } extern void flush_signals(struct task_struct *); extern void ignore_signals(struct task_struct *); extern void flush_signal_handlers(struct task_struct *, int force_default); extern int dequeue_signal(sigset_t *mask, kernel_siginfo_t *info, enum pid_type *type); static inline int kernel_dequeue_signal(void) { struct task_struct *task = current; kernel_siginfo_t __info; enum pid_type __type; int ret; spin_lock_irq(&task->sighand->siglock); ret = dequeue_signal(&task->blocked, &__info, &__type); spin_unlock_irq(&task->sighand->siglock); return ret; } static inline void kernel_signal_stop(void) { spin_lock_irq(&current->sighand->siglock); if (current->jobctl & JOBCTL_STOP_DEQUEUED) { current->jobctl |= JOBCTL_STOPPED; set_special_state(TASK_STOPPED); } spin_unlock_irq(&current->sighand->siglock); schedule(); } int force_sig_fault_to_task(int sig, int code, void __user *addr, struct task_struct *t); int force_sig_fault(int sig, int code, void __user *addr); int send_sig_fault(int sig, int code, void __user *addr, struct task_struct *t); int force_sig_mceerr(int code, void __user *, short); int send_sig_mceerr(int code, void __user *, short, struct task_struct *); int force_sig_bnderr(void __user *addr, void __user *lower, void __user *upper); int force_sig_pkuerr(void __user *addr, u32 pkey); int send_sig_perf(void __user *addr, u32 type, u64 sig_data); int force_sig_ptrace_errno_trap(int errno, void __user *addr); int force_sig_fault_trapno(int sig, int code, void __user *addr, int trapno); int send_sig_fault_trapno(int sig, int code, void __user *addr, int trapno, struct task_struct *t); int force_sig_seccomp(int syscall, int reason, bool force_coredump); extern int send_sig_info(int, struct kernel_siginfo *, struct task_struct *); extern void force_sigsegv(int sig); extern int force_sig_info(struct kernel_siginfo *); extern int __kill_pgrp_info(int sig, struct kernel_siginfo *info, struct pid *pgrp); extern int kill_pid_info(int sig, struct kernel_siginfo *info, struct pid *pid); extern int kill_pid_usb_asyncio(int sig, int errno, sigval_t addr, struct pid *, const struct cred *); extern int kill_pgrp(struct pid *pid, int sig, int priv); extern int kill_pid(struct pid *pid, int sig, int priv); extern __must_check bool do_notify_parent(struct task_struct *, int); extern void __wake_up_parent(struct task_struct *p, struct task_struct *parent); extern void force_sig(int); extern void force_fatal_sig(int); extern void force_exit_sig(int); extern int send_sig(int, struct task_struct *, int); extern int zap_other_threads(struct task_struct *p); extern int do_sigaction(int, struct k_sigaction *, struct k_sigaction *); static inline void clear_notify_signal(void) { clear_thread_flag(TIF_NOTIFY_SIGNAL); smp_mb__after_atomic(); } /* * Returns 'true' if kick_process() is needed to force a transition from * user -> kernel to guarantee expedient run of TWA_SIGNAL based task_work. 
*/ static inline bool __set_notify_signal(struct task_struct *task) { return !test_and_set_tsk_thread_flag(task, TIF_NOTIFY_SIGNAL) && !wake_up_state(task, TASK_INTERRUPTIBLE); } /* * Called to break out of interruptible wait loops, and enter the * exit_to_user_mode_loop(). */ static inline void set_notify_signal(struct task_struct *task) { if (__set_notify_signal(task)) kick_process(task); } static inline int restart_syscall(void) { set_tsk_thread_flag(current, TIF_SIGPENDING); return -ERESTARTNOINTR; } static inline int task_sigpending(struct task_struct *p) { return unlikely(test_tsk_thread_flag(p,TIF_SIGPENDING)); } static inline int signal_pending(struct task_struct *p) { /* * TIF_NOTIFY_SIGNAL isn't really a signal, but it requires the same * behavior in terms of ensuring that we break out of wait loops * so that notify signal callbacks can be processed. */ if (unlikely(test_tsk_thread_flag(p, TIF_NOTIFY_SIGNAL))) return 1; return task_sigpending(p); } static inline int __fatal_signal_pending(struct task_struct *p) { return unlikely(sigismember(&p->pending.signal, SIGKILL)); } static inline int fatal_signal_pending(struct task_struct *p) { return task_sigpending(p) && __fatal_signal_pending(p); } static inline int signal_pending_state(unsigned int state, struct task_struct *p) { if (!(state & (TASK_INTERRUPTIBLE | TASK_WAKEKILL))) return 0; if (!signal_pending(p)) return 0; return (state & TASK_INTERRUPTIBLE) || __fatal_signal_pending(p); } /* * This should only be used in fault handlers to decide whether we * should stop the current fault routine to handle the signals * instead, especially with the case where we've got interrupted with * a VM_FAULT_RETRY. */ static inline bool fault_signal_pending(vm_fault_t fault_flags, struct pt_regs *regs) { return unlikely((fault_flags & VM_FAULT_RETRY) && (fatal_signal_pending(current) || (user_mode(regs) && signal_pending(current)))); } /* * Reevaluate whether the task has signals pending delivery. * Wake the task if so. * This is required every time the blocked sigset_t changes. * callers must hold sighand->siglock. */ extern void recalc_sigpending(void); extern void calculate_sigpending(void); extern void signal_wake_up_state(struct task_struct *t, unsigned int state); static inline void signal_wake_up(struct task_struct *t, bool fatal) { unsigned int state = 0; if (fatal && !(t->jobctl & JOBCTL_PTRACE_FROZEN)) { t->jobctl &= ~(JOBCTL_STOPPED | JOBCTL_TRACED); state = TASK_WAKEKILL | __TASK_TRACED; } signal_wake_up_state(t, state); } static inline void ptrace_signal_wake_up(struct task_struct *t, bool resume) { unsigned int state = 0; if (resume) { t->jobctl &= ~JOBCTL_TRACED; state = __TASK_TRACED; } signal_wake_up_state(t, state); } void task_join_group_stop(struct task_struct *task); #ifdef TIF_RESTORE_SIGMASK /* * Legacy restore_sigmask accessors. These are inefficient on * SMP architectures because they require atomic operations. */ /** * set_restore_sigmask() - make sure saved_sigmask processing gets done * * This sets TIF_RESTORE_SIGMASK and ensures that the arch signal code * will run before returning to user mode, to process the flag. For * all callers, TIF_SIGPENDING is already set or it's no harm to set * it. TIF_RESTORE_SIGMASK need not be in the set of bits that the * arch code will notice on return to user mode, in case those bits * are scarce. We set TIF_SIGPENDING here to ensure that the arch * signal code always gets run when TIF_RESTORE_SIGMASK is set. 
*/ static inline void set_restore_sigmask(void) { set_thread_flag(TIF_RESTORE_SIGMASK); } static inline void clear_tsk_restore_sigmask(struct task_struct *task) { clear_tsk_thread_flag(task, TIF_RESTORE_SIGMASK); } static inline void clear_restore_sigmask(void) { clear_thread_flag(TIF_RESTORE_SIGMASK); } static inline bool test_tsk_restore_sigmask(struct task_struct *task) { return test_tsk_thread_flag(task, TIF_RESTORE_SIGMASK); } static inline bool test_restore_sigmask(void) { return test_thread_flag(TIF_RESTORE_SIGMASK); } static inline bool test_and_clear_restore_sigmask(void) { return test_and_clear_thread_flag(TIF_RESTORE_SIGMASK); } #else /* TIF_RESTORE_SIGMASK */ /* Higher-quality implementation, used if TIF_RESTORE_SIGMASK doesn't exist. */ static inline void set_restore_sigmask(void) { current->restore_sigmask = true; } static inline void clear_tsk_restore_sigmask(struct task_struct *task) { task->restore_sigmask = false; } static inline void clear_restore_sigmask(void) { current->restore_sigmask = false; } static inline bool test_restore_sigmask(void) { return current->restore_sigmask; } static inline bool test_tsk_restore_sigmask(struct task_struct *task) { return task->restore_sigmask; } static inline bool test_and_clear_restore_sigmask(void) { if (!current->restore_sigmask) return false; current->restore_sigmask = false; return true; } #endif static inline void restore_saved_sigmask(void) { if (test_and_clear_restore_sigmask()) __set_current_blocked(&current->saved_sigmask); } extern int set_user_sigmask(const sigset_t __user *umask, size_t sigsetsize); static inline void restore_saved_sigmask_unless(bool interrupted) { if (interrupted) WARN_ON(!signal_pending(current)); else restore_saved_sigmask(); } static inline sigset_t *sigmask_to_save(void) { sigset_t *res = &current->blocked; if (unlikely(test_restore_sigmask())) res = &current->saved_sigmask; return res; } static inline int kill_cad_pid(int sig, int priv) { return kill_pid(cad_pid, sig, priv); } /* These can be the second arg to send_sig_info/send_group_sig_info. */ #define SEND_SIG_NOINFO ((struct kernel_siginfo *) 0) #define SEND_SIG_PRIV ((struct kernel_siginfo *) 1) static inline int __on_sig_stack(unsigned long sp) { #ifdef CONFIG_STACK_GROWSUP return sp >= current->sas_ss_sp && sp - current->sas_ss_sp < current->sas_ss_size; #else return sp > current->sas_ss_sp && sp - current->sas_ss_sp <= current->sas_ss_size; #endif } /* * True if we are on the alternate signal stack. */ static inline int on_sig_stack(unsigned long sp) { /* * If the signal stack is SS_AUTODISARM then, by construction, we * can't be on the signal stack unless user code deliberately set * SS_AUTODISARM when we were already on it. * * This improves reliability: if user state gets corrupted such that * the stack pointer points very close to the end of the signal stack, * then this check will enable the signal to be handled anyway. */ if (current->sas_ss_flags & SS_AUTODISARM) return 0; return __on_sig_stack(sp); } static inline int sas_ss_flags(unsigned long sp) { if (!current->sas_ss_size) return SS_DISABLE; return on_sig_stack(sp) ? SS_ONSTACK : 0; } static inline void sas_ss_reset(struct task_struct *p) { p->sas_ss_sp = 0; p->sas_ss_size = 0; p->sas_ss_flags = SS_DISABLE; } static inline unsigned long sigsp(unsigned long sp, struct ksignal *ksig) { if (unlikely((ksig->ka.sa.sa_flags & SA_ONSTACK)) && ! 
sas_ss_flags(sp)) #ifdef CONFIG_STACK_GROWSUP return current->sas_ss_sp; #else return current->sas_ss_sp + current->sas_ss_size; #endif return sp; } extern void __cleanup_sighand(struct sighand_struct *); extern void flush_itimer_signals(void); #define tasklist_empty() \ list_empty(&init_task.tasks) #define next_task(p) \ list_entry_rcu((p)->tasks.next, struct task_struct, tasks) #define for_each_process(p) \ for (p = &init_task ; (p = next_task(p)) != &init_task ; ) extern bool current_is_single_threaded(void); /* * Without tasklist/siglock it is only rcu-safe if g can't exit/exec, * otherwise next_thread(t) will never reach g after list_del_rcu(g). */ #define while_each_thread(g, t) \ while ((t = next_thread(t)) != g) #define for_other_threads(p, t) \ for (t = p; (t = next_thread(t)) != p; ) #define __for_each_thread(signal, t) \ list_for_each_entry_rcu(t, &(signal)->thread_head, thread_node, \ lockdep_is_held(&tasklist_lock)) #define for_each_thread(p, t) \ __for_each_thread((p)->signal, t) /* Careful: this is a double loop, 'break' won't work as expected. */ #define for_each_process_thread(p, t) \ for_each_process(p) for_each_thread(p, t) typedef int (*proc_visitor)(struct task_struct *p, void *data); void walk_process_tree(struct task_struct *top, proc_visitor, void *); static inline struct pid *task_pid_type(struct task_struct *task, enum pid_type type) { struct pid *pid; if (type == PIDTYPE_PID) pid = task_pid(task); else pid = task->signal->pids[type]; return pid; } static inline struct pid *task_tgid(struct task_struct *task) { return task->signal->pids[PIDTYPE_TGID]; } /* * Without tasklist or RCU lock it is not safe to dereference * the result of task_pgrp/task_session even if task == current, * we can race with another thread doing sys_setsid/sys_setpgid. 
/*
 * Without tasklist or RCU lock it is not safe to dereference
 * the result of task_pgrp/task_session even if task == current,
 * we can race with another thread doing sys_setsid/sys_setpgid.
 */
static inline struct pid *task_pgrp(struct task_struct *task)
{
	return task->signal->pids[PIDTYPE_PGID];
}

static inline struct pid *task_session(struct task_struct *task)
{
	return task->signal->pids[PIDTYPE_SID];
}

static inline int get_nr_threads(struct task_struct *task)
{
	return task->signal->nr_threads;
}

static inline bool thread_group_leader(struct task_struct *p)
{
	return p->exit_signal >= 0;
}

static inline
bool same_thread_group(struct task_struct *p1, struct task_struct *p2)
{
	return p1->signal == p2->signal;
}

/*
 * returns NULL if p is the last thread in the thread group
 */
static inline struct task_struct *__next_thread(struct task_struct *p)
{
	return list_next_or_null_rcu(&p->signal->thread_head,
					&p->thread_node,
					struct task_struct,
					thread_node);
}

static inline struct task_struct *next_thread(struct task_struct *p)
{
	return __next_thread(p) ?: p->group_leader;
}

static inline int thread_group_empty(struct task_struct *p)
{
	return thread_group_leader(p) &&
	       list_is_last(&p->thread_node, &p->signal->thread_head);
}

#define delay_group_leader(p) \
		(thread_group_leader(p) && !thread_group_empty(p))

extern struct sighand_struct *__lock_task_sighand(struct task_struct *task,
							unsigned long *flags);

static inline struct sighand_struct *lock_task_sighand(struct task_struct *task,
						       unsigned long *flags)
{
	struct sighand_struct *ret;

	ret = __lock_task_sighand(task, flags);
	(void)__cond_lock(&task->sighand->siglock, ret);
	return ret;
}

static inline void unlock_task_sighand(struct task_struct *task,
						unsigned long *flags)
{
	spin_unlock_irqrestore(&task->sighand->siglock, *flags);
}

#ifdef CONFIG_LOCKDEP
extern void lockdep_assert_task_sighand_held(struct task_struct *task);
#else
static inline void lockdep_assert_task_sighand_held(struct task_struct *task) { }
#endif

static inline unsigned long task_rlimit(const struct task_struct *task,
		unsigned int limit)
{
	return READ_ONCE(task->signal->rlim[limit].rlim_cur);
}

static inline unsigned long task_rlimit_max(const struct task_struct *task,
		unsigned int limit)
{
	return READ_ONCE(task->signal->rlim[limit].rlim_max);
}

static inline unsigned long rlimit(unsigned int limit)
{
	return task_rlimit(current, limit);
}

static inline unsigned long rlimit_max(unsigned int limit)
{
	return task_rlimit_max(current, limit);
}

#endif /* _LINUX_SCHED_SIGNAL_H */
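/*
 * Illustrative sketch (not part of this header): the usual
 * lock_task_sighand()/unlock_task_sighand() pairing.  __lock_task_sighand()
 * returns NULL once the task has released its sighand (it is already dead),
 * so the result must be checked before touching any signal state.
 * read_some_signal_state() is a hypothetical placeholder.
 */
#if 0	/* example only, not compiled */
static int example_with_sighand_locked(struct task_struct *task)
{
	struct sighand_struct *sighand;
	unsigned long flags;
	int ret = -ESRCH;

	sighand = lock_task_sighand(task, &flags);
	if (sighand) {
		ret = read_some_signal_state(task);
		unlock_task_sighand(task, &flags);
	}
	return ret;
}
#endif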
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Linux Socket Filter - Kernel level socket filtering
 *
 * Based on the design of the Berkeley Packet Filter. The new
 * internal format has been designed by PLUMgrid:
 *
 *	Copyright (c) 2011 - 2014 PLUMgrid, http://plumgrid.com
 *
 * Authors:
 *
 *	Jay Schulist <jschlst@samba.org>
 *	Alexei Starovoitov <ast@plumgrid.com>
 *	Daniel Borkmann <dborkman@redhat.com>
 *
 * Andi Kleen - Fix a few bad bugs and races.
 * Kris Katterjohn - Added many additional checks in bpf_check_classic()
 */

#include <linux/atomic.h>
#include <linux/bpf_verifier.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/mm.h>
#include <linux/fcntl.h>
#include <linux/socket.h>
#include <linux/sock_diag.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/if_packet.h>
#include <linux/if_arp.h>
#include <linux/gfp.h>
#include <net/inet_common.h>
#include <net/ip.h>
#include <net/protocol.h>
#include <net/netlink.h>
#include <linux/skbuff.h>
#include <linux/skmsg.h>
#include <net/sock.h>
#include <net/flow_dissector.h>
#include <linux/errno.h>
#include <linux/timer.h>
#include <linux/uaccess.h>
#include <linux/unaligned.h>
#include <linux/filter.h>
#include <linux/ratelimit.h>
#include <linux/seccomp.h>
#include <linux/if_vlan.h>
#include <linux/bpf.h>
#include <linux/btf.h>
#include <net/sch_generic.h>
#include <net/cls_cgroup.h>
#include <net/dst_metadata.h>
#include <net/dst.h>
#include <net/sock_reuseport.h>
#include <net/busy_poll.h>
#include <net/tcp.h>
#include <net/xfrm.h>
#include <net/udp.h>
#include <linux/bpf_trace.h>
#include <net/xdp_sock.h>
#include <linux/inetdevice.h>
#include <net/inet_hashtables.h>
#include <net/inet6_hashtables.h>
#include <net/ip_fib.h>
#include <net/nexthop.h>
#include <net/flow.h>
#include <net/arp.h>
#include <net/ipv6.h>
#include <net/net_namespace.h>
#include <linux/seg6_local.h>
#include <net/seg6.h>
#include <net/seg6_local.h>
#include <net/lwtunnel.h>
#include <net/ipv6_stubs.h>
#include <net/bpf_sk_storage.h>
#include <net/transp_v6.h>
#include <linux/btf_ids.h>
#include <net/tls.h>
#include <net/xdp.h>
#include <net/mptcp.h>
#include <net/netfilter/nf_conntrack_bpf.h>
#include <net/netkit.h>
#include <linux/un.h>
#include <net/xdp_sock_drv.h>
#include <net/inet_dscp.h>

#include "dev.h"

/* Keep the struct bpf_fib_lookup small so that it fits into a cacheline */
static_assert(sizeof(struct bpf_fib_lookup) == 64,
	      "struct bpf_fib_lookup size check");

static const struct bpf_func_proto *
bpf_sk_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog);

int copy_bpf_fprog_from_user(struct sock_fprog *dst, sockptr_t src, int len)
{
	if (in_compat_syscall()) {
		struct compat_sock_fprog f32;

		if (len != sizeof(f32))
			return -EINVAL;
		if (copy_from_sockptr(&f32, src, sizeof(f32)))
			return -EFAULT;
		memset(dst, 0, sizeof(*dst));
		dst->len = f32.len;
		dst->filter = compat_ptr(f32.filter);
	} else {
		if (len != sizeof(*dst))
			return -EINVAL;
		if (copy_from_sockptr(dst, src, sizeof(*dst)))
			return -EFAULT;
	}

	return 0;
}
EXPORT_SYMBOL_GPL(copy_bpf_fprog_from_user);

/**
 * sk_filter_trim_cap - run a packet through a socket filter
 * @sk: sock associated with &sk_buff
 * @skb: buffer to filter
 * @cap: limit on how short the eBPF program may trim the packet
 * @reason: record drop reason on errors (negative return value)
 *
 * Run the eBPF program and then cut skb->data to correct size returned by
 * the program. If pkt_len is 0 we toss packet. If skb->len is smaller
 * than pkt_len we keep whole skb->data. This is the socket level
 * wrapper to bpf_prog_run. It returns 0 if the packet should
 * be accepted or -EPERM if the packet should be tossed.
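 *
 * For reference, filters reaching this path are typically installed from
 * userspace with SO_ATTACH_FILTER. A minimal, illustrative userspace sketch
 * (not part of this file; 'fd' is an already-open datagram or packet socket)
 * that trims every accepted packet to 64 bytes:
 *
 *	struct sock_filter code[] = {
 *		{ BPF_RET | BPF_K, 0, 0, 64 },	// ret #64: accept, trim to 64
 *	};
 *	struct sock_fprog fprog = {
 *		.len	= 1,
 *		.filter	= code,
 *	};
 *	setsockopt(fd, SOL_SOCKET, SO_ATTACH_FILTER, &fprog, sizeof(fprog));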
* */ int sk_filter_trim_cap(struct sock *sk, struct sk_buff *skb, unsigned int cap, enum skb_drop_reason *reason) { int err; struct sk_filter *filter; /* * If the skb was allocated from pfmemalloc reserves, only * allow SOCK_MEMALLOC sockets to use it as this socket is * helping free memory */ if (skb_pfmemalloc(skb) && !sock_flag(sk, SOCK_MEMALLOC)) { NET_INC_STATS(sock_net(sk), LINUX_MIB_PFMEMALLOCDROP); *reason = SKB_DROP_REASON_PFMEMALLOC; return -ENOMEM; } err = BPF_CGROUP_RUN_PROG_INET_INGRESS(sk, skb); if (err) { *reason = SKB_DROP_REASON_SOCKET_FILTER; return err; } err = security_sock_rcv_skb(sk, skb); if (err) { *reason = SKB_DROP_REASON_SECURITY_HOOK; return err; } rcu_read_lock(); filter = rcu_dereference(sk->sk_filter); if (filter) { struct sock *save_sk = skb->sk; unsigned int pkt_len; skb->sk = sk; pkt_len = bpf_prog_run_save_cb(filter->prog, skb); skb->sk = save_sk; err = pkt_len ? pskb_trim(skb, max(cap, pkt_len)) : -EPERM; if (err) *reason = SKB_DROP_REASON_SOCKET_FILTER; } rcu_read_unlock(); return err; } EXPORT_SYMBOL(sk_filter_trim_cap); BPF_CALL_1(bpf_skb_get_pay_offset, struct sk_buff *, skb) { return skb_get_poff(skb); } BPF_CALL_3(bpf_skb_get_nlattr, struct sk_buff *, skb, u32, a, u32, x) { struct nlattr *nla; if (skb_is_nonlinear(skb)) return 0; if (skb->len < sizeof(struct nlattr)) return 0; if (a > skb->len - sizeof(struct nlattr)) return 0; nla = nla_find((struct nlattr *) &skb->data[a], skb->len - a, x); if (nla) return (void *) nla - (void *) skb->data; return 0; } BPF_CALL_3(bpf_skb_get_nlattr_nest, struct sk_buff *, skb, u32, a, u32, x) { struct nlattr *nla; if (skb_is_nonlinear(skb)) return 0; if (skb->len < sizeof(struct nlattr)) return 0; if (a > skb->len - sizeof(struct nlattr)) return 0; nla = (struct nlattr *) &skb->data[a]; if (!nla_ok(nla, skb->len - a)) return 0; nla = nla_find_nested(nla, x); if (nla) return (void *) nla - (void *) skb->data; return 0; } static int bpf_skb_load_helper_convert_offset(const struct sk_buff *skb, int offset) { if (likely(offset >= 0)) return offset; if (offset >= SKF_NET_OFF) return offset - SKF_NET_OFF + skb_network_offset(skb); if (offset >= SKF_LL_OFF && skb_mac_header_was_set(skb)) return offset - SKF_LL_OFF + skb_mac_offset(skb); return INT_MIN; } BPF_CALL_4(bpf_skb_load_helper_8, const struct sk_buff *, skb, const void *, data, int, headlen, int, offset) { u8 tmp; const int len = sizeof(tmp); offset = bpf_skb_load_helper_convert_offset(skb, offset); if (offset == INT_MIN) return -EFAULT; if (headlen - offset >= len) return *(u8 *)(data + offset); if (!skb_copy_bits(skb, offset, &tmp, sizeof(tmp))) return tmp; else return -EFAULT; } BPF_CALL_2(bpf_skb_load_helper_8_no_cache, const struct sk_buff *, skb, int, offset) { return ____bpf_skb_load_helper_8(skb, skb->data, skb->len - skb->data_len, offset); } BPF_CALL_4(bpf_skb_load_helper_16, const struct sk_buff *, skb, const void *, data, int, headlen, int, offset) { __be16 tmp; const int len = sizeof(tmp); offset = bpf_skb_load_helper_convert_offset(skb, offset); if (offset == INT_MIN) return -EFAULT; if (headlen - offset >= len) return get_unaligned_be16(data + offset); if (!skb_copy_bits(skb, offset, &tmp, sizeof(tmp))) return be16_to_cpu(tmp); else return -EFAULT; } BPF_CALL_2(bpf_skb_load_helper_16_no_cache, const struct sk_buff *, skb, int, offset) { return ____bpf_skb_load_helper_16(skb, skb->data, skb->len - skb->data_len, offset); } BPF_CALL_4(bpf_skb_load_helper_32, const struct sk_buff *, skb, const void *, data, int, headlen, int, offset) { __be32 tmp; 
const int len = sizeof(tmp); offset = bpf_skb_load_helper_convert_offset(skb, offset); if (offset == INT_MIN) return -EFAULT; if (headlen - offset >= len) return get_unaligned_be32(data + offset); if (!skb_copy_bits(skb, offset, &tmp, sizeof(tmp))) return be32_to_cpu(tmp); else return -EFAULT; } BPF_CALL_2(bpf_skb_load_helper_32_no_cache, const struct sk_buff *, skb, int, offset) { return ____bpf_skb_load_helper_32(skb, skb->data, skb->len - skb->data_len, offset); } static u32 convert_skb_access(int skb_field, int dst_reg, int src_reg, struct bpf_insn *insn_buf) { struct bpf_insn *insn = insn_buf; switch (skb_field) { case SKF_AD_MARK: BUILD_BUG_ON(sizeof_field(struct sk_buff, mark) != 4); *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg, offsetof(struct sk_buff, mark)); break; case SKF_AD_PKTTYPE: *insn++ = BPF_LDX_MEM(BPF_B, dst_reg, src_reg, PKT_TYPE_OFFSET); *insn++ = BPF_ALU32_IMM(BPF_AND, dst_reg, PKT_TYPE_MAX); #ifdef __BIG_ENDIAN_BITFIELD *insn++ = BPF_ALU32_IMM(BPF_RSH, dst_reg, 5); #endif break; case SKF_AD_QUEUE: BUILD_BUG_ON(sizeof_field(struct sk_buff, queue_mapping) != 2); *insn++ = BPF_LDX_MEM(BPF_H, dst_reg, src_reg, offsetof(struct sk_buff, queue_mapping)); break; case SKF_AD_VLAN_TAG: BUILD_BUG_ON(sizeof_field(struct sk_buff, vlan_tci) != 2); /* dst_reg = *(u16 *) (src_reg + offsetof(vlan_tci)) */ *insn++ = BPF_LDX_MEM(BPF_H, dst_reg, src_reg, offsetof(struct sk_buff, vlan_tci)); break; case SKF_AD_VLAN_TAG_PRESENT: BUILD_BUG_ON(sizeof_field(struct sk_buff, vlan_all) != 4); *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg, offsetof(struct sk_buff, vlan_all)); *insn++ = BPF_JMP_IMM(BPF_JEQ, dst_reg, 0, 1); *insn++ = BPF_ALU32_IMM(BPF_MOV, dst_reg, 1); break; } return insn - insn_buf; } static bool convert_bpf_extensions(struct sock_filter *fp, struct bpf_insn **insnp) { struct bpf_insn *insn = *insnp; u32 cnt; switch (fp->k) { case SKF_AD_OFF + SKF_AD_PROTOCOL: BUILD_BUG_ON(sizeof_field(struct sk_buff, protocol) != 2); /* A = *(u16 *) (CTX + offsetof(protocol)) */ *insn++ = BPF_LDX_MEM(BPF_H, BPF_REG_A, BPF_REG_CTX, offsetof(struct sk_buff, protocol)); /* A = ntohs(A) [emitting a nop or swap16] */ *insn = BPF_ENDIAN(BPF_FROM_BE, BPF_REG_A, 16); break; case SKF_AD_OFF + SKF_AD_PKTTYPE: cnt = convert_skb_access(SKF_AD_PKTTYPE, BPF_REG_A, BPF_REG_CTX, insn); insn += cnt - 1; break; case SKF_AD_OFF + SKF_AD_IFINDEX: case SKF_AD_OFF + SKF_AD_HATYPE: BUILD_BUG_ON(sizeof_field(struct net_device, ifindex) != 4); BUILD_BUG_ON(sizeof_field(struct net_device, type) != 2); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, dev), BPF_REG_TMP, BPF_REG_CTX, offsetof(struct sk_buff, dev)); /* if (tmp != 0) goto pc + 1 */ *insn++ = BPF_JMP_IMM(BPF_JNE, BPF_REG_TMP, 0, 1); *insn++ = BPF_EXIT_INSN(); if (fp->k == SKF_AD_OFF + SKF_AD_IFINDEX) *insn = BPF_LDX_MEM(BPF_W, BPF_REG_A, BPF_REG_TMP, offsetof(struct net_device, ifindex)); else *insn = BPF_LDX_MEM(BPF_H, BPF_REG_A, BPF_REG_TMP, offsetof(struct net_device, type)); break; case SKF_AD_OFF + SKF_AD_MARK: cnt = convert_skb_access(SKF_AD_MARK, BPF_REG_A, BPF_REG_CTX, insn); insn += cnt - 1; break; case SKF_AD_OFF + SKF_AD_RXHASH: BUILD_BUG_ON(sizeof_field(struct sk_buff, hash) != 4); *insn = BPF_LDX_MEM(BPF_W, BPF_REG_A, BPF_REG_CTX, offsetof(struct sk_buff, hash)); break; case SKF_AD_OFF + SKF_AD_QUEUE: cnt = convert_skb_access(SKF_AD_QUEUE, BPF_REG_A, BPF_REG_CTX, insn); insn += cnt - 1; break; case SKF_AD_OFF + SKF_AD_VLAN_TAG: cnt = convert_skb_access(SKF_AD_VLAN_TAG, BPF_REG_A, BPF_REG_CTX, insn); insn += cnt - 1; break; case SKF_AD_OFF 
+ SKF_AD_VLAN_TAG_PRESENT: cnt = convert_skb_access(SKF_AD_VLAN_TAG_PRESENT, BPF_REG_A, BPF_REG_CTX, insn); insn += cnt - 1; break; case SKF_AD_OFF + SKF_AD_VLAN_TPID: BUILD_BUG_ON(sizeof_field(struct sk_buff, vlan_proto) != 2); /* A = *(u16 *) (CTX + offsetof(vlan_proto)) */ *insn++ = BPF_LDX_MEM(BPF_H, BPF_REG_A, BPF_REG_CTX, offsetof(struct sk_buff, vlan_proto)); /* A = ntohs(A) [emitting a nop or swap16] */ *insn = BPF_ENDIAN(BPF_FROM_BE, BPF_REG_A, 16); break; case SKF_AD_OFF + SKF_AD_PAY_OFFSET: case SKF_AD_OFF + SKF_AD_NLATTR: case SKF_AD_OFF + SKF_AD_NLATTR_NEST: case SKF_AD_OFF + SKF_AD_CPU: case SKF_AD_OFF + SKF_AD_RANDOM: /* arg1 = CTX */ *insn++ = BPF_MOV64_REG(BPF_REG_ARG1, BPF_REG_CTX); /* arg2 = A */ *insn++ = BPF_MOV64_REG(BPF_REG_ARG2, BPF_REG_A); /* arg3 = X */ *insn++ = BPF_MOV64_REG(BPF_REG_ARG3, BPF_REG_X); /* Emit call(arg1=CTX, arg2=A, arg3=X) */ switch (fp->k) { case SKF_AD_OFF + SKF_AD_PAY_OFFSET: *insn = BPF_EMIT_CALL(bpf_skb_get_pay_offset); break; case SKF_AD_OFF + SKF_AD_NLATTR: *insn = BPF_EMIT_CALL(bpf_skb_get_nlattr); break; case SKF_AD_OFF + SKF_AD_NLATTR_NEST: *insn = BPF_EMIT_CALL(bpf_skb_get_nlattr_nest); break; case SKF_AD_OFF + SKF_AD_CPU: *insn = BPF_EMIT_CALL(bpf_get_raw_cpu_id); break; case SKF_AD_OFF + SKF_AD_RANDOM: *insn = BPF_EMIT_CALL(bpf_user_rnd_u32); bpf_user_rnd_init_once(); break; } break; case SKF_AD_OFF + SKF_AD_ALU_XOR_X: /* A ^= X */ *insn = BPF_ALU32_REG(BPF_XOR, BPF_REG_A, BPF_REG_X); break; default: /* This is just a dummy call to avoid letting the compiler * evict __bpf_call_base() as an optimization. Placed here * where no-one bothers. */ BUG_ON(__bpf_call_base(0, 0, 0, 0, 0) != 0); return false; } *insnp = insn; return true; } static bool convert_bpf_ld_abs(struct sock_filter *fp, struct bpf_insn **insnp) { const bool unaligned_ok = IS_BUILTIN(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS); int size = bpf_size_to_bytes(BPF_SIZE(fp->code)); bool endian = BPF_SIZE(fp->code) == BPF_H || BPF_SIZE(fp->code) == BPF_W; bool indirect = BPF_MODE(fp->code) == BPF_IND; const int ip_align = NET_IP_ALIGN; struct bpf_insn *insn = *insnp; int offset = fp->k; if (!indirect && ((unaligned_ok && offset >= 0) || (!unaligned_ok && offset >= 0 && offset + ip_align >= 0 && offset + ip_align % size == 0))) { bool ldx_off_ok = offset <= S16_MAX; *insn++ = BPF_MOV64_REG(BPF_REG_TMP, BPF_REG_H); if (offset) *insn++ = BPF_ALU64_IMM(BPF_SUB, BPF_REG_TMP, offset); *insn++ = BPF_JMP_IMM(BPF_JSLT, BPF_REG_TMP, size, 2 + endian + (!ldx_off_ok * 2)); if (ldx_off_ok) { *insn++ = BPF_LDX_MEM(BPF_SIZE(fp->code), BPF_REG_A, BPF_REG_D, offset); } else { *insn++ = BPF_MOV64_REG(BPF_REG_TMP, BPF_REG_D); *insn++ = BPF_ALU64_IMM(BPF_ADD, BPF_REG_TMP, offset); *insn++ = BPF_LDX_MEM(BPF_SIZE(fp->code), BPF_REG_A, BPF_REG_TMP, 0); } if (endian) *insn++ = BPF_ENDIAN(BPF_FROM_BE, BPF_REG_A, size * 8); *insn++ = BPF_JMP_A(8); } *insn++ = BPF_MOV64_REG(BPF_REG_ARG1, BPF_REG_CTX); *insn++ = BPF_MOV64_REG(BPF_REG_ARG2, BPF_REG_D); *insn++ = BPF_MOV64_REG(BPF_REG_ARG3, BPF_REG_H); if (!indirect) { *insn++ = BPF_MOV64_IMM(BPF_REG_ARG4, offset); } else { *insn++ = BPF_MOV64_REG(BPF_REG_ARG4, BPF_REG_X); if (fp->k) *insn++ = BPF_ALU64_IMM(BPF_ADD, BPF_REG_ARG4, offset); } switch (BPF_SIZE(fp->code)) { case BPF_B: *insn++ = BPF_EMIT_CALL(bpf_skb_load_helper_8); break; case BPF_H: *insn++ = BPF_EMIT_CALL(bpf_skb_load_helper_16); break; case BPF_W: *insn++ = BPF_EMIT_CALL(bpf_skb_load_helper_32); break; default: return false; } *insn++ = BPF_JMP_IMM(BPF_JSGE, BPF_REG_A, 0, 2); *insn++ = 
BPF_ALU32_REG(BPF_XOR, BPF_REG_A, BPF_REG_A); *insn = BPF_EXIT_INSN(); *insnp = insn; return true; } /** * bpf_convert_filter - convert filter program * @prog: the user passed filter program * @len: the length of the user passed filter program * @new_prog: allocated 'struct bpf_prog' or NULL * @new_len: pointer to store length of converted program * @seen_ld_abs: bool whether we've seen ld_abs/ind * * Remap 'sock_filter' style classic BPF (cBPF) instruction set to 'bpf_insn' * style extended BPF (eBPF). * Conversion workflow: * * 1) First pass for calculating the new program length: * bpf_convert_filter(old_prog, old_len, NULL, &new_len, &seen_ld_abs) * * 2) 2nd pass to remap in two passes: 1st pass finds new * jump offsets, 2nd pass remapping: * bpf_convert_filter(old_prog, old_len, new_prog, &new_len, &seen_ld_abs) */ static int bpf_convert_filter(struct sock_filter *prog, int len, struct bpf_prog *new_prog, int *new_len, bool *seen_ld_abs) { int new_flen = 0, pass = 0, target, i, stack_off; struct bpf_insn *new_insn, *first_insn = NULL; struct sock_filter *fp; int *addrs = NULL; u8 bpf_src; BUILD_BUG_ON(BPF_MEMWORDS * sizeof(u32) > MAX_BPF_STACK); BUILD_BUG_ON(BPF_REG_FP + 1 != MAX_BPF_REG); if (len <= 0 || len > BPF_MAXINSNS) return -EINVAL; if (new_prog) { first_insn = new_prog->insnsi; addrs = kcalloc(len, sizeof(*addrs), GFP_KERNEL | __GFP_NOWARN); if (!addrs) return -ENOMEM; } do_pass: new_insn = first_insn; fp = prog; /* Classic BPF related prologue emission. */ if (new_prog) { /* Classic BPF expects A and X to be reset first. These need * to be guaranteed to be the first two instructions. */ *new_insn++ = BPF_ALU32_REG(BPF_XOR, BPF_REG_A, BPF_REG_A); *new_insn++ = BPF_ALU32_REG(BPF_XOR, BPF_REG_X, BPF_REG_X); /* All programs must keep CTX in callee saved BPF_REG_CTX. * In eBPF case it's done by the compiler, here we need to * do this ourself. Initial CTX is present in BPF_REG_ARG1. */ *new_insn++ = BPF_MOV64_REG(BPF_REG_CTX, BPF_REG_ARG1); if (*seen_ld_abs) { /* For packet access in classic BPF, cache skb->data * in callee-saved BPF R8 and skb->len - skb->data_len * (headlen) in BPF R9. Since classic BPF is read-only * on CTX, we only need to cache it once. */ *new_insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, data), BPF_REG_D, BPF_REG_CTX, offsetof(struct sk_buff, data)); *new_insn++ = BPF_LDX_MEM(BPF_W, BPF_REG_H, BPF_REG_CTX, offsetof(struct sk_buff, len)); *new_insn++ = BPF_LDX_MEM(BPF_W, BPF_REG_TMP, BPF_REG_CTX, offsetof(struct sk_buff, data_len)); *new_insn++ = BPF_ALU32_REG(BPF_SUB, BPF_REG_H, BPF_REG_TMP); } } else { new_insn += 3; } for (i = 0; i < len; fp++, i++) { struct bpf_insn tmp_insns[32] = { }; struct bpf_insn *insn = tmp_insns; if (addrs) addrs[i] = new_insn - first_insn; switch (fp->code) { /* All arithmetic insns and skb loads map as-is. 
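		 * For example, the classic insn "add #4" (BPF_ALU | BPF_ADD |
		 * BPF_K, k = 4) is re-emitted below with the same opcode via
		 * BPF_RAW_INSN(), with the classic A and X registers mapped to
		 * BPF_REG_A and BPF_REG_X and k carried over as the immediate.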
*/ case BPF_ALU | BPF_ADD | BPF_X: case BPF_ALU | BPF_ADD | BPF_K: case BPF_ALU | BPF_SUB | BPF_X: case BPF_ALU | BPF_SUB | BPF_K: case BPF_ALU | BPF_AND | BPF_X: case BPF_ALU | BPF_AND | BPF_K: case BPF_ALU | BPF_OR | BPF_X: case BPF_ALU | BPF_OR | BPF_K: case BPF_ALU | BPF_LSH | BPF_X: case BPF_ALU | BPF_LSH | BPF_K: case BPF_ALU | BPF_RSH | BPF_X: case BPF_ALU | BPF_RSH | BPF_K: case BPF_ALU | BPF_XOR | BPF_X: case BPF_ALU | BPF_XOR | BPF_K: case BPF_ALU | BPF_MUL | BPF_X: case BPF_ALU | BPF_MUL | BPF_K: case BPF_ALU | BPF_DIV | BPF_X: case BPF_ALU | BPF_DIV | BPF_K: case BPF_ALU | BPF_MOD | BPF_X: case BPF_ALU | BPF_MOD | BPF_K: case BPF_ALU | BPF_NEG: case BPF_LD | BPF_ABS | BPF_W: case BPF_LD | BPF_ABS | BPF_H: case BPF_LD | BPF_ABS | BPF_B: case BPF_LD | BPF_IND | BPF_W: case BPF_LD | BPF_IND | BPF_H: case BPF_LD | BPF_IND | BPF_B: /* Check for overloaded BPF extension and * directly convert it if found, otherwise * just move on with mapping. */ if (BPF_CLASS(fp->code) == BPF_LD && BPF_MODE(fp->code) == BPF_ABS && convert_bpf_extensions(fp, &insn)) break; if (BPF_CLASS(fp->code) == BPF_LD && convert_bpf_ld_abs(fp, &insn)) { *seen_ld_abs = true; break; } if (fp->code == (BPF_ALU | BPF_DIV | BPF_X) || fp->code == (BPF_ALU | BPF_MOD | BPF_X)) { *insn++ = BPF_MOV32_REG(BPF_REG_X, BPF_REG_X); /* Error with exception code on div/mod by 0. * For cBPF programs, this was always return 0. */ *insn++ = BPF_JMP_IMM(BPF_JNE, BPF_REG_X, 0, 2); *insn++ = BPF_ALU32_REG(BPF_XOR, BPF_REG_A, BPF_REG_A); *insn++ = BPF_EXIT_INSN(); } *insn = BPF_RAW_INSN(fp->code, BPF_REG_A, BPF_REG_X, 0, fp->k); break; /* Jump transformation cannot use BPF block macros * everywhere as offset calculation and target updates * require a bit more work than the rest, i.e. jump * opcodes map as-is, but offsets need adjustment. */ #define BPF_EMIT_JMP \ do { \ const s32 off_min = S16_MIN, off_max = S16_MAX; \ s32 off; \ \ if (target >= len || target < 0) \ goto err; \ off = addrs ? addrs[target] - addrs[i] - 1 : 0; \ /* Adjust pc relative offset for 2nd or 3rd insn. */ \ off -= insn - tmp_insns; \ /* Reject anything not fitting into insn->off. */ \ if (off < off_min || off > off_max) \ goto err; \ insn->off = off; \ } while (0) case BPF_JMP | BPF_JA: target = i + fp->k + 1; insn->code = fp->code; BPF_EMIT_JMP; break; case BPF_JMP | BPF_JEQ | BPF_K: case BPF_JMP | BPF_JEQ | BPF_X: case BPF_JMP | BPF_JSET | BPF_K: case BPF_JMP | BPF_JSET | BPF_X: case BPF_JMP | BPF_JGT | BPF_K: case BPF_JMP | BPF_JGT | BPF_X: case BPF_JMP | BPF_JGE | BPF_K: case BPF_JMP | BPF_JGE | BPF_X: if (BPF_SRC(fp->code) == BPF_K && (int) fp->k < 0) { /* BPF immediates are signed, zero extend * immediate into tmp register and use it * in compare insn. */ *insn++ = BPF_MOV32_IMM(BPF_REG_TMP, fp->k); insn->dst_reg = BPF_REG_A; insn->src_reg = BPF_REG_TMP; bpf_src = BPF_X; } else { insn->dst_reg = BPF_REG_A; insn->imm = fp->k; bpf_src = BPF_SRC(fp->code); insn->src_reg = bpf_src == BPF_X ? BPF_REG_X : 0; } /* Common case where 'jump_false' is next insn. */ if (fp->jf == 0) { insn->code = BPF_JMP | BPF_OP(fp->code) | bpf_src; target = i + fp->jt + 1; BPF_EMIT_JMP; break; } /* Convert some jumps when 'jump_true' is next insn. 
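		 * For example, "jeq #0x800, jt=0, jf=3" falls through on a
		 * match, so it can be emitted as a single JNE to the jf
		 * target by inverting the condition, instead of a JEQ plus
		 * an unconditional JA.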
*/ if (fp->jt == 0) { switch (BPF_OP(fp->code)) { case BPF_JEQ: insn->code = BPF_JMP | BPF_JNE | bpf_src; break; case BPF_JGT: insn->code = BPF_JMP | BPF_JLE | bpf_src; break; case BPF_JGE: insn->code = BPF_JMP | BPF_JLT | bpf_src; break; default: goto jmp_rest; } target = i + fp->jf + 1; BPF_EMIT_JMP; break; } jmp_rest: /* Other jumps are mapped into two insns: Jxx and JA. */ target = i + fp->jt + 1; insn->code = BPF_JMP | BPF_OP(fp->code) | bpf_src; BPF_EMIT_JMP; insn++; insn->code = BPF_JMP | BPF_JA; target = i + fp->jf + 1; BPF_EMIT_JMP; break; /* ldxb 4 * ([14] & 0xf) is remapped into 6 insns. */ case BPF_LDX | BPF_MSH | BPF_B: { struct sock_filter tmp = { .code = BPF_LD | BPF_ABS | BPF_B, .k = fp->k, }; *seen_ld_abs = true; /* X = A */ *insn++ = BPF_MOV64_REG(BPF_REG_X, BPF_REG_A); /* A = BPF_R0 = *(u8 *) (skb->data + K) */ convert_bpf_ld_abs(&tmp, &insn); insn++; /* A &= 0xf */ *insn++ = BPF_ALU32_IMM(BPF_AND, BPF_REG_A, 0xf); /* A <<= 2 */ *insn++ = BPF_ALU32_IMM(BPF_LSH, BPF_REG_A, 2); /* tmp = X */ *insn++ = BPF_MOV64_REG(BPF_REG_TMP, BPF_REG_X); /* X = A */ *insn++ = BPF_MOV64_REG(BPF_REG_X, BPF_REG_A); /* A = tmp */ *insn = BPF_MOV64_REG(BPF_REG_A, BPF_REG_TMP); break; } /* RET_K is remapped into 2 insns. RET_A case doesn't need an * extra mov as BPF_REG_0 is already mapped into BPF_REG_A. */ case BPF_RET | BPF_A: case BPF_RET | BPF_K: if (BPF_RVAL(fp->code) == BPF_K) *insn++ = BPF_MOV32_RAW(BPF_K, BPF_REG_0, 0, fp->k); *insn = BPF_EXIT_INSN(); break; /* Store to stack. */ case BPF_ST: case BPF_STX: stack_off = fp->k * 4 + 4; *insn = BPF_STX_MEM(BPF_W, BPF_REG_FP, BPF_CLASS(fp->code) == BPF_ST ? BPF_REG_A : BPF_REG_X, -stack_off); /* check_load_and_stores() verifies that classic BPF can * load from stack only after write, so tracking * stack_depth for ST|STX insns is enough */ if (new_prog && new_prog->aux->stack_depth < stack_off) new_prog->aux->stack_depth = stack_off; break; /* Load from stack. */ case BPF_LD | BPF_MEM: case BPF_LDX | BPF_MEM: stack_off = fp->k * 4 + 4; *insn = BPF_LDX_MEM(BPF_W, BPF_CLASS(fp->code) == BPF_LD ? BPF_REG_A : BPF_REG_X, BPF_REG_FP, -stack_off); break; /* A = K or X = K */ case BPF_LD | BPF_IMM: case BPF_LDX | BPF_IMM: *insn = BPF_MOV32_IMM(BPF_CLASS(fp->code) == BPF_LD ? BPF_REG_A : BPF_REG_X, fp->k); break; /* X = A */ case BPF_MISC | BPF_TAX: *insn = BPF_MOV64_REG(BPF_REG_X, BPF_REG_A); break; /* A = X */ case BPF_MISC | BPF_TXA: *insn = BPF_MOV64_REG(BPF_REG_A, BPF_REG_X); break; /* A = skb->len or X = skb->len */ case BPF_LD | BPF_W | BPF_LEN: case BPF_LDX | BPF_W | BPF_LEN: *insn = BPF_LDX_MEM(BPF_W, BPF_CLASS(fp->code) == BPF_LD ? BPF_REG_A : BPF_REG_X, BPF_REG_CTX, offsetof(struct sk_buff, len)); break; /* Access seccomp_data fields. */ case BPF_LDX | BPF_ABS | BPF_W: /* A = *(u32 *) (ctx + K) */ *insn = BPF_LDX_MEM(BPF_W, BPF_REG_A, BPF_REG_CTX, fp->k); break; /* Unknown instruction. */ default: goto err; } insn++; if (new_prog) memcpy(new_insn, tmp_insns, sizeof(*insn) * (insn - tmp_insns)); new_insn += insn - tmp_insns; } if (!new_prog) { /* Only calculating new length. */ *new_len = new_insn - first_insn; if (*seen_ld_abs) *new_len += 4; /* Prologue bits. 
*/ return 0; } pass++; if (new_flen != new_insn - first_insn) { new_flen = new_insn - first_insn; if (pass > 2) goto err; goto do_pass; } kfree(addrs); BUG_ON(*new_len != new_flen); return 0; err: kfree(addrs); return -EINVAL; } /* Security: * * As we dont want to clear mem[] array for each packet going through * __bpf_prog_run(), we check that filter loaded by user never try to read * a cell if not previously written, and we check all branches to be sure * a malicious user doesn't try to abuse us. */ static int check_load_and_stores(const struct sock_filter *filter, int flen) { u16 *masks, memvalid = 0; /* One bit per cell, 16 cells */ int pc, ret = 0; BUILD_BUG_ON(BPF_MEMWORDS > 16); masks = kmalloc_array(flen, sizeof(*masks), GFP_KERNEL); if (!masks) return -ENOMEM; memset(masks, 0xff, flen * sizeof(*masks)); for (pc = 0; pc < flen; pc++) { memvalid &= masks[pc]; switch (filter[pc].code) { case BPF_ST: case BPF_STX: memvalid |= (1 << filter[pc].k); break; case BPF_LD | BPF_MEM: case BPF_LDX | BPF_MEM: if (!(memvalid & (1 << filter[pc].k))) { ret = -EINVAL; goto error; } break; case BPF_JMP | BPF_JA: /* A jump must set masks on target */ masks[pc + 1 + filter[pc].k] &= memvalid; memvalid = ~0; break; case BPF_JMP | BPF_JEQ | BPF_K: case BPF_JMP | BPF_JEQ | BPF_X: case BPF_JMP | BPF_JGE | BPF_K: case BPF_JMP | BPF_JGE | BPF_X: case BPF_JMP | BPF_JGT | BPF_K: case BPF_JMP | BPF_JGT | BPF_X: case BPF_JMP | BPF_JSET | BPF_K: case BPF_JMP | BPF_JSET | BPF_X: /* A jump must set masks on targets */ masks[pc + 1 + filter[pc].jt] &= memvalid; masks[pc + 1 + filter[pc].jf] &= memvalid; memvalid = ~0; break; } } error: kfree(masks); return ret; } static bool chk_code_allowed(u16 code_to_probe) { static const bool codes[] = { /* 32 bit ALU operations */ [BPF_ALU | BPF_ADD | BPF_K] = true, [BPF_ALU | BPF_ADD | BPF_X] = true, [BPF_ALU | BPF_SUB | BPF_K] = true, [BPF_ALU | BPF_SUB | BPF_X] = true, [BPF_ALU | BPF_MUL | BPF_K] = true, [BPF_ALU | BPF_MUL | BPF_X] = true, [BPF_ALU | BPF_DIV | BPF_K] = true, [BPF_ALU | BPF_DIV | BPF_X] = true, [BPF_ALU | BPF_MOD | BPF_K] = true, [BPF_ALU | BPF_MOD | BPF_X] = true, [BPF_ALU | BPF_AND | BPF_K] = true, [BPF_ALU | BPF_AND | BPF_X] = true, [BPF_ALU | BPF_OR | BPF_K] = true, [BPF_ALU | BPF_OR | BPF_X] = true, [BPF_ALU | BPF_XOR | BPF_K] = true, [BPF_ALU | BPF_XOR | BPF_X] = true, [BPF_ALU | BPF_LSH | BPF_K] = true, [BPF_ALU | BPF_LSH | BPF_X] = true, [BPF_ALU | BPF_RSH | BPF_K] = true, [BPF_ALU | BPF_RSH | BPF_X] = true, [BPF_ALU | BPF_NEG] = true, /* Load instructions */ [BPF_LD | BPF_W | BPF_ABS] = true, [BPF_LD | BPF_H | BPF_ABS] = true, [BPF_LD | BPF_B | BPF_ABS] = true, [BPF_LD | BPF_W | BPF_LEN] = true, [BPF_LD | BPF_W | BPF_IND] = true, [BPF_LD | BPF_H | BPF_IND] = true, [BPF_LD | BPF_B | BPF_IND] = true, [BPF_LD | BPF_IMM] = true, [BPF_LD | BPF_MEM] = true, [BPF_LDX | BPF_W | BPF_LEN] = true, [BPF_LDX | BPF_B | BPF_MSH] = true, [BPF_LDX | BPF_IMM] = true, [BPF_LDX | BPF_MEM] = true, /* Store instructions */ [BPF_ST] = true, [BPF_STX] = true, /* Misc instructions */ [BPF_MISC | BPF_TAX] = true, [BPF_MISC | BPF_TXA] = true, /* Return instructions */ [BPF_RET | BPF_K] = true, [BPF_RET | BPF_A] = true, /* Jump instructions */ [BPF_JMP | BPF_JA] = true, [BPF_JMP | BPF_JEQ | BPF_K] = true, [BPF_JMP | BPF_JEQ | BPF_X] = true, [BPF_JMP | BPF_JGE | BPF_K] = true, [BPF_JMP | BPF_JGE | BPF_X] = true, [BPF_JMP | BPF_JGT | BPF_K] = true, [BPF_JMP | BPF_JGT | BPF_X] = true, [BPF_JMP | BPF_JSET | BPF_K] = true, [BPF_JMP | BPF_JSET | BPF_X] = true, }; if (code_to_probe 
>= ARRAY_SIZE(codes)) return false; return codes[code_to_probe]; } static bool bpf_check_basics_ok(const struct sock_filter *filter, unsigned int flen) { if (filter == NULL) return false; if (flen == 0 || flen > BPF_MAXINSNS) return false; return true; } /** * bpf_check_classic - verify socket filter code * @filter: filter to verify * @flen: length of filter * * Check the user's filter code. If we let some ugly * filter code slip through kaboom! The filter must contain * no references or jumps that are out of range, no illegal * instructions, and must end with a RET instruction. * * All jumps are forward as they are not signed. * * Returns 0 if the rule set is legal or -EINVAL if not. */ static int bpf_check_classic(const struct sock_filter *filter, unsigned int flen) { bool anc_found; int pc; /* Check the filter code now */ for (pc = 0; pc < flen; pc++) { const struct sock_filter *ftest = &filter[pc]; /* May we actually operate on this code? */ if (!chk_code_allowed(ftest->code)) return -EINVAL; /* Some instructions need special checks */ switch (ftest->code) { case BPF_ALU | BPF_DIV | BPF_K: case BPF_ALU | BPF_MOD | BPF_K: /* Check for division by zero */ if (ftest->k == 0) return -EINVAL; break; case BPF_ALU | BPF_LSH | BPF_K: case BPF_ALU | BPF_RSH | BPF_K: if (ftest->k >= 32) return -EINVAL; break; case BPF_LD | BPF_MEM: case BPF_LDX | BPF_MEM: case BPF_ST: case BPF_STX: /* Check for invalid memory addresses */ if (ftest->k >= BPF_MEMWORDS) return -EINVAL; break; case BPF_JMP | BPF_JA: /* Note, the large ftest->k might cause loops. * Compare this with conditional jumps below, * where offsets are limited. --ANK (981016) */ if (ftest->k >= (unsigned int)(flen - pc - 1)) return -EINVAL; break; case BPF_JMP | BPF_JEQ | BPF_K: case BPF_JMP | BPF_JEQ | BPF_X: case BPF_JMP | BPF_JGE | BPF_K: case BPF_JMP | BPF_JGE | BPF_X: case BPF_JMP | BPF_JGT | BPF_K: case BPF_JMP | BPF_JGT | BPF_X: case BPF_JMP | BPF_JSET | BPF_K: case BPF_JMP | BPF_JSET | BPF_X: /* Both conditionals must be safe */ if (pc + ftest->jt + 1 >= flen || pc + ftest->jf + 1 >= flen) return -EINVAL; break; case BPF_LD | BPF_W | BPF_ABS: case BPF_LD | BPF_H | BPF_ABS: case BPF_LD | BPF_B | BPF_ABS: anc_found = false; if (bpf_anc_helper(ftest) & BPF_ANC) anc_found = true; /* Ancillary operation unknown or unsupported */ if (anc_found == false && ftest->k >= SKF_AD_OFF) return -EINVAL; } } /* Last instruction must be a RET code */ switch (filter[flen - 1].code) { case BPF_RET | BPF_K: case BPF_RET | BPF_A: return check_load_and_stores(filter, flen); } return -EINVAL; } static int bpf_prog_store_orig_filter(struct bpf_prog *fp, const struct sock_fprog *fprog) { unsigned int fsize = bpf_classic_proglen(fprog); struct sock_fprog_kern *fkprog; fp->orig_prog = kmalloc(sizeof(*fkprog), GFP_KERNEL); if (!fp->orig_prog) return -ENOMEM; fkprog = fp->orig_prog; fkprog->len = fprog->len; fkprog->filter = kmemdup(fp->insns, fsize, GFP_KERNEL | __GFP_NOWARN); if (!fkprog->filter) { kfree(fp->orig_prog); return -ENOMEM; } return 0; } static void bpf_release_orig_filter(struct bpf_prog *fp) { struct sock_fprog_kern *fprog = fp->orig_prog; if (fprog) { kfree(fprog->filter); kfree(fprog); } } static void __bpf_prog_release(struct bpf_prog *prog) { if (prog->type == BPF_PROG_TYPE_SOCKET_FILTER) { bpf_prog_put(prog); } else { bpf_release_orig_filter(prog); bpf_prog_free(prog); } } static void __sk_filter_release(struct sk_filter *fp) { __bpf_prog_release(fp->prog); kfree(fp); } /** * sk_filter_release_rcu - Release a socket filter by rcu_head * 
@rcu: rcu_head that contains the sk_filter to free */ static void sk_filter_release_rcu(struct rcu_head *rcu) { struct sk_filter *fp = container_of(rcu, struct sk_filter, rcu); __sk_filter_release(fp); } /** * sk_filter_release - release a socket filter * @fp: filter to remove * * Remove a filter from a socket and release its resources. */ static void sk_filter_release(struct sk_filter *fp) { if (refcount_dec_and_test(&fp->refcnt)) call_rcu(&fp->rcu, sk_filter_release_rcu); } void sk_filter_uncharge(struct sock *sk, struct sk_filter *fp) { u32 filter_size = bpf_prog_size(fp->prog->len); atomic_sub(filter_size, &sk->sk_omem_alloc); sk_filter_release(fp); } /* try to charge the socket memory if there is space available * return true on success */ static bool __sk_filter_charge(struct sock *sk, struct sk_filter *fp) { int optmem_max = READ_ONCE(sock_net(sk)->core.sysctl_optmem_max); u32 filter_size = bpf_prog_size(fp->prog->len); /* same check as in sock_kmalloc() */ if (filter_size <= optmem_max && atomic_read(&sk->sk_omem_alloc) + filter_size < optmem_max) { atomic_add(filter_size, &sk->sk_omem_alloc); return true; } return false; } bool sk_filter_charge(struct sock *sk, struct sk_filter *fp) { if (!refcount_inc_not_zero(&fp->refcnt)) return false; if (!__sk_filter_charge(sk, fp)) { sk_filter_release(fp); return false; } return true; } static struct bpf_prog *bpf_migrate_filter(struct bpf_prog *fp) { struct sock_filter *old_prog; struct bpf_prog *old_fp; int err, new_len, old_len = fp->len; bool seen_ld_abs = false; /* We are free to overwrite insns et al right here as it won't be used at * this point in time anymore internally after the migration to the eBPF * instruction representation. */ BUILD_BUG_ON(sizeof(struct sock_filter) != sizeof(struct bpf_insn)); /* Conversion cannot happen on overlapping memory areas, * so we need to keep the user BPF around until the 2nd * pass. At this time, the user BPF is stored in fp->insns. */ old_prog = kmemdup_array(fp->insns, old_len, sizeof(struct sock_filter), GFP_KERNEL | __GFP_NOWARN); if (!old_prog) { err = -ENOMEM; goto out_err; } /* 1st pass: calculate the new program length. */ err = bpf_convert_filter(old_prog, old_len, NULL, &new_len, &seen_ld_abs); if (err) goto out_err_free; /* Expand fp for appending the new filter representation. */ old_fp = fp; fp = bpf_prog_realloc(old_fp, bpf_prog_size(new_len), 0); if (!fp) { /* The old_fp is still around in case we couldn't * allocate new memory, so uncharge on that one. */ fp = old_fp; err = -ENOMEM; goto out_err_free; } fp->len = new_len; /* 2nd pass: remap sock_filter insns into bpf_insn insns. */ err = bpf_convert_filter(old_prog, old_len, fp, &new_len, &seen_ld_abs); if (err) /* 2nd bpf_convert_filter() can fail only if it fails * to allocate memory, remapping must succeed. Note, * that at this time old_fp has already been released * by krealloc(). */ goto out_err_free; fp = bpf_prog_select_runtime(fp, &err); if (err) goto out_err_free; kfree(old_prog); return fp; out_err_free: kfree(old_prog); out_err: __bpf_prog_release(fp); return ERR_PTR(err); } static struct bpf_prog *bpf_prepare_filter(struct bpf_prog *fp, bpf_aux_classic_check_t trans) { int err; fp->bpf_func = NULL; fp->jited = 0; err = bpf_check_classic(fp->insns, fp->len); if (err) { __bpf_prog_release(fp); return ERR_PTR(err); } /* There might be additional checks and transformations * needed on classic filters, f.e. in case of seccomp. 
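	 * A trans handler has the bpf_aux_classic_check_t signature and runs
	 * on the still-classic insns. A hypothetical handler (illustrative
	 * sketch only, not an in-tree example) that rejects absolute packet
	 * loads could look like:
	 *
	 *	static int no_abs_loads(struct sock_filter *filter,
	 *				unsigned int flen)
	 *	{
	 *		unsigned int pc;
	 *
	 *		for (pc = 0; pc < flen; pc++)
	 *			if (BPF_CLASS(filter[pc].code) == BPF_LD &&
	 *			    BPF_MODE(filter[pc].code) == BPF_ABS)
	 *				return -EPERM;
	 *		return 0;
	 *	}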
*/ if (trans) { err = trans(fp->insns, fp->len); if (err) { __bpf_prog_release(fp); return ERR_PTR(err); } } /* Probe if we can JIT compile the filter and if so, do * the compilation of the filter. */ bpf_jit_compile(fp); /* JIT compiler couldn't process this filter, so do the eBPF translation * for the optimized interpreter. */ if (!fp->jited) fp = bpf_migrate_filter(fp); return fp; } /** * bpf_prog_create - create an unattached filter * @pfp: the unattached filter that is created * @fprog: the filter program * * Create a filter independent of any socket. We first run some * sanity checks on it to make sure it does not explode on us later. * If an error occurs or there is insufficient memory for the filter * a negative errno code is returned. On success the return is zero. */ int bpf_prog_create(struct bpf_prog **pfp, struct sock_fprog_kern *fprog) { unsigned int fsize = bpf_classic_proglen(fprog); struct bpf_prog *fp; /* Make sure new filter is there and in the right amounts. */ if (!bpf_check_basics_ok(fprog->filter, fprog->len)) return -EINVAL; fp = bpf_prog_alloc(bpf_prog_size(fprog->len), 0); if (!fp) return -ENOMEM; memcpy(fp->insns, fprog->filter, fsize); fp->len = fprog->len; /* Since unattached filters are not copied back to user * space through sk_get_filter(), we do not need to hold * a copy here, and can spare us the work. */ fp->orig_prog = NULL; /* bpf_prepare_filter() already takes care of freeing * memory in case something goes wrong. */ fp = bpf_prepare_filter(fp, NULL); if (IS_ERR(fp)) return PTR_ERR(fp); *pfp = fp; return 0; } EXPORT_SYMBOL_GPL(bpf_prog_create); /** * bpf_prog_create_from_user - create an unattached filter from user buffer * @pfp: the unattached filter that is created * @fprog: the filter program * @trans: post-classic verifier transformation handler * @save_orig: save classic BPF program * * This function effectively does the same as bpf_prog_create(), only * that it builds up its insns buffer from user space provided buffer. * It also allows for passing a bpf_aux_classic_check_t handler. */ int bpf_prog_create_from_user(struct bpf_prog **pfp, struct sock_fprog *fprog, bpf_aux_classic_check_t trans, bool save_orig) { unsigned int fsize = bpf_classic_proglen(fprog); struct bpf_prog *fp; int err; /* Make sure new filter is there and in the right amounts. */ if (!bpf_check_basics_ok(fprog->filter, fprog->len)) return -EINVAL; fp = bpf_prog_alloc(bpf_prog_size(fprog->len), 0); if (!fp) return -ENOMEM; if (copy_from_user(fp->insns, fprog->filter, fsize)) { __bpf_prog_free(fp); return -EFAULT; } fp->len = fprog->len; fp->orig_prog = NULL; if (save_orig) { err = bpf_prog_store_orig_filter(fp, fprog); if (err) { __bpf_prog_free(fp); return -ENOMEM; } } /* bpf_prepare_filter() already takes care of freeing * memory in case something goes wrong. 
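	 * A hedged kernel-side usage sketch for the bpf_prog_create*()
	 * helpers in this block (hypothetical caller; names illustrative):
	 *
	 *	static struct sock_filter accept_all[] = {
	 *		{ BPF_RET | BPF_K, 0, 0, 0xffffffff },	// ret #-1
	 *	};
	 *	static struct sock_fprog_kern accept_all_fprog = {
	 *		.len	= ARRAY_SIZE(accept_all),
	 *		.filter	= accept_all,
	 *	};
	 *	struct bpf_prog *prog;
	 *	int err = bpf_prog_create(&prog, &accept_all_fprog);
	 *
	 * with the program later run via bpf_prog_run() and released with
	 * bpf_prog_destroy().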
*/ fp = bpf_prepare_filter(fp, trans); if (IS_ERR(fp)) return PTR_ERR(fp); *pfp = fp; return 0; } EXPORT_SYMBOL_GPL(bpf_prog_create_from_user); void bpf_prog_destroy(struct bpf_prog *fp) { __bpf_prog_release(fp); } EXPORT_SYMBOL_GPL(bpf_prog_destroy); static int __sk_attach_prog(struct bpf_prog *prog, struct sock *sk) { struct sk_filter *fp, *old_fp; fp = kmalloc(sizeof(*fp), GFP_KERNEL); if (!fp) return -ENOMEM; fp->prog = prog; if (!__sk_filter_charge(sk, fp)) { kfree(fp); return -ENOMEM; } refcount_set(&fp->refcnt, 1); old_fp = rcu_dereference_protected(sk->sk_filter, lockdep_sock_is_held(sk)); rcu_assign_pointer(sk->sk_filter, fp); if (old_fp) sk_filter_uncharge(sk, old_fp); return 0; } static struct bpf_prog *__get_filter(struct sock_fprog *fprog, struct sock *sk) { unsigned int fsize = bpf_classic_proglen(fprog); struct bpf_prog *prog; int err; if (sock_flag(sk, SOCK_FILTER_LOCKED)) return ERR_PTR(-EPERM); /* Make sure new filter is there and in the right amounts. */ if (!bpf_check_basics_ok(fprog->filter, fprog->len)) return ERR_PTR(-EINVAL); prog = bpf_prog_alloc(bpf_prog_size(fprog->len), 0); if (!prog) return ERR_PTR(-ENOMEM); if (copy_from_user(prog->insns, fprog->filter, fsize)) { __bpf_prog_free(prog); return ERR_PTR(-EFAULT); } prog->len = fprog->len; err = bpf_prog_store_orig_filter(prog, fprog); if (err) { __bpf_prog_free(prog); return ERR_PTR(-ENOMEM); } /* bpf_prepare_filter() already takes care of freeing * memory in case something goes wrong. */ return bpf_prepare_filter(prog, NULL); } /** * sk_attach_filter - attach a socket filter * @fprog: the filter program * @sk: the socket to use * * Attach the user's filter code. We first run some sanity checks on * it to make sure it does not explode on us later. If an error * occurs or there is insufficient memory for the filter a negative * errno code is returned. On success the return is zero. */ int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk) { struct bpf_prog *prog = __get_filter(fprog, sk); int err; if (IS_ERR(prog)) return PTR_ERR(prog); err = __sk_attach_prog(prog, sk); if (err < 0) { __bpf_prog_release(prog); return err; } return 0; } EXPORT_SYMBOL_GPL(sk_attach_filter); int sk_reuseport_attach_filter(struct sock_fprog *fprog, struct sock *sk) { struct bpf_prog *prog = __get_filter(fprog, sk); int err, optmem_max; if (IS_ERR(prog)) return PTR_ERR(prog); optmem_max = READ_ONCE(sock_net(sk)->core.sysctl_optmem_max); if (bpf_prog_size(prog->len) > optmem_max) err = -ENOMEM; else err = reuseport_attach_prog(sk, prog); if (err) __bpf_prog_release(prog); return err; } static struct bpf_prog *__get_bpf(u32 ufd, struct sock *sk) { if (sock_flag(sk, SOCK_FILTER_LOCKED)) return ERR_PTR(-EPERM); return bpf_prog_get_type(ufd, BPF_PROG_TYPE_SOCKET_FILTER); } int sk_attach_bpf(u32 ufd, struct sock *sk) { struct bpf_prog *prog = __get_bpf(ufd, sk); int err; if (IS_ERR(prog)) return PTR_ERR(prog); err = __sk_attach_prog(prog, sk); if (err < 0) { bpf_prog_put(prog); return err; } return 0; } int sk_reuseport_attach_bpf(u32 ufd, struct sock *sk) { struct bpf_prog *prog; int err, optmem_max; if (sock_flag(sk, SOCK_FILTER_LOCKED)) return -EPERM; prog = bpf_prog_get_type(ufd, BPF_PROG_TYPE_SOCKET_FILTER); if (PTR_ERR(prog) == -EINVAL) prog = bpf_prog_get_type(ufd, BPF_PROG_TYPE_SK_REUSEPORT); if (IS_ERR(prog)) return PTR_ERR(prog); if (prog->type == BPF_PROG_TYPE_SK_REUSEPORT) { /* Like other non BPF_PROG_TYPE_SOCKET_FILTER * bpf prog (e.g. sockmap). It depends on the * limitation imposed by bpf_prog_load(). 
* Hence, sysctl_optmem_max is not checked. */ if ((sk->sk_type != SOCK_STREAM && sk->sk_type != SOCK_DGRAM) || (sk->sk_protocol != IPPROTO_UDP && sk->sk_protocol != IPPROTO_TCP) || (sk->sk_family != AF_INET && sk->sk_family != AF_INET6)) { err = -ENOTSUPP; goto err_prog_put; } } else { /* BPF_PROG_TYPE_SOCKET_FILTER */ optmem_max = READ_ONCE(sock_net(sk)->core.sysctl_optmem_max); if (bpf_prog_size(prog->len) > optmem_max) { err = -ENOMEM; goto err_prog_put; } } err = reuseport_attach_prog(sk, prog); err_prog_put: if (err) bpf_prog_put(prog); return err; } void sk_reuseport_prog_free(struct bpf_prog *prog) { if (!prog) return; if (prog->type == BPF_PROG_TYPE_SK_REUSEPORT) bpf_prog_put(prog); else bpf_prog_destroy(prog); } static inline int __bpf_try_make_writable(struct sk_buff *skb, unsigned int write_len) { #ifdef CONFIG_DEBUG_NET /* Avoid a splat in pskb_may_pull_reason() */ if (write_len > INT_MAX) return -EINVAL; #endif return skb_ensure_writable(skb, write_len); } static inline int bpf_try_make_writable(struct sk_buff *skb, unsigned int write_len) { int err = __bpf_try_make_writable(skb, write_len); bpf_compute_data_pointers(skb); return err; } static int bpf_try_make_head_writable(struct sk_buff *skb) { return bpf_try_make_writable(skb, skb_headlen(skb)); } static inline void bpf_push_mac_rcsum(struct sk_buff *skb) { if (skb_at_tc_ingress(skb)) skb_postpush_rcsum(skb, skb_mac_header(skb), skb->mac_len); } static inline void bpf_pull_mac_rcsum(struct sk_buff *skb) { if (skb_at_tc_ingress(skb)) skb_postpull_rcsum(skb, skb_mac_header(skb), skb->mac_len); } BPF_CALL_5(bpf_skb_store_bytes, struct sk_buff *, skb, u32, offset, const void *, from, u32, len, u64, flags) { void *ptr; if (unlikely(flags & ~(BPF_F_RECOMPUTE_CSUM | BPF_F_INVALIDATE_HASH))) return -EINVAL; if (unlikely(offset > INT_MAX)) return -EFAULT; if (unlikely(bpf_try_make_writable(skb, offset + len))) return -EFAULT; ptr = skb->data + offset; if (flags & BPF_F_RECOMPUTE_CSUM) __skb_postpull_rcsum(skb, ptr, len, offset); memcpy(ptr, from, len); if (flags & BPF_F_RECOMPUTE_CSUM) __skb_postpush_rcsum(skb, ptr, len, offset); if (flags & BPF_F_INVALIDATE_HASH) skb_clear_hash(skb); return 0; } static const struct bpf_func_proto bpf_skb_store_bytes_proto = { .func = bpf_skb_store_bytes, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg4_type = ARG_CONST_SIZE, .arg5_type = ARG_ANYTHING, }; int __bpf_skb_store_bytes(struct sk_buff *skb, u32 offset, const void *from, u32 len, u64 flags) { return ____bpf_skb_store_bytes(skb, offset, from, len, flags); } BPF_CALL_4(bpf_skb_load_bytes, const struct sk_buff *, skb, u32, offset, void *, to, u32, len) { void *ptr; if (unlikely(offset > INT_MAX)) goto err_clear; ptr = skb_header_pointer(skb, offset, len, to); if (unlikely(!ptr)) goto err_clear; if (ptr != to) memcpy(to, ptr, len); return 0; err_clear: memset(to, 0, len); return -EFAULT; } static const struct bpf_func_proto bpf_skb_load_bytes_proto = { .func = bpf_skb_load_bytes, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_PTR_TO_UNINIT_MEM, .arg4_type = ARG_CONST_SIZE, }; int __bpf_skb_load_bytes(const struct sk_buff *skb, u32 offset, void *to, u32 len) { return ____bpf_skb_load_bytes(skb, offset, to, len); } BPF_CALL_4(bpf_flow_dissector_load_bytes, const struct bpf_flow_dissector *, ctx, u32, offset, void *, to, u32, len) { void *ptr; if (unlikely(offset > 0xffff)) goto 
err_clear; if (unlikely(!ctx->skb)) goto err_clear; ptr = skb_header_pointer(ctx->skb, offset, len, to); if (unlikely(!ptr)) goto err_clear; if (ptr != to) memcpy(to, ptr, len); return 0; err_clear: memset(to, 0, len); return -EFAULT; } static const struct bpf_func_proto bpf_flow_dissector_load_bytes_proto = { .func = bpf_flow_dissector_load_bytes, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_PTR_TO_UNINIT_MEM, .arg4_type = ARG_CONST_SIZE, }; BPF_CALL_5(bpf_skb_load_bytes_relative, const struct sk_buff *, skb, u32, offset, void *, to, u32, len, u32, start_header) { u8 *end = skb_tail_pointer(skb); u8 *start, *ptr; if (unlikely(offset > 0xffff)) goto err_clear; switch (start_header) { case BPF_HDR_START_MAC: if (unlikely(!skb_mac_header_was_set(skb))) goto err_clear; start = skb_mac_header(skb); break; case BPF_HDR_START_NET: start = skb_network_header(skb); break; default: goto err_clear; } ptr = start + offset; if (likely(ptr + len <= end)) { memcpy(to, ptr, len); return 0; } err_clear: memset(to, 0, len); return -EFAULT; } static const struct bpf_func_proto bpf_skb_load_bytes_relative_proto = { .func = bpf_skb_load_bytes_relative, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_PTR_TO_UNINIT_MEM, .arg4_type = ARG_CONST_SIZE, .arg5_type = ARG_ANYTHING, }; BPF_CALL_2(bpf_skb_pull_data, struct sk_buff *, skb, u32, len) { /* Idea is the following: should the needed direct read/write * test fail during runtime, we can pull in more data and redo * again, since implicitly, we invalidate previous checks here. * * Or, since we know how much we need to make read/writeable, * this can be done once at the program beginning for direct * access case. By this we overcome limitations of only current * headroom being accessible. */ return bpf_try_make_writable(skb, len ? : skb_headlen(skb)); } static const struct bpf_func_proto bpf_skb_pull_data_proto = { .func = bpf_skb_pull_data, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, }; BPF_CALL_1(bpf_sk_fullsock, struct sock *, sk) { return sk_fullsock(sk) ? (unsigned long)sk : (unsigned long)NULL; } static const struct bpf_func_proto bpf_sk_fullsock_proto = { .func = bpf_sk_fullsock, .gpl_only = false, .ret_type = RET_PTR_TO_SOCKET_OR_NULL, .arg1_type = ARG_PTR_TO_SOCK_COMMON, }; static inline int sk_skb_try_make_writable(struct sk_buff *skb, unsigned int write_len) { return __bpf_try_make_writable(skb, write_len); } BPF_CALL_2(sk_skb_pull_data, struct sk_buff *, skb, u32, len) { /* Idea is the following: should the needed direct read/write * test fail during runtime, we can pull in more data and redo * again, since implicitly, we invalidate previous checks here. * * Or, since we know how much we need to make read/writeable, * this can be done once at the program beginning for direct * access case. By this we overcome limitations of only current * headroom being accessible. */ return sk_skb_try_make_writable(skb, len ? 
: skb_headlen(skb)); } static const struct bpf_func_proto sk_skb_pull_data_proto = { .func = sk_skb_pull_data, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, }; BPF_CALL_5(bpf_l3_csum_replace, struct sk_buff *, skb, u32, offset, u64, from, u64, to, u64, flags) { __sum16 *ptr; if (unlikely(flags & ~(BPF_F_HDR_FIELD_MASK))) return -EINVAL; if (unlikely(offset > 0xffff || offset & 1)) return -EFAULT; if (unlikely(bpf_try_make_writable(skb, offset + sizeof(*ptr)))) return -EFAULT; ptr = (__sum16 *)(skb->data + offset); switch (flags & BPF_F_HDR_FIELD_MASK) { case 0: if (unlikely(from != 0)) return -EINVAL; csum_replace_by_diff(ptr, to); break; case 2: csum_replace2(ptr, from, to); break; case 4: csum_replace4(ptr, from, to); break; default: return -EINVAL; } return 0; } static const struct bpf_func_proto bpf_l3_csum_replace_proto = { .func = bpf_l3_csum_replace, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_ANYTHING, .arg4_type = ARG_ANYTHING, .arg5_type = ARG_ANYTHING, }; BPF_CALL_5(bpf_l4_csum_replace, struct sk_buff *, skb, u32, offset, u64, from, u64, to, u64, flags) { bool is_pseudo = flags & BPF_F_PSEUDO_HDR; bool is_mmzero = flags & BPF_F_MARK_MANGLED_0; bool do_mforce = flags & BPF_F_MARK_ENFORCE; bool is_ipv6 = flags & BPF_F_IPV6; __sum16 *ptr; if (unlikely(flags & ~(BPF_F_MARK_MANGLED_0 | BPF_F_MARK_ENFORCE | BPF_F_PSEUDO_HDR | BPF_F_HDR_FIELD_MASK | BPF_F_IPV6))) return -EINVAL; if (unlikely(offset > 0xffff || offset & 1)) return -EFAULT; if (unlikely(bpf_try_make_writable(skb, offset + sizeof(*ptr)))) return -EFAULT; ptr = (__sum16 *)(skb->data + offset); if (is_mmzero && !do_mforce && !*ptr) return 0; switch (flags & BPF_F_HDR_FIELD_MASK) { case 0: if (unlikely(from != 0)) return -EINVAL; inet_proto_csum_replace_by_diff(ptr, skb, to, is_pseudo, is_ipv6); break; case 2: inet_proto_csum_replace2(ptr, skb, from, to, is_pseudo); break; case 4: inet_proto_csum_replace4(ptr, skb, from, to, is_pseudo); break; default: return -EINVAL; } if (is_mmzero && !*ptr) *ptr = CSUM_MANGLED_0; return 0; } static const struct bpf_func_proto bpf_l4_csum_replace_proto = { .func = bpf_l4_csum_replace, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_ANYTHING, .arg4_type = ARG_ANYTHING, .arg5_type = ARG_ANYTHING, }; BPF_CALL_5(bpf_csum_diff, __be32 *, from, u32, from_size, __be32 *, to, u32, to_size, __wsum, seed) { /* This is quite flexible, some examples: * * from_size == 0, to_size > 0, seed := csum --> pushing data * from_size > 0, to_size == 0, seed := csum --> pulling data * from_size > 0, to_size > 0, seed := 0 --> diffing data * * Even for diffing, from_size and to_size don't need to be equal. 
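 *
 * Illustrative usage sketch (not part of this file): a tc BPF program
 * rewriting the IPv4 destination address would typically combine this
 * helper with the csum-replace helpers below. IP_DST_OFF, IP_CSUM_OFF
 * and TCP_CSUM_OFF are hypothetical offsets used only for the example:
 *
 *	__wsum diff = bpf_csum_diff(&old_ip, sizeof(old_ip),
 *				    &new_ip, sizeof(new_ip), 0);
 *	bpf_skb_store_bytes(skb, IP_DST_OFF, &new_ip, sizeof(new_ip), 0);
 *	bpf_l3_csum_replace(skb, IP_CSUM_OFF, 0, diff, 0);
 *	bpf_l4_csum_replace(skb, TCP_CSUM_OFF, 0, diff, BPF_F_PSEUDO_HDR);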
*/ __wsum ret = seed; if (from_size && to_size) ret = csum_sub(csum_partial(to, to_size, ret), csum_partial(from, from_size, 0)); else if (to_size) ret = csum_partial(to, to_size, ret); else if (from_size) ret = ~csum_partial(from, from_size, ~ret); return csum_from32to16((__force unsigned int)ret); } static const struct bpf_func_proto bpf_csum_diff_proto = { .func = bpf_csum_diff, .gpl_only = false, .pkt_access = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_MEM | PTR_MAYBE_NULL | MEM_RDONLY, .arg2_type = ARG_CONST_SIZE_OR_ZERO, .arg3_type = ARG_PTR_TO_MEM | PTR_MAYBE_NULL | MEM_RDONLY, .arg4_type = ARG_CONST_SIZE_OR_ZERO, .arg5_type = ARG_ANYTHING, }; BPF_CALL_2(bpf_csum_update, struct sk_buff *, skb, __wsum, csum) { /* The interface is to be used in combination with bpf_csum_diff() * for direct packet writes. csum rotation for alignment as well * as emulating csum_sub() can be done from the eBPF program. */ if (skb->ip_summed == CHECKSUM_COMPLETE) return (skb->csum = csum_add(skb->csum, csum)); return -ENOTSUPP; } static const struct bpf_func_proto bpf_csum_update_proto = { .func = bpf_csum_update, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, }; BPF_CALL_2(bpf_csum_level, struct sk_buff *, skb, u64, level) { /* The interface is to be used in combination with bpf_skb_adjust_room() * for encap/decap of packet headers when BPF_F_ADJ_ROOM_NO_CSUM_RESET * is passed as flags, for example. */ switch (level) { case BPF_CSUM_LEVEL_INC: __skb_incr_checksum_unnecessary(skb); break; case BPF_CSUM_LEVEL_DEC: __skb_decr_checksum_unnecessary(skb); break; case BPF_CSUM_LEVEL_RESET: __skb_reset_checksum_unnecessary(skb); break; case BPF_CSUM_LEVEL_QUERY: return skb->ip_summed == CHECKSUM_UNNECESSARY ? skb->csum_level : -EACCES; default: return -EINVAL; } return 0; } static const struct bpf_func_proto bpf_csum_level_proto = { .func = bpf_csum_level, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, }; static inline int __bpf_rx_skb(struct net_device *dev, struct sk_buff *skb) { return dev_forward_skb_nomtu(dev, skb); } static inline int __bpf_rx_skb_no_mac(struct net_device *dev, struct sk_buff *skb) { int ret = ____dev_forward_skb(dev, skb, false); if (likely(!ret)) { skb->dev = dev; ret = netif_rx(skb); } return ret; } static inline int __bpf_tx_skb(struct net_device *dev, struct sk_buff *skb) { int ret; if (dev_xmit_recursion()) { net_crit_ratelimited("bpf: recursion limit reached on datapath, buggy bpf program?\n"); kfree_skb(skb); return -ENETDOWN; } skb->dev = dev; skb_set_redirected_noclear(skb, skb_at_tc_ingress(skb)); skb_clear_tstamp(skb); dev_xmit_recursion_inc(); ret = dev_queue_xmit(skb); dev_xmit_recursion_dec(); return ret; } static int __bpf_redirect_no_mac(struct sk_buff *skb, struct net_device *dev, u32 flags) { unsigned int mlen = skb_network_offset(skb); if (unlikely(skb->len <= mlen)) { kfree_skb(skb); return -ERANGE; } if (mlen) { __skb_pull(skb, mlen); /* At ingress, the mac header has already been pulled once. * At egress, skb_pospull_rcsum has to be done in case that * the skb is originated from ingress (i.e. a forwarded skb) * to ensure that rcsum starts at net header. */ if (!skb_at_tc_ingress(skb)) skb_postpull_rcsum(skb, skb_mac_header(skb), mlen); } skb_pop_mac_header(skb); skb_reset_mac_len(skb); return flags & BPF_F_INGRESS ? 
__bpf_rx_skb_no_mac(dev, skb) : __bpf_tx_skb(dev, skb); } static int __bpf_redirect_common(struct sk_buff *skb, struct net_device *dev, u32 flags) { /* Verify that a link layer header is carried */ if (unlikely(skb->mac_header >= skb->network_header || skb->len == 0)) { kfree_skb(skb); return -ERANGE; } bpf_push_mac_rcsum(skb); return flags & BPF_F_INGRESS ? __bpf_rx_skb(dev, skb) : __bpf_tx_skb(dev, skb); } static int __bpf_redirect(struct sk_buff *skb, struct net_device *dev, u32 flags) { if (dev_is_mac_header_xmit(dev)) return __bpf_redirect_common(skb, dev, flags); else return __bpf_redirect_no_mac(skb, dev, flags); } #if IS_ENABLED(CONFIG_IPV6) static int bpf_out_neigh_v6(struct net *net, struct sk_buff *skb, struct net_device *dev, struct bpf_nh_params *nh) { u32 hh_len = LL_RESERVED_SPACE(dev); const struct in6_addr *nexthop; struct dst_entry *dst = NULL; struct neighbour *neigh; if (dev_xmit_recursion()) { net_crit_ratelimited("bpf: recursion limit reached on datapath, buggy bpf program?\n"); goto out_drop; } skb->dev = dev; skb_clear_tstamp(skb); if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) { skb = skb_expand_head(skb, hh_len); if (!skb) return -ENOMEM; } rcu_read_lock(); if (!nh) { dst = skb_dst(skb); nexthop = rt6_nexthop(dst_rt6_info(dst), &ipv6_hdr(skb)->daddr); } else { nexthop = &nh->ipv6_nh; } neigh = ip_neigh_gw6(dev, nexthop); if (likely(!IS_ERR(neigh))) { int ret; sock_confirm_neigh(skb, neigh); local_bh_disable(); dev_xmit_recursion_inc(); ret = neigh_output(neigh, skb, false); dev_xmit_recursion_dec(); local_bh_enable(); rcu_read_unlock(); return ret; } rcu_read_unlock(); if (dst) IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES); out_drop: kfree_skb(skb); return -ENETDOWN; } static int __bpf_redirect_neigh_v6(struct sk_buff *skb, struct net_device *dev, struct bpf_nh_params *nh) { const struct ipv6hdr *ip6h = ipv6_hdr(skb); struct net *net = dev_net(dev); int err, ret = NET_XMIT_DROP; if (!nh) { struct dst_entry *dst; struct flowi6 fl6 = { .flowi6_flags = FLOWI_FLAG_ANYSRC, .flowi6_mark = skb->mark, .flowlabel = ip6_flowinfo(ip6h), .flowi6_oif = dev->ifindex, .flowi6_proto = ip6h->nexthdr, .daddr = ip6h->daddr, .saddr = ip6h->saddr, }; dst = ipv6_stub->ipv6_dst_lookup_flow(net, NULL, &fl6, NULL); if (IS_ERR(dst)) goto out_drop; skb_dst_drop(skb); skb_dst_set(skb, dst); } else if (nh->nh_family != AF_INET6) { goto out_drop; } err = bpf_out_neigh_v6(net, skb, dev, nh); if (unlikely(net_xmit_eval(err))) DEV_STATS_INC(dev, tx_errors); else ret = NET_XMIT_SUCCESS; goto out_xmit; out_drop: DEV_STATS_INC(dev, tx_errors); kfree_skb(skb); out_xmit: return ret; } #else static int __bpf_redirect_neigh_v6(struct sk_buff *skb, struct net_device *dev, struct bpf_nh_params *nh) { kfree_skb(skb); return NET_XMIT_DROP; } #endif /* CONFIG_IPV6 */ #if IS_ENABLED(CONFIG_INET) static int bpf_out_neigh_v4(struct net *net, struct sk_buff *skb, struct net_device *dev, struct bpf_nh_params *nh) { u32 hh_len = LL_RESERVED_SPACE(dev); struct neighbour *neigh; bool is_v6gw = false; if (dev_xmit_recursion()) { net_crit_ratelimited("bpf: recursion limit reached on datapath, buggy bpf program?\n"); goto out_drop; } skb->dev = dev; skb_clear_tstamp(skb); if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) { skb = skb_expand_head(skb, hh_len); if (!skb) return -ENOMEM; } rcu_read_lock(); if (!nh) { struct rtable *rt = skb_rtable(skb); neigh = ip_neigh_for_gw(rt, skb, &is_v6gw); } else if (nh->nh_family == AF_INET6) { neigh = ip_neigh_gw6(dev, &nh->ipv6_nh); 
is_v6gw = true; } else if (nh->nh_family == AF_INET) { neigh = ip_neigh_gw4(dev, nh->ipv4_nh); } else { rcu_read_unlock(); goto out_drop; } if (likely(!IS_ERR(neigh))) { int ret; sock_confirm_neigh(skb, neigh); local_bh_disable(); dev_xmit_recursion_inc(); ret = neigh_output(neigh, skb, is_v6gw); dev_xmit_recursion_dec(); local_bh_enable(); rcu_read_unlock(); return ret; } rcu_read_unlock(); out_drop: kfree_skb(skb); return -ENETDOWN; } static int __bpf_redirect_neigh_v4(struct sk_buff *skb, struct net_device *dev, struct bpf_nh_params *nh) { const struct iphdr *ip4h = ip_hdr(skb); struct net *net = dev_net(dev); int err, ret = NET_XMIT_DROP; if (!nh) { struct flowi4 fl4 = { .flowi4_flags = FLOWI_FLAG_ANYSRC, .flowi4_mark = skb->mark, .flowi4_dscp = ip4h_dscp(ip4h), .flowi4_oif = dev->ifindex, .flowi4_proto = ip4h->protocol, .daddr = ip4h->daddr, .saddr = ip4h->saddr, }; struct rtable *rt; rt = ip_route_output_flow(net, &fl4, NULL); if (IS_ERR(rt)) goto out_drop; if (rt->rt_type != RTN_UNICAST && rt->rt_type != RTN_LOCAL) { ip_rt_put(rt); goto out_drop; } skb_dst_drop(skb); skb_dst_set(skb, &rt->dst); } err = bpf_out_neigh_v4(net, skb, dev, nh); if (unlikely(net_xmit_eval(err))) DEV_STATS_INC(dev, tx_errors); else ret = NET_XMIT_SUCCESS; goto out_xmit; out_drop: DEV_STATS_INC(dev, tx_errors); kfree_skb(skb); out_xmit: return ret; } #else static int __bpf_redirect_neigh_v4(struct sk_buff *skb, struct net_device *dev, struct bpf_nh_params *nh) { kfree_skb(skb); return NET_XMIT_DROP; } #endif /* CONFIG_INET */ static int __bpf_redirect_neigh(struct sk_buff *skb, struct net_device *dev, struct bpf_nh_params *nh) { struct ethhdr *ethh = eth_hdr(skb); if (unlikely(skb->mac_header >= skb->network_header)) goto out; bpf_push_mac_rcsum(skb); if (is_multicast_ether_addr(ethh->h_dest)) goto out; skb_pull(skb, sizeof(*ethh)); skb_unset_mac_header(skb); skb_reset_network_header(skb); if (skb->protocol == htons(ETH_P_IP)) return __bpf_redirect_neigh_v4(skb, dev, nh); else if (skb->protocol == htons(ETH_P_IPV6)) return __bpf_redirect_neigh_v6(skb, dev, nh); out: kfree_skb(skb); return -ENOTSUPP; } /* Internal, non-exposed redirect flags. */ enum { BPF_F_NEIGH = (1ULL << 16), BPF_F_PEER = (1ULL << 17), BPF_F_NEXTHOP = (1ULL << 18), #define BPF_F_REDIRECT_INTERNAL (BPF_F_NEIGH | BPF_F_PEER | BPF_F_NEXTHOP) }; BPF_CALL_3(bpf_clone_redirect, struct sk_buff *, skb, u32, ifindex, u64, flags) { struct net_device *dev; struct sk_buff *clone; int ret; BUILD_BUG_ON(BPF_F_REDIRECT_INTERNAL & BPF_F_REDIRECT_FLAGS); if (unlikely(flags & (~(BPF_F_INGRESS) | BPF_F_REDIRECT_INTERNAL))) return -EINVAL; /* BPF test infra's convert___skb_to_skb() can create type-less * GSO packets. gso_features_check() will detect this as a bad * offload. However, lets not leak them out in the first place. */ if (unlikely(skb_is_gso(skb) && !skb_shinfo(skb)->gso_type)) return -EBADMSG; dev = dev_get_by_index_rcu(dev_net(skb->dev), ifindex); if (unlikely(!dev)) return -EINVAL; clone = skb_clone(skb, GFP_ATOMIC); if (unlikely(!clone)) return -ENOMEM; /* For direct write, we need to keep the invariant that the skbs * we're dealing with need to be uncloned. Should uncloning fail * here, we need to free the just generated clone to unclone once * again. 
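 *
 * Usage sketch (illustrative only, not part of this file): a tc program
 * can transmit a copy of the packet out of a monitoring device while the
 * original skb continues through the stack; MIRROR_IFINDEX is a
 * placeholder ifindex:
 *
 *	bpf_clone_redirect(skb, MIRROR_IFINDEX, 0);
 *	return TC_ACT_OK;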
*/ ret = bpf_try_make_head_writable(skb); if (unlikely(ret)) { kfree_skb(clone); return -ENOMEM; } return __bpf_redirect(clone, dev, flags); } static const struct bpf_func_proto bpf_clone_redirect_proto = { .func = bpf_clone_redirect, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_ANYTHING, }; static struct net_device *skb_get_peer_dev(struct net_device *dev) { const struct net_device_ops *ops = dev->netdev_ops; if (likely(ops->ndo_get_peer_dev)) return INDIRECT_CALL_1(ops->ndo_get_peer_dev, netkit_peer_dev, dev); return NULL; } int skb_do_redirect(struct sk_buff *skb) { struct bpf_redirect_info *ri = bpf_net_ctx_get_ri(); struct net *net = dev_net(skb->dev); struct net_device *dev; u32 flags = ri->flags; dev = dev_get_by_index_rcu(net, ri->tgt_index); ri->tgt_index = 0; ri->flags = 0; if (unlikely(!dev)) goto out_drop; if (flags & BPF_F_PEER) { if (unlikely(!skb_at_tc_ingress(skb))) goto out_drop; dev = skb_get_peer_dev(dev); if (unlikely(!dev || !(dev->flags & IFF_UP) || net_eq(net, dev_net(dev)))) goto out_drop; skb->dev = dev; dev_sw_netstats_rx_add(dev, skb->len); skb_scrub_packet(skb, false); return -EAGAIN; } return flags & BPF_F_NEIGH ? __bpf_redirect_neigh(skb, dev, flags & BPF_F_NEXTHOP ? &ri->nh : NULL) : __bpf_redirect(skb, dev, flags); out_drop: kfree_skb(skb); return -EINVAL; } BPF_CALL_2(bpf_redirect, u32, ifindex, u64, flags) { struct bpf_redirect_info *ri = bpf_net_ctx_get_ri(); if (unlikely(flags & (~(BPF_F_INGRESS) | BPF_F_REDIRECT_INTERNAL))) return TC_ACT_SHOT; ri->flags = flags; ri->tgt_index = ifindex; return TC_ACT_REDIRECT; } static const struct bpf_func_proto bpf_redirect_proto = { .func = bpf_redirect, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_ANYTHING, .arg2_type = ARG_ANYTHING, }; BPF_CALL_2(bpf_redirect_peer, u32, ifindex, u64, flags) { struct bpf_redirect_info *ri = bpf_net_ctx_get_ri(); if (unlikely(flags)) return TC_ACT_SHOT; ri->flags = BPF_F_PEER; ri->tgt_index = ifindex; return TC_ACT_REDIRECT; } static const struct bpf_func_proto bpf_redirect_peer_proto = { .func = bpf_redirect_peer, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_ANYTHING, .arg2_type = ARG_ANYTHING, }; BPF_CALL_4(bpf_redirect_neigh, u32, ifindex, struct bpf_redir_neigh *, params, int, plen, u64, flags) { struct bpf_redirect_info *ri = bpf_net_ctx_get_ri(); if (unlikely((plen && plen < sizeof(*params)) || flags)) return TC_ACT_SHOT; ri->flags = BPF_F_NEIGH | (plen ? 
BPF_F_NEXTHOP : 0); ri->tgt_index = ifindex; BUILD_BUG_ON(sizeof(struct bpf_redir_neigh) != sizeof(struct bpf_nh_params)); if (plen) memcpy(&ri->nh, params, sizeof(ri->nh)); return TC_ACT_REDIRECT; } static const struct bpf_func_proto bpf_redirect_neigh_proto = { .func = bpf_redirect_neigh, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_ANYTHING, .arg2_type = ARG_PTR_TO_MEM | PTR_MAYBE_NULL | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE_OR_ZERO, .arg4_type = ARG_ANYTHING, }; BPF_CALL_2(bpf_msg_apply_bytes, struct sk_msg *, msg, u32, bytes) { msg->apply_bytes = bytes; return 0; } static const struct bpf_func_proto bpf_msg_apply_bytes_proto = { .func = bpf_msg_apply_bytes, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, }; BPF_CALL_2(bpf_msg_cork_bytes, struct sk_msg *, msg, u32, bytes) { msg->cork_bytes = bytes; return 0; } static void sk_msg_reset_curr(struct sk_msg *msg) { if (!msg->sg.size) { msg->sg.curr = msg->sg.start; msg->sg.copybreak = 0; } else { u32 i = msg->sg.end; sk_msg_iter_var_prev(i); msg->sg.curr = i; msg->sg.copybreak = msg->sg.data[i].length; } } static const struct bpf_func_proto bpf_msg_cork_bytes_proto = { .func = bpf_msg_cork_bytes, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, }; BPF_CALL_4(bpf_msg_pull_data, struct sk_msg *, msg, u32, start, u32, end, u64, flags) { u32 len = 0, offset = 0, copy = 0, poffset = 0, bytes = end - start; u32 first_sge, last_sge, i, shift, bytes_sg_total; struct scatterlist *sge; u8 *raw, *to, *from; struct page *page; if (unlikely(flags || end <= start)) return -EINVAL; /* First find the starting scatterlist element */ i = msg->sg.start; do { offset += len; len = sk_msg_elem(msg, i)->length; if (start < offset + len) break; sk_msg_iter_var_next(i); } while (i != msg->sg.end); if (unlikely(start >= offset + len)) return -EINVAL; first_sge = i; /* The start may point into the sg element so we need to also * account for the headroom. */ bytes_sg_total = start - offset + bytes; if (!test_bit(i, msg->sg.copy) && bytes_sg_total <= len) goto out; /* At this point we need to linearize multiple scatterlist * elements or a single shared page. Either way we need to * copy into a linear buffer exclusively owned by BPF. Then * place the buffer in the scatterlist and fixup the original * entries by removing the entries now in the linear buffer * and shifting the remaining entries. For now we do not try * to copy partial entries to avoid complexity of running out * of sg_entry slots. The downside is reading a single byte * will copy the entire sg entry. */ do { copy += sk_msg_elem(msg, i)->length; sk_msg_iter_var_next(i); if (bytes_sg_total <= copy) break; } while (i != msg->sg.end); last_sge = i; if (unlikely(bytes_sg_total > copy)) return -EINVAL; page = alloc_pages(__GFP_NOWARN | GFP_ATOMIC | __GFP_COMP, get_order(copy)); if (unlikely(!page)) return -ENOMEM; raw = page_address(page); i = first_sge; do { sge = sk_msg_elem(msg, i); from = sg_virt(sge); len = sge->length; to = raw + poffset; memcpy(to, from, len); poffset += len; sge->length = 0; put_page(sg_page(sge)); sk_msg_iter_var_next(i); } while (i != last_sge); sg_set_page(&msg->sg.data[first_sge], page, copy, 0); /* To repair sg ring we need to shift entries. If we only * had a single entry though we can just replace it and * be done. Otherwise walk the ring and shift the entries. */ WARN_ON_ONCE(last_sge == first_sge); shift = last_sge > first_sge ? 
last_sge - first_sge - 1 : NR_MSG_FRAG_IDS - first_sge + last_sge - 1; if (!shift) goto out; i = first_sge; sk_msg_iter_var_next(i); do { u32 move_from; if (i + shift >= NR_MSG_FRAG_IDS) move_from = i + shift - NR_MSG_FRAG_IDS; else move_from = i + shift; if (move_from == msg->sg.end) break; msg->sg.data[i] = msg->sg.data[move_from]; msg->sg.data[move_from].length = 0; msg->sg.data[move_from].page_link = 0; msg->sg.data[move_from].offset = 0; sk_msg_iter_var_next(i); } while (1); msg->sg.end = msg->sg.end - shift > msg->sg.end ? msg->sg.end - shift + NR_MSG_FRAG_IDS : msg->sg.end - shift; out: sk_msg_reset_curr(msg); msg->data = sg_virt(&msg->sg.data[first_sge]) + start - offset; msg->data_end = msg->data + bytes; return 0; } static const struct bpf_func_proto bpf_msg_pull_data_proto = { .func = bpf_msg_pull_data, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_ANYTHING, .arg4_type = ARG_ANYTHING, }; BPF_CALL_4(bpf_msg_push_data, struct sk_msg *, msg, u32, start, u32, len, u64, flags) { struct scatterlist sge, nsge, nnsge, rsge = {0}, *psge; u32 new, i = 0, l = 0, space, copy = 0, offset = 0; u8 *raw, *to, *from; struct page *page; if (unlikely(flags)) return -EINVAL; if (unlikely(len == 0)) return 0; /* First find the starting scatterlist element */ i = msg->sg.start; do { offset += l; l = sk_msg_elem(msg, i)->length; if (start < offset + l) break; sk_msg_iter_var_next(i); } while (i != msg->sg.end); if (start > offset + l) return -EINVAL; space = MAX_MSG_FRAGS - sk_msg_elem_used(msg); /* If no space available will fallback to copy, we need at * least one scatterlist elem available to push data into * when start aligns to the beginning of an element or two * when it falls inside an element. We handle the start equals * offset case because its the common case for inserting a * header. 
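 *
 * Usage sketch (illustrative, not part of this file): an SK_MSG verdict
 * program reserving room for a 16-byte application header in front of
 * the payload; the length is arbitrary for the example:
 *
 *	if (bpf_msg_push_data(msg, 0, 16, 0))
 *		return SK_DROP;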
*/ if (!space || (space == 1 && start != offset)) copy = msg->sg.data[i].length; page = alloc_pages(__GFP_NOWARN | GFP_ATOMIC | __GFP_COMP, get_order(copy + len)); if (unlikely(!page)) return -ENOMEM; if (copy) { int front, back; raw = page_address(page); if (i == msg->sg.end) sk_msg_iter_var_prev(i); psge = sk_msg_elem(msg, i); front = start - offset; back = psge->length - front; from = sg_virt(psge); if (front) memcpy(raw, from, front); if (back) { from += front; to = raw + front + len; memcpy(to, from, back); } put_page(sg_page(psge)); new = i; goto place_new; } if (start - offset) { if (i == msg->sg.end) sk_msg_iter_var_prev(i); psge = sk_msg_elem(msg, i); rsge = sk_msg_elem_cpy(msg, i); psge->length = start - offset; rsge.length -= psge->length; rsge.offset += start; sk_msg_iter_var_next(i); sg_unmark_end(psge); sg_unmark_end(&rsge); } /* Slot(s) to place newly allocated data */ sk_msg_iter_next(msg, end); new = i; sk_msg_iter_var_next(i); if (i == msg->sg.end) { if (!rsge.length) goto place_new; sk_msg_iter_next(msg, end); goto place_new; } /* Shift one or two slots as needed */ sge = sk_msg_elem_cpy(msg, new); sg_unmark_end(&sge); nsge = sk_msg_elem_cpy(msg, i); if (rsge.length) { sk_msg_iter_var_next(i); nnsge = sk_msg_elem_cpy(msg, i); sk_msg_iter_next(msg, end); } while (i != msg->sg.end) { msg->sg.data[i] = sge; sge = nsge; sk_msg_iter_var_next(i); if (rsge.length) { nsge = nnsge; nnsge = sk_msg_elem_cpy(msg, i); } else { nsge = sk_msg_elem_cpy(msg, i); } } place_new: /* Place newly allocated data buffer */ sk_mem_charge(msg->sk, len); msg->sg.size += len; __clear_bit(new, msg->sg.copy); sg_set_page(&msg->sg.data[new], page, len + copy, 0); if (rsge.length) { get_page(sg_page(&rsge)); sk_msg_iter_var_next(new); msg->sg.data[new] = rsge; } sk_msg_reset_curr(msg); sk_msg_compute_data_pointers(msg); return 0; } static const struct bpf_func_proto bpf_msg_push_data_proto = { .func = bpf_msg_push_data, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_ANYTHING, .arg4_type = ARG_ANYTHING, }; static void sk_msg_shift_left(struct sk_msg *msg, int i) { struct scatterlist *sge = sk_msg_elem(msg, i); int prev; put_page(sg_page(sge)); do { prev = i; sk_msg_iter_var_next(i); msg->sg.data[prev] = msg->sg.data[i]; } while (i != msg->sg.end); sk_msg_iter_prev(msg, end); } static void sk_msg_shift_right(struct sk_msg *msg, int i) { struct scatterlist tmp, sge; sk_msg_iter_next(msg, end); sge = sk_msg_elem_cpy(msg, i); sk_msg_iter_var_next(i); tmp = sk_msg_elem_cpy(msg, i); while (i != msg->sg.end) { msg->sg.data[i] = sge; sk_msg_iter_var_next(i); sge = tmp; tmp = sk_msg_elem_cpy(msg, i); } } BPF_CALL_4(bpf_msg_pop_data, struct sk_msg *, msg, u32, start, u32, len, u64, flags) { u32 i = 0, l = 0, space, offset = 0; u64 last = start + len; int pop; if (unlikely(flags)) return -EINVAL; if (unlikely(len == 0)) return 0; /* First find the starting scatterlist element */ i = msg->sg.start; do { offset += l; l = sk_msg_elem(msg, i)->length; if (start < offset + l) break; sk_msg_iter_var_next(i); } while (i != msg->sg.end); /* Bounds checks: start and pop must be inside message */ if (start >= offset + l || last > msg->sg.size) return -EINVAL; space = MAX_MSG_FRAGS - sk_msg_elem_used(msg); pop = len; /* --------------| offset * -| start |-------- len -------| * * |----- a ----|-------- pop -------|----- b ----| * |______________________________________________| length * * * a: region at front of scatter element to save * b: region at back 
of scatter element to save when length > A + pop * pop: region to pop from element, same as input 'pop' here will be * decremented below per iteration. * * Two top-level cases to handle when start != offset, first B is non * zero and second B is zero corresponding to when a pop includes more * than one element. * * Then if B is non-zero AND there is no space allocate space and * compact A, B regions into page. If there is space shift ring to * the right free'ing the next element in ring to place B, leaving * A untouched except to reduce length. */ if (start != offset) { struct scatterlist *nsge, *sge = sk_msg_elem(msg, i); int a = start - offset; int b = sge->length - pop - a; sk_msg_iter_var_next(i); if (b > 0) { if (space) { sge->length = a; sk_msg_shift_right(msg, i); nsge = sk_msg_elem(msg, i); get_page(sg_page(sge)); sg_set_page(nsge, sg_page(sge), b, sge->offset + pop + a); } else { struct page *page, *orig; u8 *to, *from; page = alloc_pages(__GFP_NOWARN | __GFP_COMP | GFP_ATOMIC, get_order(a + b)); if (unlikely(!page)) return -ENOMEM; orig = sg_page(sge); from = sg_virt(sge); to = page_address(page); memcpy(to, from, a); memcpy(to + a, from + a + pop, b); sg_set_page(sge, page, a + b, 0); put_page(orig); } pop = 0; } else { pop -= (sge->length - a); sge->length = a; } } /* From above the current layout _must_ be as follows, * * -| offset * -| start * * |---- pop ---|---------------- b ------------| * |____________________________________________| length * * Offset and start of the current msg elem are equal because in the * previous case we handled offset != start and either consumed the * entire element and advanced to the next element OR pop == 0. * * Two cases to handle here are first pop is less than the length * leaving some remainder b above. Simply adjust the element's layout * in this case. Or pop >= length of the element so that b = 0. In this * case advance to next element decrementing pop. 
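 *
 * Usage sketch (illustrative, not part of this file): the counterpart of
 * the push case, stripping a 16-byte header that starts at the beginning
 * of the message:
 *
 *	if (bpf_msg_pop_data(msg, 0, 16, 0))
 *		return SK_DROP;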
*/ while (pop) { struct scatterlist *sge = sk_msg_elem(msg, i); if (pop < sge->length) { sge->length -= pop; sge->offset += pop; pop = 0; } else { pop -= sge->length; sk_msg_shift_left(msg, i); } } sk_mem_uncharge(msg->sk, len - pop); msg->sg.size -= (len - pop); sk_msg_reset_curr(msg); sk_msg_compute_data_pointers(msg); return 0; } static const struct bpf_func_proto bpf_msg_pop_data_proto = { .func = bpf_msg_pop_data, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_ANYTHING, .arg4_type = ARG_ANYTHING, }; #ifdef CONFIG_CGROUP_NET_CLASSID BPF_CALL_0(bpf_get_cgroup_classid_curr) { return __task_get_classid(current); } const struct bpf_func_proto bpf_get_cgroup_classid_curr_proto = { .func = bpf_get_cgroup_classid_curr, .gpl_only = false, .ret_type = RET_INTEGER, }; BPF_CALL_1(bpf_skb_cgroup_classid, const struct sk_buff *, skb) { struct sock *sk = skb_to_full_sk(skb); if (!sk || !sk_fullsock(sk)) return 0; return sock_cgroup_classid(&sk->sk_cgrp_data); } static const struct bpf_func_proto bpf_skb_cgroup_classid_proto = { .func = bpf_skb_cgroup_classid, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, }; #endif BPF_CALL_1(bpf_get_cgroup_classid, const struct sk_buff *, skb) { return task_get_classid(skb); } static const struct bpf_func_proto bpf_get_cgroup_classid_proto = { .func = bpf_get_cgroup_classid, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, }; BPF_CALL_1(bpf_get_route_realm, const struct sk_buff *, skb) { return dst_tclassid(skb); } static const struct bpf_func_proto bpf_get_route_realm_proto = { .func = bpf_get_route_realm, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, }; BPF_CALL_1(bpf_get_hash_recalc, struct sk_buff *, skb) { /* If skb_clear_hash() was called due to mangling, we can * trigger SW recalculation here. Later access to hash * can then use the inline skb->hash via context directly * instead of calling this helper again. */ return skb_get_hash(skb); } static const struct bpf_func_proto bpf_get_hash_recalc_proto = { .func = bpf_get_hash_recalc, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, }; BPF_CALL_1(bpf_set_hash_invalid, struct sk_buff *, skb) { /* After all direct packet write, this can be used once for * triggering a lazy recalc on next skb_get_hash() invocation. */ skb_clear_hash(skb); return 0; } static const struct bpf_func_proto bpf_set_hash_invalid_proto = { .func = bpf_set_hash_invalid, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, }; BPF_CALL_2(bpf_set_hash, struct sk_buff *, skb, u32, hash) { /* Set user specified hash as L4(+), so that it gets returned * on skb_get_hash() call unless BPF prog later on triggers a * skb_clear_hash(). 
*/ __skb_set_sw_hash(skb, hash, true); return 0; } static const struct bpf_func_proto bpf_set_hash_proto = { .func = bpf_set_hash, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, }; BPF_CALL_3(bpf_skb_vlan_push, struct sk_buff *, skb, __be16, vlan_proto, u16, vlan_tci) { int ret; if (unlikely(vlan_proto != htons(ETH_P_8021Q) && vlan_proto != htons(ETH_P_8021AD))) vlan_proto = htons(ETH_P_8021Q); bpf_push_mac_rcsum(skb); ret = skb_vlan_push(skb, vlan_proto, vlan_tci); bpf_pull_mac_rcsum(skb); skb_reset_mac_len(skb); bpf_compute_data_pointers(skb); return ret; } static const struct bpf_func_proto bpf_skb_vlan_push_proto = { .func = bpf_skb_vlan_push, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_ANYTHING, }; BPF_CALL_1(bpf_skb_vlan_pop, struct sk_buff *, skb) { int ret; bpf_push_mac_rcsum(skb); ret = skb_vlan_pop(skb); bpf_pull_mac_rcsum(skb); bpf_compute_data_pointers(skb); return ret; } static const struct bpf_func_proto bpf_skb_vlan_pop_proto = { .func = bpf_skb_vlan_pop, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, }; static void bpf_skb_change_protocol(struct sk_buff *skb, u16 proto) { skb->protocol = htons(proto); if (skb_valid_dst(skb)) skb_dst_drop(skb); } static int bpf_skb_generic_push(struct sk_buff *skb, u32 off, u32 len) { /* Caller already did skb_cow() with meta_len+len as headroom, * so no need to do it here. */ skb_push(skb, len); skb_postpush_data_move(skb, len, off); memset(skb->data + off, 0, len); /* No skb_postpush_rcsum(skb, skb->data + off, len) * needed here as it does not change the skb->csum * result for checksum complete when summing over * zeroed blocks. */ return 0; } static int bpf_skb_generic_pop(struct sk_buff *skb, u32 off, u32 len) { void *old_data; /* skb_ensure_writable() is not needed here, as we're * already working on an uncloned skb. */ if (unlikely(!pskb_may_pull(skb, off + len))) return -ENOMEM; old_data = skb->data; __skb_pull(skb, len); skb_postpull_rcsum(skb, old_data + off, len); skb_postpull_data_move(skb, len, off); return 0; } static int bpf_skb_net_hdr_push(struct sk_buff *skb, u32 off, u32 len) { bool trans_same = skb->transport_header == skb->network_header; int ret; /* There's no need for __skb_push()/__skb_pull() pair to * get to the start of the mac header as we're guaranteed * to always start from here under eBPF. */ ret = bpf_skb_generic_push(skb, off, len); if (likely(!ret)) { skb->mac_header -= len; skb->network_header -= len; if (trans_same) skb->transport_header = skb->network_header; } return ret; } static int bpf_skb_net_hdr_pop(struct sk_buff *skb, u32 off, u32 len) { bool trans_same = skb->transport_header == skb->network_header; int ret; /* Same here, __skb_push()/__skb_pull() pair not needed. */ ret = bpf_skb_generic_pop(skb, off, len); if (likely(!ret)) { skb->mac_header += len; skb->network_header += len; if (trans_same) skb->transport_header = skb->network_header; } return ret; } static int bpf_skb_proto_4_to_6(struct sk_buff *skb) { const u32 len_diff = sizeof(struct ipv6hdr) - sizeof(struct iphdr); const u8 meta_len = skb_metadata_len(skb); u32 off = skb_mac_header_len(skb); int ret; ret = skb_cow(skb, meta_len + len_diff); if (unlikely(ret < 0)) return ret; ret = bpf_skb_net_hdr_push(skb, off, len_diff); if (unlikely(ret < 0)) return ret; if (skb_is_gso(skb)) { struct skb_shared_info *shinfo = skb_shinfo(skb); /* SKB_GSO_TCPV4 needs to be changed into SKB_GSO_TCPV6. 
*/ if (shinfo->gso_type & SKB_GSO_TCPV4) { shinfo->gso_type &= ~SKB_GSO_TCPV4; shinfo->gso_type |= SKB_GSO_TCPV6; } } bpf_skb_change_protocol(skb, ETH_P_IPV6); skb_clear_hash(skb); return 0; } static int bpf_skb_proto_6_to_4(struct sk_buff *skb) { const u32 len_diff = sizeof(struct ipv6hdr) - sizeof(struct iphdr); u32 off = skb_mac_header_len(skb); int ret; ret = skb_unclone(skb, GFP_ATOMIC); if (unlikely(ret < 0)) return ret; ret = bpf_skb_net_hdr_pop(skb, off, len_diff); if (unlikely(ret < 0)) return ret; if (skb_is_gso(skb)) { struct skb_shared_info *shinfo = skb_shinfo(skb); /* SKB_GSO_TCPV6 needs to be changed into SKB_GSO_TCPV4. */ if (shinfo->gso_type & SKB_GSO_TCPV6) { shinfo->gso_type &= ~SKB_GSO_TCPV6; shinfo->gso_type |= SKB_GSO_TCPV4; } } bpf_skb_change_protocol(skb, ETH_P_IP); skb_clear_hash(skb); return 0; } static int bpf_skb_proto_xlat(struct sk_buff *skb, __be16 to_proto) { __be16 from_proto = skb->protocol; if (from_proto == htons(ETH_P_IP) && to_proto == htons(ETH_P_IPV6)) return bpf_skb_proto_4_to_6(skb); if (from_proto == htons(ETH_P_IPV6) && to_proto == htons(ETH_P_IP)) return bpf_skb_proto_6_to_4(skb); return -ENOTSUPP; } BPF_CALL_3(bpf_skb_change_proto, struct sk_buff *, skb, __be16, proto, u64, flags) { int ret; if (unlikely(flags)) return -EINVAL; /* General idea is that this helper does the basic groundwork * needed for changing the protocol, and eBPF program fills the * rest through bpf_skb_store_bytes(), bpf_lX_csum_replace() * and other helpers, rather than passing a raw buffer here. * * The rationale is to keep this minimal and without a need to * deal with raw packet data. F.e. even if we would pass buffers * here, the program still needs to call the bpf_lX_csum_replace() * helpers anyway. Plus, this way we keep also separation of * concerns, since f.e. bpf_skb_store_bytes() should only take * care of stores. * * Currently, additional options and extension header space are * not supported, but flags register is reserved so we can adapt * that. For offloads, we mark packet as dodgy, so that headers * need to be verified first. */ ret = bpf_skb_proto_xlat(skb, proto); bpf_compute_data_pointers(skb); return ret; } static const struct bpf_func_proto bpf_skb_change_proto_proto = { .func = bpf_skb_change_proto, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_ANYTHING, }; BPF_CALL_2(bpf_skb_change_type, struct sk_buff *, skb, u32, pkt_type) { /* We only allow a restricted subset to be changed for now. 
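 *
 * As for bpf_skb_change_proto() above, a NAT64-style program would be
 * expected to follow roughly this sequence (illustrative sketch, not
 * part of this file; ip6h is a locally built IPv6 header and ETH_HLEN
 * assumes an Ethernet device), then fix up the L4 checksum with
 * bpf_l4_csum_replace():
 *
 *	bpf_skb_change_proto(skb, htons(ETH_P_IPV6), 0);
 *	bpf_skb_store_bytes(skb, ETH_HLEN, &ip6h, sizeof(ip6h), 0);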
*/ if (unlikely(!skb_pkt_type_ok(skb->pkt_type) || !skb_pkt_type_ok(pkt_type))) return -EINVAL; skb->pkt_type = pkt_type; return 0; } static const struct bpf_func_proto bpf_skb_change_type_proto = { .func = bpf_skb_change_type, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, }; static u32 bpf_skb_net_base_len(const struct sk_buff *skb) { switch (skb->protocol) { case htons(ETH_P_IP): return sizeof(struct iphdr); case htons(ETH_P_IPV6): return sizeof(struct ipv6hdr); default: return ~0U; } } #define BPF_F_ADJ_ROOM_ENCAP_L3_MASK (BPF_F_ADJ_ROOM_ENCAP_L3_IPV4 | \ BPF_F_ADJ_ROOM_ENCAP_L3_IPV6) #define BPF_F_ADJ_ROOM_DECAP_L3_MASK (BPF_F_ADJ_ROOM_DECAP_L3_IPV4 | \ BPF_F_ADJ_ROOM_DECAP_L3_IPV6) #define BPF_F_ADJ_ROOM_MASK (BPF_F_ADJ_ROOM_FIXED_GSO | \ BPF_F_ADJ_ROOM_ENCAP_L3_MASK | \ BPF_F_ADJ_ROOM_ENCAP_L4_GRE | \ BPF_F_ADJ_ROOM_ENCAP_L4_UDP | \ BPF_F_ADJ_ROOM_ENCAP_L2_ETH | \ BPF_F_ADJ_ROOM_ENCAP_L2( \ BPF_ADJ_ROOM_ENCAP_L2_MASK) | \ BPF_F_ADJ_ROOM_DECAP_L3_MASK) static int bpf_skb_net_grow(struct sk_buff *skb, u32 off, u32 len_diff, u64 flags) { u8 inner_mac_len = flags >> BPF_ADJ_ROOM_ENCAP_L2_SHIFT; bool encap = flags & BPF_F_ADJ_ROOM_ENCAP_L3_MASK; u16 mac_len = 0, inner_net = 0, inner_trans = 0; const u8 meta_len = skb_metadata_len(skb); unsigned int gso_type = SKB_GSO_DODGY; int ret; if (skb_is_gso(skb) && !skb_is_gso_tcp(skb)) { /* udp gso_size delineates datagrams, only allow if fixed */ if (!(skb_shinfo(skb)->gso_type & SKB_GSO_UDP_L4) || !(flags & BPF_F_ADJ_ROOM_FIXED_GSO)) return -ENOTSUPP; } ret = skb_cow_head(skb, meta_len + len_diff); if (unlikely(ret < 0)) return ret; if (encap) { if (skb->protocol != htons(ETH_P_IP) && skb->protocol != htons(ETH_P_IPV6)) return -ENOTSUPP; if (flags & BPF_F_ADJ_ROOM_ENCAP_L3_IPV4 && flags & BPF_F_ADJ_ROOM_ENCAP_L3_IPV6) return -EINVAL; if (flags & BPF_F_ADJ_ROOM_ENCAP_L4_GRE && flags & BPF_F_ADJ_ROOM_ENCAP_L4_UDP) return -EINVAL; if (flags & BPF_F_ADJ_ROOM_ENCAP_L2_ETH && inner_mac_len < ETH_HLEN) return -EINVAL; if (skb->encapsulation) return -EALREADY; mac_len = skb->network_header - skb->mac_header; inner_net = skb->network_header; if (inner_mac_len > len_diff) return -EINVAL; inner_trans = skb->transport_header; } ret = bpf_skb_net_hdr_push(skb, off, len_diff); if (unlikely(ret < 0)) return ret; if (encap) { skb->inner_mac_header = inner_net - inner_mac_len; skb->inner_network_header = inner_net; skb->inner_transport_header = inner_trans; if (flags & BPF_F_ADJ_ROOM_ENCAP_L2_ETH) skb_set_inner_protocol(skb, htons(ETH_P_TEB)); else skb_set_inner_protocol(skb, skb->protocol); skb->encapsulation = 1; skb_set_network_header(skb, mac_len); if (flags & BPF_F_ADJ_ROOM_ENCAP_L4_UDP) gso_type |= SKB_GSO_UDP_TUNNEL; else if (flags & BPF_F_ADJ_ROOM_ENCAP_L4_GRE) gso_type |= SKB_GSO_GRE; else if (flags & BPF_F_ADJ_ROOM_ENCAP_L3_IPV6) gso_type |= SKB_GSO_IPXIP6; else if (flags & BPF_F_ADJ_ROOM_ENCAP_L3_IPV4) gso_type |= SKB_GSO_IPXIP4; if (flags & BPF_F_ADJ_ROOM_ENCAP_L4_GRE || flags & BPF_F_ADJ_ROOM_ENCAP_L4_UDP) { int nh_len = flags & BPF_F_ADJ_ROOM_ENCAP_L3_IPV6 ? 
sizeof(struct ipv6hdr) : sizeof(struct iphdr); skb_set_transport_header(skb, mac_len + nh_len); } /* Match skb->protocol to new outer l3 protocol */ if (skb->protocol == htons(ETH_P_IP) && flags & BPF_F_ADJ_ROOM_ENCAP_L3_IPV6) bpf_skb_change_protocol(skb, ETH_P_IPV6); else if (skb->protocol == htons(ETH_P_IPV6) && flags & BPF_F_ADJ_ROOM_ENCAP_L3_IPV4) bpf_skb_change_protocol(skb, ETH_P_IP); } if (skb_is_gso(skb)) { struct skb_shared_info *shinfo = skb_shinfo(skb); /* Header must be checked, and gso_segs recomputed. */ shinfo->gso_type |= gso_type; shinfo->gso_segs = 0; /* Due to header growth, MSS needs to be downgraded. * There is a BUG_ON() when segmenting the frag_list with * head_frag true, so linearize the skb after downgrading * the MSS. */ if (!(flags & BPF_F_ADJ_ROOM_FIXED_GSO)) { skb_decrease_gso_size(shinfo, len_diff); if (shinfo->frag_list) return skb_linearize(skb); } } return 0; } static int bpf_skb_net_shrink(struct sk_buff *skb, u32 off, u32 len_diff, u64 flags) { int ret; if (unlikely(flags & ~(BPF_F_ADJ_ROOM_FIXED_GSO | BPF_F_ADJ_ROOM_DECAP_L3_MASK | BPF_F_ADJ_ROOM_NO_CSUM_RESET))) return -EINVAL; if (skb_is_gso(skb) && !skb_is_gso_tcp(skb)) { /* udp gso_size delineates datagrams, only allow if fixed */ if (!(skb_shinfo(skb)->gso_type & SKB_GSO_UDP_L4) || !(flags & BPF_F_ADJ_ROOM_FIXED_GSO)) return -ENOTSUPP; } ret = skb_unclone(skb, GFP_ATOMIC); if (unlikely(ret < 0)) return ret; ret = bpf_skb_net_hdr_pop(skb, off, len_diff); if (unlikely(ret < 0)) return ret; /* Match skb->protocol to new outer l3 protocol */ if (skb->protocol == htons(ETH_P_IP) && flags & BPF_F_ADJ_ROOM_DECAP_L3_IPV6) bpf_skb_change_protocol(skb, ETH_P_IPV6); else if (skb->protocol == htons(ETH_P_IPV6) && flags & BPF_F_ADJ_ROOM_DECAP_L3_IPV4) bpf_skb_change_protocol(skb, ETH_P_IP); if (skb_is_gso(skb)) { struct skb_shared_info *shinfo = skb_shinfo(skb); /* Due to header shrink, MSS can be upgraded. */ if (!(flags & BPF_F_ADJ_ROOM_FIXED_GSO)) skb_increase_gso_size(shinfo, len_diff); /* Header must be checked, and gso_segs recomputed. 
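 *
 * For reference, a typical grow-side call into bpf_skb_adjust_room()
 * from a tc program adding an outer IPv4 + basic GRE header (4 bytes)
 * looks roughly like this (illustrative sketch, not part of this file):
 *
 *	bpf_skb_adjust_room(skb, sizeof(struct iphdr) + 4, BPF_ADJ_ROOM_MAC,
 *			    BPF_F_ADJ_ROOM_ENCAP_L3_IPV4 |
 *			    BPF_F_ADJ_ROOM_ENCAP_L4_GRE);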
*/ shinfo->gso_type |= SKB_GSO_DODGY; shinfo->gso_segs = 0; } return 0; } #define BPF_SKB_MAX_LEN SKB_MAX_ALLOC BPF_CALL_4(sk_skb_adjust_room, struct sk_buff *, skb, s32, len_diff, u32, mode, u64, flags) { u32 len_diff_abs = abs(len_diff); bool shrink = len_diff < 0; int ret = 0; if (unlikely(flags || mode)) return -EINVAL; if (unlikely(len_diff_abs > 0xfffU)) return -EFAULT; if (!shrink) { ret = skb_cow(skb, len_diff); if (unlikely(ret < 0)) return ret; __skb_push(skb, len_diff_abs); memset(skb->data, 0, len_diff_abs); } else { if (unlikely(!pskb_may_pull(skb, len_diff_abs))) return -ENOMEM; __skb_pull(skb, len_diff_abs); } if (tls_sw_has_ctx_rx(skb->sk)) { struct strp_msg *rxm = strp_msg(skb); rxm->full_len += len_diff; } return ret; } static const struct bpf_func_proto sk_skb_adjust_room_proto = { .func = sk_skb_adjust_room, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_ANYTHING, .arg4_type = ARG_ANYTHING, }; BPF_CALL_4(bpf_skb_adjust_room, struct sk_buff *, skb, s32, len_diff, u32, mode, u64, flags) { u32 len_cur, len_diff_abs = abs(len_diff); u32 len_min = bpf_skb_net_base_len(skb); u32 len_max = BPF_SKB_MAX_LEN; __be16 proto = skb->protocol; bool shrink = len_diff < 0; u32 off; int ret; if (unlikely(flags & ~(BPF_F_ADJ_ROOM_MASK | BPF_F_ADJ_ROOM_NO_CSUM_RESET))) return -EINVAL; if (unlikely(len_diff_abs > 0xfffU)) return -EFAULT; if (unlikely(proto != htons(ETH_P_IP) && proto != htons(ETH_P_IPV6))) return -ENOTSUPP; off = skb_mac_header_len(skb); switch (mode) { case BPF_ADJ_ROOM_NET: off += bpf_skb_net_base_len(skb); break; case BPF_ADJ_ROOM_MAC: break; default: return -ENOTSUPP; } if (flags & BPF_F_ADJ_ROOM_DECAP_L3_MASK) { if (!shrink) return -EINVAL; switch (flags & BPF_F_ADJ_ROOM_DECAP_L3_MASK) { case BPF_F_ADJ_ROOM_DECAP_L3_IPV4: len_min = sizeof(struct iphdr); break; case BPF_F_ADJ_ROOM_DECAP_L3_IPV6: len_min = sizeof(struct ipv6hdr); break; default: return -EINVAL; } } len_cur = skb->len - skb_network_offset(skb); if ((shrink && (len_diff_abs >= len_cur || len_cur - len_diff_abs < len_min)) || (!shrink && (skb->len + len_diff_abs > len_max && !skb_is_gso(skb)))) return -ENOTSUPP; ret = shrink ? 
bpf_skb_net_shrink(skb, off, len_diff_abs, flags) : bpf_skb_net_grow(skb, off, len_diff_abs, flags); if (!ret && !(flags & BPF_F_ADJ_ROOM_NO_CSUM_RESET)) __skb_reset_checksum_unnecessary(skb); bpf_compute_data_pointers(skb); return ret; } static const struct bpf_func_proto bpf_skb_adjust_room_proto = { .func = bpf_skb_adjust_room, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_ANYTHING, .arg4_type = ARG_ANYTHING, }; static u32 __bpf_skb_min_len(const struct sk_buff *skb) { int offset = skb_network_offset(skb); u32 min_len = 0; if (offset > 0) min_len = offset; if (skb_transport_header_was_set(skb)) { offset = skb_transport_offset(skb); if (offset > 0) min_len = offset; } if (skb->ip_summed == CHECKSUM_PARTIAL) { offset = skb_checksum_start_offset(skb) + skb->csum_offset + sizeof(__sum16); if (offset > 0) min_len = offset; } return min_len; } static int bpf_skb_grow_rcsum(struct sk_buff *skb, unsigned int new_len) { unsigned int old_len = skb->len; int ret; ret = __skb_grow_rcsum(skb, new_len); if (!ret) memset(skb->data + old_len, 0, new_len - old_len); return ret; } static int bpf_skb_trim_rcsum(struct sk_buff *skb, unsigned int new_len) { return __skb_trim_rcsum(skb, new_len); } static inline int __bpf_skb_change_tail(struct sk_buff *skb, u32 new_len, u64 flags) { u32 max_len = BPF_SKB_MAX_LEN; u32 min_len = __bpf_skb_min_len(skb); int ret; if (unlikely(flags || new_len > max_len || new_len < min_len)) return -EINVAL; if (skb->encapsulation) return -ENOTSUPP; /* The basic idea of this helper is that it's performing the * needed work to either grow or trim an skb, and eBPF program * rewrites the rest via helpers like bpf_skb_store_bytes(), * bpf_lX_csum_replace() and others rather than passing a raw * buffer here. This one is a slow path helper and intended * for replies with control messages. * * Like in bpf_skb_change_proto(), we want to keep this rather * minimal and without protocol specifics so that we are able * to separate concerns as in bpf_skb_store_bytes() should only * be the one responsible for writing buffers. * * It's really expected to be a slow path operation here for * control message replies, so we're implicitly linearizing, * uncloning and drop offloads from the skb by this. 
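 *
 * Usage sketch (illustrative, not part of this file): trimming the
 * payload before reflecting a packet as a control message reply, where
 * reply_len is a length computed by the program:
 *
 *	bpf_skb_change_tail(skb, reply_len, 0);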
*/ ret = __bpf_try_make_writable(skb, skb->len); if (!ret) { if (new_len > skb->len) ret = bpf_skb_grow_rcsum(skb, new_len); else if (new_len < skb->len) ret = bpf_skb_trim_rcsum(skb, new_len); if (!ret && skb_is_gso(skb)) skb_gso_reset(skb); } return ret; } BPF_CALL_3(bpf_skb_change_tail, struct sk_buff *, skb, u32, new_len, u64, flags) { int ret = __bpf_skb_change_tail(skb, new_len, flags); bpf_compute_data_pointers(skb); return ret; } static const struct bpf_func_proto bpf_skb_change_tail_proto = { .func = bpf_skb_change_tail, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_ANYTHING, }; BPF_CALL_3(sk_skb_change_tail, struct sk_buff *, skb, u32, new_len, u64, flags) { return __bpf_skb_change_tail(skb, new_len, flags); } static const struct bpf_func_proto sk_skb_change_tail_proto = { .func = sk_skb_change_tail, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_ANYTHING, }; static inline int __bpf_skb_change_head(struct sk_buff *skb, u32 head_room, u64 flags) { const u8 meta_len = skb_metadata_len(skb); u32 max_len = BPF_SKB_MAX_LEN; u32 new_len = skb->len + head_room; int ret; if (unlikely(flags || (int)head_room < 0 || (!skb_is_gso(skb) && new_len > max_len) || new_len < skb->len)) return -EINVAL; ret = skb_cow(skb, meta_len + head_room); if (likely(!ret)) { /* Idea for this helper is that we currently only * allow to expand on mac header. This means that * skb->protocol network header, etc, stay as is. * Compared to bpf_skb_change_tail(), we're more * flexible due to not needing to linearize or * reset GSO. Intention for this helper is to be * used by an L3 skb that needs to push mac header * for redirection into L2 device. */ __skb_push(skb, head_room); skb_postpush_data_move(skb, head_room, 0); memset(skb->data, 0, head_room); skb_reset_mac_header(skb); skb_reset_mac_len(skb); } return ret; } BPF_CALL_3(bpf_skb_change_head, struct sk_buff *, skb, u32, head_room, u64, flags) { int ret = __bpf_skb_change_head(skb, head_room, flags); bpf_compute_data_pointers(skb); return ret; } static const struct bpf_func_proto bpf_skb_change_head_proto = { .func = bpf_skb_change_head, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_ANYTHING, }; BPF_CALL_3(sk_skb_change_head, struct sk_buff *, skb, u32, head_room, u64, flags) { return __bpf_skb_change_head(skb, head_room, flags); } static const struct bpf_func_proto sk_skb_change_head_proto = { .func = sk_skb_change_head, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_ANYTHING, }; BPF_CALL_1(bpf_xdp_get_buff_len, struct xdp_buff*, xdp) { return xdp_get_buff_len(xdp); } static const struct bpf_func_proto bpf_xdp_get_buff_len_proto = { .func = bpf_xdp_get_buff_len, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, }; BTF_ID_LIST_SINGLE(bpf_xdp_get_buff_len_bpf_ids, struct, xdp_buff) const struct bpf_func_proto bpf_xdp_get_buff_len_trace_proto = { .func = bpf_xdp_get_buff_len, .gpl_only = false, .arg1_type = ARG_PTR_TO_BTF_ID, .arg1_btf_id = &bpf_xdp_get_buff_len_bpf_ids[0], }; static unsigned long xdp_get_metalen(const struct xdp_buff *xdp) { return xdp_data_meta_unsupported(xdp) ? 
0 : xdp->data - xdp->data_meta; } BPF_CALL_2(bpf_xdp_adjust_head, struct xdp_buff *, xdp, int, offset) { void *xdp_frame_end = xdp->data_hard_start + sizeof(struct xdp_frame); unsigned long metalen = xdp_get_metalen(xdp); void *data_start = xdp_frame_end + metalen; void *data = xdp->data + offset; if (unlikely(data < data_start || data > xdp->data_end - ETH_HLEN)) return -EINVAL; if (metalen) memmove(xdp->data_meta + offset, xdp->data_meta, metalen); xdp->data_meta += offset; xdp->data = data; return 0; } static const struct bpf_func_proto bpf_xdp_adjust_head_proto = { .func = bpf_xdp_adjust_head, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, }; void bpf_xdp_copy_buf(struct xdp_buff *xdp, unsigned long off, void *buf, unsigned long len, bool flush) { unsigned long ptr_len, ptr_off = 0; skb_frag_t *next_frag, *end_frag; struct skb_shared_info *sinfo; void *src, *dst; u8 *ptr_buf; if (likely(xdp->data_end - xdp->data >= off + len)) { src = flush ? buf : xdp->data + off; dst = flush ? xdp->data + off : buf; memcpy(dst, src, len); return; } sinfo = xdp_get_shared_info_from_buff(xdp); end_frag = &sinfo->frags[sinfo->nr_frags]; next_frag = &sinfo->frags[0]; ptr_len = xdp->data_end - xdp->data; ptr_buf = xdp->data; while (true) { if (off < ptr_off + ptr_len) { unsigned long copy_off = off - ptr_off; unsigned long copy_len = min(len, ptr_len - copy_off); src = flush ? buf : ptr_buf + copy_off; dst = flush ? ptr_buf + copy_off : buf; memcpy(dst, src, copy_len); off += copy_len; len -= copy_len; buf += copy_len; } if (!len || next_frag == end_frag) break; ptr_off += ptr_len; ptr_buf = skb_frag_address(next_frag); ptr_len = skb_frag_size(next_frag); next_frag++; } } void *bpf_xdp_pointer(struct xdp_buff *xdp, u32 offset, u32 len) { u32 size = xdp->data_end - xdp->data; struct skb_shared_info *sinfo; void *addr = xdp->data; int i; if (unlikely(offset > 0xffff || len > 0xffff)) return ERR_PTR(-EFAULT); if (unlikely(offset + len > xdp_get_buff_len(xdp))) return ERR_PTR(-EINVAL); if (likely(offset < size)) /* linear area */ goto out; sinfo = xdp_get_shared_info_from_buff(xdp); offset -= size; for (i = 0; i < sinfo->nr_frags; i++) { /* paged area */ u32 frag_size = skb_frag_size(&sinfo->frags[i]); if (offset < frag_size) { addr = skb_frag_address(&sinfo->frags[i]); size = frag_size; break; } offset -= frag_size; } out: return offset + len <= size ? 
addr + offset : NULL; } BPF_CALL_4(bpf_xdp_load_bytes, struct xdp_buff *, xdp, u32, offset, void *, buf, u32, len) { void *ptr; ptr = bpf_xdp_pointer(xdp, offset, len); if (IS_ERR(ptr)) return PTR_ERR(ptr); if (!ptr) bpf_xdp_copy_buf(xdp, offset, buf, len, false); else memcpy(buf, ptr, len); return 0; } static const struct bpf_func_proto bpf_xdp_load_bytes_proto = { .func = bpf_xdp_load_bytes, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_PTR_TO_UNINIT_MEM, .arg4_type = ARG_CONST_SIZE, }; int __bpf_xdp_load_bytes(struct xdp_buff *xdp, u32 offset, void *buf, u32 len) { return ____bpf_xdp_load_bytes(xdp, offset, buf, len); } BPF_CALL_4(bpf_xdp_store_bytes, struct xdp_buff *, xdp, u32, offset, void *, buf, u32, len) { void *ptr; ptr = bpf_xdp_pointer(xdp, offset, len); if (IS_ERR(ptr)) return PTR_ERR(ptr); if (!ptr) bpf_xdp_copy_buf(xdp, offset, buf, len, true); else memcpy(ptr, buf, len); return 0; } static const struct bpf_func_proto bpf_xdp_store_bytes_proto = { .func = bpf_xdp_store_bytes, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_PTR_TO_UNINIT_MEM, .arg4_type = ARG_CONST_SIZE, }; int __bpf_xdp_store_bytes(struct xdp_buff *xdp, u32 offset, void *buf, u32 len) { return ____bpf_xdp_store_bytes(xdp, offset, buf, len); } static int bpf_xdp_frags_increase_tail(struct xdp_buff *xdp, int offset) { struct skb_shared_info *sinfo = xdp_get_shared_info_from_buff(xdp); skb_frag_t *frag = &sinfo->frags[sinfo->nr_frags - 1]; struct xdp_rxq_info *rxq = xdp->rxq; unsigned int tailroom; if (!rxq->frag_size || rxq->frag_size > xdp->frame_sz) return -EOPNOTSUPP; tailroom = rxq->frag_size - skb_frag_size(frag) - skb_frag_off(frag); if (unlikely(offset > tailroom)) return -EINVAL; memset(skb_frag_address(frag) + skb_frag_size(frag), 0, offset); skb_frag_size_add(frag, offset); sinfo->xdp_frags_size += offset; if (rxq->mem.type == MEM_TYPE_XSK_BUFF_POOL) xsk_buff_get_tail(xdp)->data_end += offset; return 0; } static struct xdp_buff *bpf_xdp_shrink_data_zc(struct xdp_buff *xdp, int shrink, bool tail, bool release) { struct xdp_buff *zc_frag = tail ? 
xsk_buff_get_tail(xdp) : xsk_buff_get_head(xdp); if (release) { xsk_buff_del_frag(zc_frag); } else { if (tail) zc_frag->data_end -= shrink; else zc_frag->data += shrink; } return zc_frag; } static bool bpf_xdp_shrink_data(struct xdp_buff *xdp, skb_frag_t *frag, int shrink, bool tail) { enum xdp_mem_type mem_type = xdp->rxq->mem.type; bool release = skb_frag_size(frag) == shrink; netmem_ref netmem = skb_frag_netmem(frag); struct xdp_buff *zc_frag = NULL; if (mem_type == MEM_TYPE_XSK_BUFF_POOL) { netmem = 0; zc_frag = bpf_xdp_shrink_data_zc(xdp, shrink, tail, release); } if (release) { __xdp_return(netmem, mem_type, false, zc_frag); } else { if (!tail) skb_frag_off_add(frag, shrink); skb_frag_size_sub(frag, shrink); } return release; } static int bpf_xdp_frags_shrink_tail(struct xdp_buff *xdp, int offset) { struct skb_shared_info *sinfo = xdp_get_shared_info_from_buff(xdp); int i, n_frags_free = 0, len_free = 0; if (unlikely(offset > (int)xdp_get_buff_len(xdp) - ETH_HLEN)) return -EINVAL; for (i = sinfo->nr_frags - 1; i >= 0 && offset > 0; i--) { skb_frag_t *frag = &sinfo->frags[i]; int shrink = min_t(int, offset, skb_frag_size(frag)); len_free += shrink; offset -= shrink; if (bpf_xdp_shrink_data(xdp, frag, shrink, true)) n_frags_free++; } sinfo->nr_frags -= n_frags_free; sinfo->xdp_frags_size -= len_free; if (unlikely(!sinfo->nr_frags)) { xdp_buff_clear_frags_flag(xdp); xdp_buff_clear_frag_pfmemalloc(xdp); xdp->data_end -= offset; } return 0; } BPF_CALL_2(bpf_xdp_adjust_tail, struct xdp_buff *, xdp, int, offset) { void *data_hard_end = xdp_data_hard_end(xdp); /* use xdp->frame_sz */ void *data_end = xdp->data_end + offset; if (unlikely(xdp_buff_has_frags(xdp))) { /* non-linear xdp buff */ if (offset < 0) return bpf_xdp_frags_shrink_tail(xdp, -offset); return bpf_xdp_frags_increase_tail(xdp, offset); } /* Notice that xdp_data_hard_end have reserved some tailroom */ if (unlikely(data_end > data_hard_end)) return -EINVAL; if (unlikely(data_end < xdp->data + ETH_HLEN)) return -EINVAL; /* Clear memory area on grow, can contain uninit kernel memory */ if (offset > 0) memset(xdp->data_end, 0, offset); xdp->data_end = data_end; return 0; } static const struct bpf_func_proto bpf_xdp_adjust_tail_proto = { .func = bpf_xdp_adjust_tail, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, }; BPF_CALL_2(bpf_xdp_adjust_meta, struct xdp_buff *, xdp, int, offset) { void *xdp_frame_end = xdp->data_hard_start + sizeof(struct xdp_frame); void *meta = xdp->data_meta + offset; unsigned long metalen = xdp->data - meta; if (xdp_data_meta_unsupported(xdp)) return -ENOTSUPP; if (unlikely(meta < xdp_frame_end || meta > xdp->data)) return -EINVAL; if (unlikely(xdp_metalen_invalid(metalen))) return -EACCES; xdp->data_meta = meta; return 0; } static const struct bpf_func_proto bpf_xdp_adjust_meta_proto = { .func = bpf_xdp_adjust_meta, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, }; /** * DOC: xdp redirect * * XDP_REDIRECT works by a three-step process, implemented in the functions * below: * * 1. The bpf_redirect() and bpf_redirect_map() helpers will lookup the target * of the redirect and store it (along with some other metadata) in a per-CPU * struct bpf_redirect_info. * * 2. When the program returns the XDP_REDIRECT return code, the driver will * call xdp_do_redirect() which will use the information in struct * bpf_redirect_info to actually enqueue the frame into a map type-specific * bulk queue structure. * * 3. 
Before exiting its NAPI poll loop, the driver will call * xdp_do_flush(), which will flush all the different bulk queues, * thus completing the redirect. Note that xdp_do_flush() must be * called before napi_complete_done() in the driver, as the * XDP_REDIRECT logic relies on being inside a single NAPI instance * through to the xdp_do_flush() call for RCU protection of all * in-kernel data structures. */ /* * Pointers to the map entries will be kept around for this whole sequence of * steps, protected by RCU. However, there is no top-level rcu_read_lock() in * the core code; instead, the RCU protection relies on everything happening * inside a single NAPI poll sequence, which means it's between a pair of calls * to local_bh_disable()/local_bh_enable(). * * The map entries are marked as __rcu and the map code makes sure to * dereference those pointers with rcu_dereference_check() in a way that works * for both sections that to hold an rcu_read_lock() and sections that are * called from NAPI without a separate rcu_read_lock(). The code below does not * use RCU annotations, but relies on those in the map code. */ void xdp_do_flush(void) { struct list_head *lh_map, *lh_dev, *lh_xsk; bpf_net_ctx_get_all_used_flush_lists(&lh_map, &lh_dev, &lh_xsk); if (lh_dev) __dev_flush(lh_dev); if (lh_map) __cpu_map_flush(lh_map); if (lh_xsk) __xsk_map_flush(lh_xsk); } EXPORT_SYMBOL_GPL(xdp_do_flush); #if defined(CONFIG_DEBUG_NET) && defined(CONFIG_BPF_SYSCALL) void xdp_do_check_flushed(struct napi_struct *napi) { struct list_head *lh_map, *lh_dev, *lh_xsk; bool missed = false; bpf_net_ctx_get_all_used_flush_lists(&lh_map, &lh_dev, &lh_xsk); if (lh_dev) { __dev_flush(lh_dev); missed = true; } if (lh_map) { __cpu_map_flush(lh_map); missed = true; } if (lh_xsk) { __xsk_map_flush(lh_xsk); missed = true; } WARN_ONCE(missed, "Missing xdp_do_flush() invocation after NAPI by %ps\n", napi->poll); } #endif DEFINE_STATIC_KEY_FALSE(bpf_master_redirect_enabled_key); EXPORT_SYMBOL_GPL(bpf_master_redirect_enabled_key); u32 xdp_master_redirect(struct xdp_buff *xdp) { struct bpf_redirect_info *ri = bpf_net_ctx_get_ri(); struct net_device *master, *slave; master = netdev_master_upper_dev_get_rcu(xdp->rxq->dev); slave = master->netdev_ops->ndo_xdp_get_xmit_slave(master, xdp); if (slave && slave != xdp->rxq->dev) { /* The target device is different from the receiving device, so * redirect it to the new device. * Using XDP_REDIRECT gets the correct behaviour from XDP enabled * drivers to unmap the packet from their rx ring. 
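 *
 * For the three-step XDP_REDIRECT flow documented above, the BPF side is
 * typically just a map-based redirect (illustrative sketch, not part of
 * this file; tx_port is a hypothetical BPF_MAP_TYPE_DEVMAP and key 0 an
 * arbitrary slot):
 *
 *	SEC("xdp")
 *	int xdp_redirect_prog(struct xdp_md *ctx)
 *	{
 *		return bpf_redirect_map(&tx_port, 0, 0);
 *	}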
*/ ri->tgt_index = slave->ifindex; ri->map_id = INT_MAX; ri->map_type = BPF_MAP_TYPE_UNSPEC; return XDP_REDIRECT; } return XDP_TX; } EXPORT_SYMBOL_GPL(xdp_master_redirect); static inline int __xdp_do_redirect_xsk(struct bpf_redirect_info *ri, const struct net_device *dev, struct xdp_buff *xdp, const struct bpf_prog *xdp_prog) { enum bpf_map_type map_type = ri->map_type; void *fwd = ri->tgt_value; u32 map_id = ri->map_id; int err; ri->map_id = 0; /* Valid map id idr range: [1,INT_MAX[ */ ri->map_type = BPF_MAP_TYPE_UNSPEC; err = __xsk_map_redirect(fwd, xdp); if (unlikely(err)) goto err; _trace_xdp_redirect_map(dev, xdp_prog, fwd, map_type, map_id, ri->tgt_index); return 0; err: _trace_xdp_redirect_map_err(dev, xdp_prog, fwd, map_type, map_id, ri->tgt_index, err); return err; } static __always_inline int __xdp_do_redirect_frame(struct bpf_redirect_info *ri, struct net_device *dev, struct xdp_frame *xdpf, const struct bpf_prog *xdp_prog) { enum bpf_map_type map_type = ri->map_type; void *fwd = ri->tgt_value; u32 map_id = ri->map_id; u32 flags = ri->flags; struct bpf_map *map; int err; ri->map_id = 0; /* Valid map id idr range: [1,INT_MAX[ */ ri->flags = 0; ri->map_type = BPF_MAP_TYPE_UNSPEC; if (unlikely(!xdpf)) { err = -EOVERFLOW; goto err; } switch (map_type) { case BPF_MAP_TYPE_DEVMAP: fallthrough; case BPF_MAP_TYPE_DEVMAP_HASH: if (unlikely(flags & BPF_F_BROADCAST)) { map = READ_ONCE(ri->map); /* The map pointer is cleared when the map is being torn * down by dev_map_free() */ if (unlikely(!map)) { err = -ENOENT; break; } WRITE_ONCE(ri->map, NULL); err = dev_map_enqueue_multi(xdpf, dev, map, flags & BPF_F_EXCLUDE_INGRESS); } else { err = dev_map_enqueue(fwd, xdpf, dev); } break; case BPF_MAP_TYPE_CPUMAP: err = cpu_map_enqueue(fwd, xdpf, dev); break; case BPF_MAP_TYPE_UNSPEC: if (map_id == INT_MAX) { fwd = dev_get_by_index_rcu(dev_net(dev), ri->tgt_index); if (unlikely(!fwd)) { err = -EINVAL; break; } err = dev_xdp_enqueue(fwd, xdpf, dev); break; } fallthrough; default: err = -EBADRQC; } if (unlikely(err)) goto err; _trace_xdp_redirect_map(dev, xdp_prog, fwd, map_type, map_id, ri->tgt_index); return 0; err: _trace_xdp_redirect_map_err(dev, xdp_prog, fwd, map_type, map_id, ri->tgt_index, err); return err; } int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp, const struct bpf_prog *xdp_prog) { struct bpf_redirect_info *ri = bpf_net_ctx_get_ri(); enum bpf_map_type map_type = ri->map_type; if (map_type == BPF_MAP_TYPE_XSKMAP) return __xdp_do_redirect_xsk(ri, dev, xdp, xdp_prog); return __xdp_do_redirect_frame(ri, dev, xdp_convert_buff_to_frame(xdp), xdp_prog); } EXPORT_SYMBOL_GPL(xdp_do_redirect); int xdp_do_redirect_frame(struct net_device *dev, struct xdp_buff *xdp, struct xdp_frame *xdpf, const struct bpf_prog *xdp_prog) { struct bpf_redirect_info *ri = bpf_net_ctx_get_ri(); enum bpf_map_type map_type = ri->map_type; if (map_type == BPF_MAP_TYPE_XSKMAP) return __xdp_do_redirect_xsk(ri, dev, xdp, xdp_prog); return __xdp_do_redirect_frame(ri, dev, xdpf, xdp_prog); } EXPORT_SYMBOL_GPL(xdp_do_redirect_frame); static int xdp_do_generic_redirect_map(struct net_device *dev, struct sk_buff *skb, struct xdp_buff *xdp, const struct bpf_prog *xdp_prog, void *fwd, enum bpf_map_type map_type, u32 map_id, u32 flags) { struct bpf_redirect_info *ri = bpf_net_ctx_get_ri(); struct bpf_map *map; int err; switch (map_type) { case BPF_MAP_TYPE_DEVMAP: fallthrough; case BPF_MAP_TYPE_DEVMAP_HASH: if (unlikely(flags & BPF_F_BROADCAST)) { map = READ_ONCE(ri->map); /* The map pointer is cleared when 
the map is being torn * down by dev_map_free() */ if (unlikely(!map)) { err = -ENOENT; break; } WRITE_ONCE(ri->map, NULL); err = dev_map_redirect_multi(dev, skb, xdp_prog, map, flags & BPF_F_EXCLUDE_INGRESS); } else { err = dev_map_generic_redirect(fwd, skb, xdp_prog); } if (unlikely(err)) goto err; break; case BPF_MAP_TYPE_XSKMAP: err = xsk_generic_rcv(fwd, xdp); if (err) goto err; consume_skb(skb); break; case BPF_MAP_TYPE_CPUMAP: err = cpu_map_generic_redirect(fwd, skb); if (unlikely(err)) goto err; break; default: err = -EBADRQC; goto err; } _trace_xdp_redirect_map(dev, xdp_prog, fwd, map_type, map_id, ri->tgt_index); return 0; err: _trace_xdp_redirect_map_err(dev, xdp_prog, fwd, map_type, map_id, ri->tgt_index, err); return err; } int xdp_do_generic_redirect(struct net_device *dev, struct sk_buff *skb, struct xdp_buff *xdp, const struct bpf_prog *xdp_prog) { struct bpf_redirect_info *ri = bpf_net_ctx_get_ri(); enum bpf_map_type map_type = ri->map_type; void *fwd = ri->tgt_value; u32 map_id = ri->map_id; u32 flags = ri->flags; int err; ri->map_id = 0; /* Valid map id idr range: [1,INT_MAX[ */ ri->flags = 0; ri->map_type = BPF_MAP_TYPE_UNSPEC; if (map_type == BPF_MAP_TYPE_UNSPEC && map_id == INT_MAX) { fwd = dev_get_by_index_rcu(dev_net(dev), ri->tgt_index); if (unlikely(!fwd)) { err = -EINVAL; goto err; } err = xdp_ok_fwd_dev(fwd, skb->len); if (unlikely(err)) goto err; skb->dev = fwd; _trace_xdp_redirect(dev, xdp_prog, ri->tgt_index); generic_xdp_tx(skb, xdp_prog); return 0; } return xdp_do_generic_redirect_map(dev, skb, xdp, xdp_prog, fwd, map_type, map_id, flags); err: _trace_xdp_redirect_err(dev, xdp_prog, ri->tgt_index, err); return err; } BPF_CALL_2(bpf_xdp_redirect, u32, ifindex, u64, flags) { struct bpf_redirect_info *ri = bpf_net_ctx_get_ri(); if (unlikely(flags)) return XDP_ABORTED; /* NB! Map type UNSPEC and map_id == INT_MAX (never generated * by map_idr) is used for ifindex based XDP redirect. 
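 * From the program side this path is reached through the bpf_redirect()
 * helper. For example (illustrative only), an XDP program that forwards
 * every frame out one fixed interface can simply do
 *
 *	return bpf_redirect(ifindex, 0);
 *
 * and xdp_do_redirect() will pick the recorded target up from here.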
*/ ri->tgt_index = ifindex; ri->map_id = INT_MAX; ri->map_type = BPF_MAP_TYPE_UNSPEC; return XDP_REDIRECT; } static const struct bpf_func_proto bpf_xdp_redirect_proto = { .func = bpf_xdp_redirect, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_ANYTHING, .arg2_type = ARG_ANYTHING, }; BPF_CALL_3(bpf_xdp_redirect_map, struct bpf_map *, map, u64, key, u64, flags) { return map->ops->map_redirect(map, key, flags); } static const struct bpf_func_proto bpf_xdp_redirect_map_proto = { .func = bpf_xdp_redirect_map, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_CONST_MAP_PTR, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_ANYTHING, }; static unsigned long bpf_skb_copy(void *dst_buff, const void *skb, unsigned long off, unsigned long len) { void *ptr = skb_header_pointer(skb, off, len, dst_buff); if (unlikely(!ptr)) return len; if (ptr != dst_buff) memcpy(dst_buff, ptr, len); return 0; } BPF_CALL_5(bpf_skb_event_output, struct sk_buff *, skb, struct bpf_map *, map, u64, flags, void *, meta, u64, meta_size) { u64 skb_size = (flags & BPF_F_CTXLEN_MASK) >> 32; if (unlikely(flags & ~(BPF_F_CTXLEN_MASK | BPF_F_INDEX_MASK))) return -EINVAL; if (unlikely(!skb || skb_size > skb->len)) return -EFAULT; return bpf_event_output(map, flags, meta, meta_size, skb, skb_size, bpf_skb_copy); } static const struct bpf_func_proto bpf_skb_event_output_proto = { .func = bpf_skb_event_output, .gpl_only = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_CONST_MAP_PTR, .arg3_type = ARG_ANYTHING, .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg5_type = ARG_CONST_SIZE_OR_ZERO, }; BTF_ID_LIST_SINGLE(bpf_skb_output_btf_ids, struct, sk_buff) const struct bpf_func_proto bpf_skb_output_proto = { .func = bpf_skb_event_output, .gpl_only = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_BTF_ID, .arg1_btf_id = &bpf_skb_output_btf_ids[0], .arg2_type = ARG_CONST_MAP_PTR, .arg3_type = ARG_ANYTHING, .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg5_type = ARG_CONST_SIZE_OR_ZERO, }; static unsigned short bpf_tunnel_key_af(u64 flags) { return flags & BPF_F_TUNINFO_IPV6 ? AF_INET6 : AF_INET; } BPF_CALL_4(bpf_skb_get_tunnel_key, struct sk_buff *, skb, struct bpf_tunnel_key *, to, u32, size, u64, flags) { const struct ip_tunnel_info *info = skb_tunnel_info(skb); u8 compat[sizeof(struct bpf_tunnel_key)]; void *to_orig = to; int err; if (unlikely(!info || (flags & ~(BPF_F_TUNINFO_IPV6 | BPF_F_TUNINFO_FLAGS)))) { err = -EINVAL; goto err_clear; } if (ip_tunnel_info_af(info) != bpf_tunnel_key_af(flags)) { err = -EPROTO; goto err_clear; } if (unlikely(size != sizeof(struct bpf_tunnel_key))) { err = -EINVAL; switch (size) { case offsetof(struct bpf_tunnel_key, local_ipv6[0]): case offsetof(struct bpf_tunnel_key, tunnel_label): case offsetof(struct bpf_tunnel_key, tunnel_ext): goto set_compat; case offsetof(struct bpf_tunnel_key, remote_ipv6[1]): /* Fixup deprecated structure layouts here, so we have * a common path later on. 
*/ if (ip_tunnel_info_af(info) != AF_INET) goto err_clear; set_compat: to = (struct bpf_tunnel_key *)compat; break; default: goto err_clear; } } to->tunnel_id = be64_to_cpu(info->key.tun_id); to->tunnel_tos = info->key.tos; to->tunnel_ttl = info->key.ttl; if (flags & BPF_F_TUNINFO_FLAGS) to->tunnel_flags = ip_tunnel_flags_to_be16(info->key.tun_flags); else to->tunnel_ext = 0; if (flags & BPF_F_TUNINFO_IPV6) { memcpy(to->remote_ipv6, &info->key.u.ipv6.src, sizeof(to->remote_ipv6)); memcpy(to->local_ipv6, &info->key.u.ipv6.dst, sizeof(to->local_ipv6)); to->tunnel_label = be32_to_cpu(info->key.label); } else { to->remote_ipv4 = be32_to_cpu(info->key.u.ipv4.src); memset(&to->remote_ipv6[1], 0, sizeof(__u32) * 3); to->local_ipv4 = be32_to_cpu(info->key.u.ipv4.dst); memset(&to->local_ipv6[1], 0, sizeof(__u32) * 3); to->tunnel_label = 0; } if (unlikely(size != sizeof(struct bpf_tunnel_key))) memcpy(to_orig, to, size); return 0; err_clear: memset(to_orig, 0, size); return err; } static const struct bpf_func_proto bpf_skb_get_tunnel_key_proto = { .func = bpf_skb_get_tunnel_key, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_PTR_TO_UNINIT_MEM, .arg3_type = ARG_CONST_SIZE, .arg4_type = ARG_ANYTHING, }; BPF_CALL_3(bpf_skb_get_tunnel_opt, struct sk_buff *, skb, u8 *, to, u32, size) { const struct ip_tunnel_info *info = skb_tunnel_info(skb); int err; if (unlikely(!info || !ip_tunnel_is_options_present(info->key.tun_flags))) { err = -ENOENT; goto err_clear; } if (unlikely(size < info->options_len)) { err = -ENOMEM; goto err_clear; } ip_tunnel_info_opts_get(to, info); if (size > info->options_len) memset(to + info->options_len, 0, size - info->options_len); return info->options_len; err_clear: memset(to, 0, size); return err; } static const struct bpf_func_proto bpf_skb_get_tunnel_opt_proto = { .func = bpf_skb_get_tunnel_opt, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_PTR_TO_UNINIT_MEM, .arg3_type = ARG_CONST_SIZE, }; static struct metadata_dst __percpu *md_dst; BPF_CALL_4(bpf_skb_set_tunnel_key, struct sk_buff *, skb, const struct bpf_tunnel_key *, from, u32, size, u64, flags) { struct metadata_dst *md = this_cpu_ptr(md_dst); u8 compat[sizeof(struct bpf_tunnel_key)]; struct ip_tunnel_info *info; if (unlikely(flags & ~(BPF_F_TUNINFO_IPV6 | BPF_F_ZERO_CSUM_TX | BPF_F_DONT_FRAGMENT | BPF_F_SEQ_NUMBER | BPF_F_NO_TUNNEL_KEY))) return -EINVAL; if (unlikely(size != sizeof(struct bpf_tunnel_key))) { switch (size) { case offsetof(struct bpf_tunnel_key, local_ipv6[0]): case offsetof(struct bpf_tunnel_key, tunnel_label): case offsetof(struct bpf_tunnel_key, tunnel_ext): case offsetof(struct bpf_tunnel_key, remote_ipv6[1]): /* Fixup deprecated structure layouts here, so we have * a common path later on. 
*/ memcpy(compat, from, size); memset(compat + size, 0, sizeof(compat) - size); from = (const struct bpf_tunnel_key *) compat; break; default: return -EINVAL; } } if (unlikely((!(flags & BPF_F_TUNINFO_IPV6) && from->tunnel_label) || from->tunnel_ext)) return -EINVAL; skb_dst_drop(skb); dst_hold((struct dst_entry *) md); skb_dst_set(skb, (struct dst_entry *) md); info = &md->u.tun_info; memset(info, 0, sizeof(*info)); info->mode = IP_TUNNEL_INFO_TX; __set_bit(IP_TUNNEL_NOCACHE_BIT, info->key.tun_flags); __assign_bit(IP_TUNNEL_DONT_FRAGMENT_BIT, info->key.tun_flags, flags & BPF_F_DONT_FRAGMENT); __assign_bit(IP_TUNNEL_CSUM_BIT, info->key.tun_flags, !(flags & BPF_F_ZERO_CSUM_TX)); __assign_bit(IP_TUNNEL_SEQ_BIT, info->key.tun_flags, flags & BPF_F_SEQ_NUMBER); __assign_bit(IP_TUNNEL_KEY_BIT, info->key.tun_flags, !(flags & BPF_F_NO_TUNNEL_KEY)); info->key.tun_id = cpu_to_be64(from->tunnel_id); info->key.tos = from->tunnel_tos; info->key.ttl = from->tunnel_ttl; if (flags & BPF_F_TUNINFO_IPV6) { info->mode |= IP_TUNNEL_INFO_IPV6; memcpy(&info->key.u.ipv6.dst, from->remote_ipv6, sizeof(from->remote_ipv6)); memcpy(&info->key.u.ipv6.src, from->local_ipv6, sizeof(from->local_ipv6)); info->key.label = cpu_to_be32(from->tunnel_label) & IPV6_FLOWLABEL_MASK; } else { info->key.u.ipv4.dst = cpu_to_be32(from->remote_ipv4); info->key.u.ipv4.src = cpu_to_be32(from->local_ipv4); info->key.flow_flags = FLOWI_FLAG_ANYSRC; } return 0; } static const struct bpf_func_proto bpf_skb_set_tunnel_key_proto = { .func = bpf_skb_set_tunnel_key, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE, .arg4_type = ARG_ANYTHING, }; BPF_CALL_3(bpf_skb_set_tunnel_opt, struct sk_buff *, skb, const u8 *, from, u32, size) { struct ip_tunnel_info *info = skb_tunnel_info(skb); const struct metadata_dst *md = this_cpu_ptr(md_dst); IP_TUNNEL_DECLARE_FLAGS(present) = { }; if (unlikely(info != &md->u.tun_info || (size & (sizeof(u32) - 1)))) return -EINVAL; if (unlikely(size > IP_TUNNEL_OPTS_MAX)) return -ENOMEM; ip_tunnel_set_options_present(present); ip_tunnel_info_opts_set(info, from, size, present); return 0; } static const struct bpf_func_proto bpf_skb_set_tunnel_opt_proto = { .func = bpf_skb_set_tunnel_opt, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE, }; static const struct bpf_func_proto * bpf_get_skb_set_tunnel_proto(enum bpf_func_id which) { if (!md_dst) { struct metadata_dst __percpu *tmp; tmp = metadata_dst_alloc_percpu(IP_TUNNEL_OPTS_MAX, METADATA_IP_TUNNEL, GFP_KERNEL); if (!tmp) return NULL; if (cmpxchg(&md_dst, NULL, tmp)) metadata_dst_free_percpu(tmp); } switch (which) { case BPF_FUNC_skb_set_tunnel_key: return &bpf_skb_set_tunnel_key_proto; case BPF_FUNC_skb_set_tunnel_opt: return &bpf_skb_set_tunnel_opt_proto; default: return NULL; } } BPF_CALL_3(bpf_skb_under_cgroup, struct sk_buff *, skb, struct bpf_map *, map, u32, idx) { struct bpf_array *array = container_of(map, struct bpf_array, map); struct cgroup *cgrp; struct sock *sk; sk = skb_to_full_sk(skb); if (!sk || !sk_fullsock(sk)) return -ENOENT; if (unlikely(idx >= array->map.max_entries)) return -E2BIG; cgrp = READ_ONCE(array->ptrs[idx]); if (unlikely(!cgrp)) return -EAGAIN; return sk_under_cgroup_hierarchy(sk, cgrp); } static const struct bpf_func_proto bpf_skb_under_cgroup_proto = { .func = bpf_skb_under_cgroup, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = 
ARG_PTR_TO_CTX, .arg2_type = ARG_CONST_MAP_PTR, .arg3_type = ARG_ANYTHING, }; #ifdef CONFIG_SOCK_CGROUP_DATA static inline u64 __bpf_sk_cgroup_id(struct sock *sk) { struct cgroup *cgrp; sk = sk_to_full_sk(sk); if (!sk || !sk_fullsock(sk)) return 0; cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); return cgroup_id(cgrp); } BPF_CALL_1(bpf_skb_cgroup_id, const struct sk_buff *, skb) { return __bpf_sk_cgroup_id(skb->sk); } static const struct bpf_func_proto bpf_skb_cgroup_id_proto = { .func = bpf_skb_cgroup_id, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, }; static inline u64 __bpf_sk_ancestor_cgroup_id(struct sock *sk, int ancestor_level) { struct cgroup *ancestor; struct cgroup *cgrp; sk = sk_to_full_sk(sk); if (!sk || !sk_fullsock(sk)) return 0; cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); ancestor = cgroup_ancestor(cgrp, ancestor_level); if (!ancestor) return 0; return cgroup_id(ancestor); } BPF_CALL_2(bpf_skb_ancestor_cgroup_id, const struct sk_buff *, skb, int, ancestor_level) { return __bpf_sk_ancestor_cgroup_id(skb->sk, ancestor_level); } static const struct bpf_func_proto bpf_skb_ancestor_cgroup_id_proto = { .func = bpf_skb_ancestor_cgroup_id, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, }; BPF_CALL_1(bpf_sk_cgroup_id, struct sock *, sk) { return __bpf_sk_cgroup_id(sk); } static const struct bpf_func_proto bpf_sk_cgroup_id_proto = { .func = bpf_sk_cgroup_id, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON, }; BPF_CALL_2(bpf_sk_ancestor_cgroup_id, struct sock *, sk, int, ancestor_level) { return __bpf_sk_ancestor_cgroup_id(sk, ancestor_level); } static const struct bpf_func_proto bpf_sk_ancestor_cgroup_id_proto = { .func = bpf_sk_ancestor_cgroup_id, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON, .arg2_type = ARG_ANYTHING, }; #endif static unsigned long bpf_xdp_copy(void *dst, const void *ctx, unsigned long off, unsigned long len) { struct xdp_buff *xdp = (struct xdp_buff *)ctx; bpf_xdp_copy_buf(xdp, off, dst, len, false); return 0; } BPF_CALL_5(bpf_xdp_event_output, struct xdp_buff *, xdp, struct bpf_map *, map, u64, flags, void *, meta, u64, meta_size) { u64 xdp_size = (flags & BPF_F_CTXLEN_MASK) >> 32; if (unlikely(flags & ~(BPF_F_CTXLEN_MASK | BPF_F_INDEX_MASK))) return -EINVAL; if (unlikely(!xdp || xdp_size > xdp_get_buff_len(xdp))) return -EFAULT; return bpf_event_output(map, flags, meta, meta_size, xdp, xdp_size, bpf_xdp_copy); } static const struct bpf_func_proto bpf_xdp_event_output_proto = { .func = bpf_xdp_event_output, .gpl_only = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_CONST_MAP_PTR, .arg3_type = ARG_ANYTHING, .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg5_type = ARG_CONST_SIZE_OR_ZERO, }; BTF_ID_LIST_SINGLE(bpf_xdp_output_btf_ids, struct, xdp_buff) const struct bpf_func_proto bpf_xdp_output_proto = { .func = bpf_xdp_event_output, .gpl_only = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_BTF_ID, .arg1_btf_id = &bpf_xdp_output_btf_ids[0], .arg2_type = ARG_CONST_MAP_PTR, .arg3_type = ARG_ANYTHING, .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg5_type = ARG_CONST_SIZE_OR_ZERO, }; BPF_CALL_1(bpf_get_socket_cookie, struct sk_buff *, skb) { return skb->sk ? 
__sock_gen_cookie(skb->sk) : 0; } static const struct bpf_func_proto bpf_get_socket_cookie_proto = { .func = bpf_get_socket_cookie, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, }; BPF_CALL_1(bpf_get_socket_cookie_sock_addr, struct bpf_sock_addr_kern *, ctx) { return __sock_gen_cookie(ctx->sk); } static const struct bpf_func_proto bpf_get_socket_cookie_sock_addr_proto = { .func = bpf_get_socket_cookie_sock_addr, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, }; BPF_CALL_1(bpf_get_socket_cookie_sock, struct sock *, ctx) { return __sock_gen_cookie(ctx); } static const struct bpf_func_proto bpf_get_socket_cookie_sock_proto = { .func = bpf_get_socket_cookie_sock, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, }; BPF_CALL_1(bpf_get_socket_ptr_cookie, struct sock *, sk) { return sk ? sock_gen_cookie(sk) : 0; } const struct bpf_func_proto bpf_get_socket_ptr_cookie_proto = { .func = bpf_get_socket_ptr_cookie, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON | PTR_MAYBE_NULL, }; BPF_CALL_1(bpf_get_socket_cookie_sock_ops, struct bpf_sock_ops_kern *, ctx) { return __sock_gen_cookie(ctx->sk); } static const struct bpf_func_proto bpf_get_socket_cookie_sock_ops_proto = { .func = bpf_get_socket_cookie_sock_ops, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, }; static u64 __bpf_get_netns_cookie(struct sock *sk) { const struct net *net = sk ? sock_net(sk) : &init_net; return net->net_cookie; } BPF_CALL_1(bpf_get_netns_cookie, struct sk_buff *, skb) { return __bpf_get_netns_cookie(skb && skb->sk ? skb->sk : NULL); } static const struct bpf_func_proto bpf_get_netns_cookie_proto = { .func = bpf_get_netns_cookie, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX_OR_NULL, }; BPF_CALL_1(bpf_get_netns_cookie_sock, struct sock *, ctx) { return __bpf_get_netns_cookie(ctx); } static const struct bpf_func_proto bpf_get_netns_cookie_sock_proto = { .func = bpf_get_netns_cookie_sock, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX_OR_NULL, }; BPF_CALL_1(bpf_get_netns_cookie_sock_addr, struct bpf_sock_addr_kern *, ctx) { return __bpf_get_netns_cookie(ctx ? ctx->sk : NULL); } static const struct bpf_func_proto bpf_get_netns_cookie_sock_addr_proto = { .func = bpf_get_netns_cookie_sock_addr, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX_OR_NULL, }; BPF_CALL_1(bpf_get_netns_cookie_sock_ops, struct bpf_sock_ops_kern *, ctx) { return __bpf_get_netns_cookie(ctx ? ctx->sk : NULL); } static const struct bpf_func_proto bpf_get_netns_cookie_sock_ops_proto = { .func = bpf_get_netns_cookie_sock_ops, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX_OR_NULL, }; BPF_CALL_1(bpf_get_netns_cookie_sk_msg, struct sk_msg *, ctx) { return __bpf_get_netns_cookie(ctx ? 
ctx->sk : NULL); } static const struct bpf_func_proto bpf_get_netns_cookie_sk_msg_proto = { .func = bpf_get_netns_cookie_sk_msg, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX_OR_NULL, }; BPF_CALL_1(bpf_get_socket_uid, struct sk_buff *, skb) { struct sock *sk = sk_to_full_sk(skb->sk); kuid_t kuid; if (!sk || !sk_fullsock(sk)) return overflowuid; kuid = sock_net_uid(sock_net(sk), sk); return from_kuid_munged(sock_net(sk)->user_ns, kuid); } static const struct bpf_func_proto bpf_get_socket_uid_proto = { .func = bpf_get_socket_uid, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, }; static int sk_bpf_set_get_cb_flags(struct sock *sk, char *optval, bool getopt) { u32 sk_bpf_cb_flags; if (getopt) { *(u32 *)optval = sk->sk_bpf_cb_flags; return 0; } sk_bpf_cb_flags = *(u32 *)optval; if (sk_bpf_cb_flags & ~SK_BPF_CB_MASK) return -EINVAL; sk->sk_bpf_cb_flags = sk_bpf_cb_flags; return 0; } static int sol_socket_sockopt(struct sock *sk, int optname, char *optval, int *optlen, bool getopt) { switch (optname) { case SO_REUSEADDR: case SO_SNDBUF: case SO_RCVBUF: case SO_KEEPALIVE: case SO_PRIORITY: case SO_REUSEPORT: case SO_RCVLOWAT: case SO_MARK: case SO_MAX_PACING_RATE: case SO_BINDTOIFINDEX: case SO_TXREHASH: case SK_BPF_CB_FLAGS: if (*optlen != sizeof(int)) return -EINVAL; break; case SO_BINDTODEVICE: break; default: return -EINVAL; } if (optname == SK_BPF_CB_FLAGS) return sk_bpf_set_get_cb_flags(sk, optval, getopt); if (getopt) { if (optname == SO_BINDTODEVICE) return -EINVAL; return sk_getsockopt(sk, SOL_SOCKET, optname, KERNEL_SOCKPTR(optval), KERNEL_SOCKPTR(optlen)); } return sk_setsockopt(sk, SOL_SOCKET, optname, KERNEL_SOCKPTR(optval), *optlen); } static int bpf_sol_tcp_getsockopt(struct sock *sk, int optname, char *optval, int optlen) { if (optlen != sizeof(int)) return -EINVAL; switch (optname) { case TCP_BPF_SOCK_OPS_CB_FLAGS: { int cb_flags = tcp_sk(sk)->bpf_sock_ops_cb_flags; memcpy(optval, &cb_flags, optlen); break; } case TCP_BPF_RTO_MIN: { int rto_min_us = jiffies_to_usecs(inet_csk(sk)->icsk_rto_min); memcpy(optval, &rto_min_us, optlen); break; } case TCP_BPF_DELACK_MAX: { int delack_max_us = jiffies_to_usecs(inet_csk(sk)->icsk_delack_max); memcpy(optval, &delack_max_us, optlen); break; } default: return -EINVAL; } return 0; } static int bpf_sol_tcp_setsockopt(struct sock *sk, int optname, char *optval, int optlen) { struct tcp_sock *tp = tcp_sk(sk); unsigned long timeout; int val; if (optlen != sizeof(int)) return -EINVAL; val = *(int *)optval; /* Only some options are supported */ switch (optname) { case TCP_BPF_IW: if (val <= 0 || tp->data_segs_out > tp->syn_data) return -EINVAL; tcp_snd_cwnd_set(tp, val); break; case TCP_BPF_SNDCWND_CLAMP: if (val <= 0) return -EINVAL; tp->snd_cwnd_clamp = val; tp->snd_ssthresh = val; break; case TCP_BPF_DELACK_MAX: timeout = usecs_to_jiffies(val); if (timeout > TCP_DELACK_MAX || timeout < TCP_TIMEOUT_MIN) return -EINVAL; inet_csk(sk)->icsk_delack_max = timeout; break; case TCP_BPF_RTO_MIN: timeout = usecs_to_jiffies(val); if (timeout > TCP_RTO_MIN || timeout < TCP_TIMEOUT_MIN) return -EINVAL; inet_csk(sk)->icsk_rto_min = timeout; break; case TCP_BPF_SOCK_OPS_CB_FLAGS: if (val & ~(BPF_SOCK_OPS_ALL_CB_FLAGS)) return -EINVAL; tp->bpf_sock_ops_cb_flags = val; break; default: return -EINVAL; } return 0; } static int sol_tcp_sockopt_congestion(struct sock *sk, char *optval, int *optlen, bool getopt) { struct tcp_sock *tp; int ret; if (*optlen < 2) return -EINVAL; if (getopt) { if 
(!inet_csk(sk)->icsk_ca_ops) return -EINVAL; /* BPF expects NULL-terminated tcp-cc string */ optval[--(*optlen)] = '\0'; return do_tcp_getsockopt(sk, SOL_TCP, TCP_CONGESTION, KERNEL_SOCKPTR(optval), KERNEL_SOCKPTR(optlen)); } /* "cdg" is the only cc that allocates a ptr * in inet_csk_ca area. The bpf-tcp-cc may * overwrite this ptr after switching to cdg. */ if (*optlen >= sizeof("cdg") - 1 && !strncmp("cdg", optval, *optlen)) return -ENOTSUPP; /* This stops the following loop: * * .init => bpf_setsockopt(tcp_cc) => .init => * bpf_setsockopt(tcp_cc) => .init => .... * * The second bpf_setsockopt(tcp_cc) is not allowed * in order to break the loop when both .init * are the same bpf prog. * * This applies even if the second bpf_setsockopt(tcp_cc) * does not cause a loop. It means only the first * '.init' can call bpf_setsockopt(TCP_CONGESTION) to * pick a fallback cc (e.g. the peer does not support ECN) * and the second '.init' cannot fall back to * another. */ tp = tcp_sk(sk); if (tp->bpf_chg_cc_inprogress) return -EBUSY; tp->bpf_chg_cc_inprogress = 1; ret = do_tcp_setsockopt(sk, SOL_TCP, TCP_CONGESTION, KERNEL_SOCKPTR(optval), *optlen); tp->bpf_chg_cc_inprogress = 0; return ret; } static int sol_tcp_sockopt(struct sock *sk, int optname, char *optval, int *optlen, bool getopt) { if (sk->sk_protocol != IPPROTO_TCP) return -EINVAL; switch (optname) { case TCP_NODELAY: case TCP_MAXSEG: case TCP_KEEPIDLE: case TCP_KEEPINTVL: case TCP_KEEPCNT: case TCP_SYNCNT: case TCP_WINDOW_CLAMP: case TCP_THIN_LINEAR_TIMEOUTS: case TCP_USER_TIMEOUT: case TCP_NOTSENT_LOWAT: case TCP_SAVE_SYN: case TCP_RTO_MAX_MS: if (*optlen != sizeof(int)) return -EINVAL; break; case TCP_CONGESTION: return sol_tcp_sockopt_congestion(sk, optval, optlen, getopt); case TCP_SAVED_SYN: if (*optlen < 1) return -EINVAL; break; default: if (getopt) return bpf_sol_tcp_getsockopt(sk, optname, optval, *optlen); return bpf_sol_tcp_setsockopt(sk, optname, optval, *optlen); } if (getopt) { if (optname == TCP_SAVED_SYN) { struct tcp_sock *tp = tcp_sk(sk); if (!tp->saved_syn || *optlen > tcp_saved_syn_len(tp->saved_syn)) return -EINVAL; memcpy(optval, tp->saved_syn->data, *optlen); /* It cannot free tp->saved_syn here because it * does not know if the user space still needs it.
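 * (The regular getsockopt(TCP_SAVED_SYN) path is the one that frees
 * tp->saved_syn once user space has read it.)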
*/ return 0; } return do_tcp_getsockopt(sk, SOL_TCP, optname, KERNEL_SOCKPTR(optval), KERNEL_SOCKPTR(optlen)); } return do_tcp_setsockopt(sk, SOL_TCP, optname, KERNEL_SOCKPTR(optval), *optlen); } static int sol_ip_sockopt(struct sock *sk, int optname, char *optval, int *optlen, bool getopt) { if (sk->sk_family != AF_INET) return -EINVAL; switch (optname) { case IP_TOS: if (*optlen != sizeof(int)) return -EINVAL; break; default: return -EINVAL; } if (getopt) return do_ip_getsockopt(sk, SOL_IP, optname, KERNEL_SOCKPTR(optval), KERNEL_SOCKPTR(optlen)); return do_ip_setsockopt(sk, SOL_IP, optname, KERNEL_SOCKPTR(optval), *optlen); } static int sol_ipv6_sockopt(struct sock *sk, int optname, char *optval, int *optlen, bool getopt) { if (sk->sk_family != AF_INET6) return -EINVAL; switch (optname) { case IPV6_TCLASS: case IPV6_AUTOFLOWLABEL: if (*optlen != sizeof(int)) return -EINVAL; break; default: return -EINVAL; } if (getopt) return ipv6_bpf_stub->ipv6_getsockopt(sk, SOL_IPV6, optname, KERNEL_SOCKPTR(optval), KERNEL_SOCKPTR(optlen)); return ipv6_bpf_stub->ipv6_setsockopt(sk, SOL_IPV6, optname, KERNEL_SOCKPTR(optval), *optlen); } static int __bpf_setsockopt(struct sock *sk, int level, int optname, char *optval, int optlen) { if (!sk_fullsock(sk)) return -EINVAL; if (level == SOL_SOCKET) return sol_socket_sockopt(sk, optname, optval, &optlen, false); else if (IS_ENABLED(CONFIG_INET) && level == SOL_IP) return sol_ip_sockopt(sk, optname, optval, &optlen, false); else if (IS_ENABLED(CONFIG_IPV6) && level == SOL_IPV6) return sol_ipv6_sockopt(sk, optname, optval, &optlen, false); else if (IS_ENABLED(CONFIG_INET) && level == SOL_TCP) return sol_tcp_sockopt(sk, optname, optval, &optlen, false); return -EINVAL; } static bool is_locked_tcp_sock_ops(struct bpf_sock_ops_kern *bpf_sock) { return bpf_sock->op <= BPF_SOCK_OPS_WRITE_HDR_OPT_CB; } static int _bpf_setsockopt(struct sock *sk, int level, int optname, char *optval, int optlen) { if (sk_fullsock(sk)) sock_owned_by_me(sk); return __bpf_setsockopt(sk, level, optname, optval, optlen); } static int __bpf_getsockopt(struct sock *sk, int level, int optname, char *optval, int optlen) { int err, saved_optlen = optlen; if (!sk_fullsock(sk)) { err = -EINVAL; goto done; } if (level == SOL_SOCKET) err = sol_socket_sockopt(sk, optname, optval, &optlen, true); else if (IS_ENABLED(CONFIG_INET) && level == SOL_TCP) err = sol_tcp_sockopt(sk, optname, optval, &optlen, true); else if (IS_ENABLED(CONFIG_INET) && level == SOL_IP) err = sol_ip_sockopt(sk, optname, optval, &optlen, true); else if (IS_ENABLED(CONFIG_IPV6) && level == SOL_IPV6) err = sol_ipv6_sockopt(sk, optname, optval, &optlen, true); else err = -EINVAL; done: if (err) optlen = 0; if (optlen < saved_optlen) memset(optval + optlen, 0, saved_optlen - optlen); return err; } static int _bpf_getsockopt(struct sock *sk, int level, int optname, char *optval, int optlen) { if (sk_fullsock(sk)) sock_owned_by_me(sk); return __bpf_getsockopt(sk, level, optname, optval, optlen); } BPF_CALL_5(bpf_sk_setsockopt, struct sock *, sk, int, level, int, optname, char *, optval, int, optlen) { return _bpf_setsockopt(sk, level, optname, optval, optlen); } const struct bpf_func_proto bpf_sk_setsockopt_proto = { .func = bpf_sk_setsockopt, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_ANYTHING, .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg5_type = ARG_CONST_SIZE, }; BPF_CALL_5(bpf_sk_getsockopt, struct sock *, sk, int, level, int, optname, 
char *, optval, int, optlen) { return _bpf_getsockopt(sk, level, optname, optval, optlen); } const struct bpf_func_proto bpf_sk_getsockopt_proto = { .func = bpf_sk_getsockopt, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_ANYTHING, .arg4_type = ARG_PTR_TO_UNINIT_MEM, .arg5_type = ARG_CONST_SIZE, }; BPF_CALL_5(bpf_unlocked_sk_setsockopt, struct sock *, sk, int, level, int, optname, char *, optval, int, optlen) { return __bpf_setsockopt(sk, level, optname, optval, optlen); } const struct bpf_func_proto bpf_unlocked_sk_setsockopt_proto = { .func = bpf_unlocked_sk_setsockopt, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_ANYTHING, .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg5_type = ARG_CONST_SIZE, }; BPF_CALL_5(bpf_unlocked_sk_getsockopt, struct sock *, sk, int, level, int, optname, char *, optval, int, optlen) { return __bpf_getsockopt(sk, level, optname, optval, optlen); } const struct bpf_func_proto bpf_unlocked_sk_getsockopt_proto = { .func = bpf_unlocked_sk_getsockopt, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_ANYTHING, .arg4_type = ARG_PTR_TO_UNINIT_MEM, .arg5_type = ARG_CONST_SIZE, }; BPF_CALL_5(bpf_sock_addr_setsockopt, struct bpf_sock_addr_kern *, ctx, int, level, int, optname, char *, optval, int, optlen) { return _bpf_setsockopt(ctx->sk, level, optname, optval, optlen); } static const struct bpf_func_proto bpf_sock_addr_setsockopt_proto = { .func = bpf_sock_addr_setsockopt, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_ANYTHING, .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg5_type = ARG_CONST_SIZE, }; BPF_CALL_5(bpf_sock_addr_getsockopt, struct bpf_sock_addr_kern *, ctx, int, level, int, optname, char *, optval, int, optlen) { return _bpf_getsockopt(ctx->sk, level, optname, optval, optlen); } static const struct bpf_func_proto bpf_sock_addr_getsockopt_proto = { .func = bpf_sock_addr_getsockopt, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_ANYTHING, .arg4_type = ARG_PTR_TO_UNINIT_MEM, .arg5_type = ARG_CONST_SIZE, }; static int sk_bpf_set_get_bypass_prot_mem(struct sock *sk, char *optval, int optlen, bool getopt) { int val; if (optlen != sizeof(int)) return -EINVAL; if (!sk_has_account(sk)) return -EOPNOTSUPP; if (getopt) { *(int *)optval = sk->sk_bypass_prot_mem; return 0; } val = *(int *)optval; if (val < 0 || val > 1) return -EINVAL; sk->sk_bypass_prot_mem = val; return 0; } BPF_CALL_5(bpf_sock_create_setsockopt, struct sock *, sk, int, level, int, optname, char *, optval, int, optlen) { if (level == SOL_SOCKET && optname == SK_BPF_BYPASS_PROT_MEM) return sk_bpf_set_get_bypass_prot_mem(sk, optval, optlen, false); return __bpf_setsockopt(sk, level, optname, optval, optlen); } static const struct bpf_func_proto bpf_sock_create_setsockopt_proto = { .func = bpf_sock_create_setsockopt, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_ANYTHING, .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg5_type = ARG_CONST_SIZE, }; BPF_CALL_5(bpf_sock_create_getsockopt, struct sock *, sk, int, level, int, optname, char *, optval, int, optlen) { if (level == SOL_SOCKET && optname == SK_BPF_BYPASS_PROT_MEM) { int err = 
sk_bpf_set_get_bypass_prot_mem(sk, optval, optlen, true); if (err) memset(optval, 0, optlen); return err; } return __bpf_getsockopt(sk, level, optname, optval, optlen); } static const struct bpf_func_proto bpf_sock_create_getsockopt_proto = { .func = bpf_sock_create_getsockopt, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_ANYTHING, .arg4_type = ARG_PTR_TO_UNINIT_MEM, .arg5_type = ARG_CONST_SIZE, }; BPF_CALL_5(bpf_sock_ops_setsockopt, struct bpf_sock_ops_kern *, bpf_sock, int, level, int, optname, char *, optval, int, optlen) { if (!is_locked_tcp_sock_ops(bpf_sock)) return -EOPNOTSUPP; return _bpf_setsockopt(bpf_sock->sk, level, optname, optval, optlen); } static const struct bpf_func_proto bpf_sock_ops_setsockopt_proto = { .func = bpf_sock_ops_setsockopt, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_ANYTHING, .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg5_type = ARG_CONST_SIZE, }; static int bpf_sock_ops_get_syn(struct bpf_sock_ops_kern *bpf_sock, int optname, const u8 **start) { struct sk_buff *syn_skb = bpf_sock->syn_skb; const u8 *hdr_start; int ret; if (syn_skb) { /* sk is a request_sock here */ if (optname == TCP_BPF_SYN) { hdr_start = syn_skb->data; ret = tcp_hdrlen(syn_skb); } else if (optname == TCP_BPF_SYN_IP) { hdr_start = skb_network_header(syn_skb); ret = skb_network_header_len(syn_skb) + tcp_hdrlen(syn_skb); } else { /* optname == TCP_BPF_SYN_MAC */ hdr_start = skb_mac_header(syn_skb); ret = skb_mac_header_len(syn_skb) + skb_network_header_len(syn_skb) + tcp_hdrlen(syn_skb); } } else { struct sock *sk = bpf_sock->sk; struct saved_syn *saved_syn; if (sk->sk_state == TCP_NEW_SYN_RECV) /* synack retransmit. bpf_sock->syn_skb will * not be available. It has to resort to * saved_syn (if it is saved). 
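 * In that case sk is still a request_sock, so the saved SYN has to be
 * read from inet_reqsk(sk) rather than from the full tcp_sock.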
*/ saved_syn = inet_reqsk(sk)->saved_syn; else saved_syn = tcp_sk(sk)->saved_syn; if (!saved_syn) return -ENOENT; if (optname == TCP_BPF_SYN) { hdr_start = saved_syn->data + saved_syn->mac_hdrlen + saved_syn->network_hdrlen; ret = saved_syn->tcp_hdrlen; } else if (optname == TCP_BPF_SYN_IP) { hdr_start = saved_syn->data + saved_syn->mac_hdrlen; ret = saved_syn->network_hdrlen + saved_syn->tcp_hdrlen; } else { /* optname == TCP_BPF_SYN_MAC */ /* TCP_SAVE_SYN may not have saved the mac hdr */ if (!saved_syn->mac_hdrlen) return -ENOENT; hdr_start = saved_syn->data; ret = saved_syn->mac_hdrlen + saved_syn->network_hdrlen + saved_syn->tcp_hdrlen; } } *start = hdr_start; return ret; } BPF_CALL_5(bpf_sock_ops_getsockopt, struct bpf_sock_ops_kern *, bpf_sock, int, level, int, optname, char *, optval, int, optlen) { if (!is_locked_tcp_sock_ops(bpf_sock)) return -EOPNOTSUPP; if (IS_ENABLED(CONFIG_INET) && level == SOL_TCP && optname >= TCP_BPF_SYN && optname <= TCP_BPF_SYN_MAC) { int ret, copy_len = 0; const u8 *start; ret = bpf_sock_ops_get_syn(bpf_sock, optname, &start); if (ret > 0) { copy_len = ret; if (optlen < copy_len) { copy_len = optlen; ret = -ENOSPC; } memcpy(optval, start, copy_len); } /* Zero out unused buffer at the end */ memset(optval + copy_len, 0, optlen - copy_len); return ret; } return _bpf_getsockopt(bpf_sock->sk, level, optname, optval, optlen); } static const struct bpf_func_proto bpf_sock_ops_getsockopt_proto = { .func = bpf_sock_ops_getsockopt, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_ANYTHING, .arg4_type = ARG_PTR_TO_UNINIT_MEM, .arg5_type = ARG_CONST_SIZE, }; BPF_CALL_2(bpf_sock_ops_cb_flags_set, struct bpf_sock_ops_kern *, bpf_sock, int, argval) { struct sock *sk = bpf_sock->sk; int val = argval & BPF_SOCK_OPS_ALL_CB_FLAGS; if (!is_locked_tcp_sock_ops(bpf_sock)) return -EOPNOTSUPP; if (!IS_ENABLED(CONFIG_INET) || !sk_fullsock(sk)) return -EINVAL; tcp_sk(sk)->bpf_sock_ops_cb_flags = val; return argval & (~BPF_SOCK_OPS_ALL_CB_FLAGS); } static const struct bpf_func_proto bpf_sock_ops_cb_flags_set_proto = { .func = bpf_sock_ops_cb_flags_set, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, }; const struct ipv6_bpf_stub *ipv6_bpf_stub __read_mostly; EXPORT_SYMBOL_GPL(ipv6_bpf_stub); BPF_CALL_3(bpf_bind, struct bpf_sock_addr_kern *, ctx, struct sockaddr *, addr, int, addr_len) { #ifdef CONFIG_INET struct sock *sk = ctx->sk; u32 flags = BIND_FROM_BPF; int err; err = -EINVAL; if (addr_len < offsetofend(struct sockaddr, sa_family)) return err; if (addr->sa_family == AF_INET) { if (addr_len < sizeof(struct sockaddr_in)) return err; if (((struct sockaddr_in *)addr)->sin_port == htons(0)) flags |= BIND_FORCE_ADDRESS_NO_PORT; return __inet_bind(sk, (struct sockaddr_unsized *)addr, addr_len, flags); #if IS_ENABLED(CONFIG_IPV6) } else if (addr->sa_family == AF_INET6) { if (addr_len < SIN6_LEN_RFC2133) return err; if (((struct sockaddr_in6 *)addr)->sin6_port == htons(0)) flags |= BIND_FORCE_ADDRESS_NO_PORT; /* ipv6_bpf_stub cannot be NULL, since it's called from * bpf_cgroup_inet6_connect hook and ipv6 is already loaded */ return ipv6_bpf_stub->inet6_bind(sk, (struct sockaddr_unsized *)addr, addr_len, flags); #endif /* CONFIG_IPV6 */ } #endif /* CONFIG_INET */ return -EAFNOSUPPORT; } static const struct bpf_func_proto bpf_bind_proto = { .func = bpf_bind, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_PTR_TO_MEM | 
MEM_RDONLY, .arg3_type = ARG_CONST_SIZE, }; #ifdef CONFIG_XFRM #if (IS_BUILTIN(CONFIG_XFRM_INTERFACE) && IS_ENABLED(CONFIG_DEBUG_INFO_BTF)) || \ (IS_MODULE(CONFIG_XFRM_INTERFACE) && IS_ENABLED(CONFIG_DEBUG_INFO_BTF_MODULES)) struct metadata_dst __percpu *xfrm_bpf_md_dst; EXPORT_SYMBOL_GPL(xfrm_bpf_md_dst); #endif BPF_CALL_5(bpf_skb_get_xfrm_state, struct sk_buff *, skb, u32, index, struct bpf_xfrm_state *, to, u32, size, u64, flags) { const struct sec_path *sp = skb_sec_path(skb); const struct xfrm_state *x; if (!sp || unlikely(index >= sp->len || flags)) goto err_clear; x = sp->xvec[index]; if (unlikely(size != sizeof(struct bpf_xfrm_state))) goto err_clear; to->reqid = x->props.reqid; to->spi = x->id.spi; to->family = x->props.family; to->ext = 0; if (to->family == AF_INET6) { memcpy(to->remote_ipv6, x->props.saddr.a6, sizeof(to->remote_ipv6)); } else { to->remote_ipv4 = x->props.saddr.a4; memset(&to->remote_ipv6[1], 0, sizeof(__u32) * 3); } return 0; err_clear: memset(to, 0, size); return -EINVAL; } static const struct bpf_func_proto bpf_skb_get_xfrm_state_proto = { .func = bpf_skb_get_xfrm_state, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_PTR_TO_UNINIT_MEM, .arg4_type = ARG_CONST_SIZE, .arg5_type = ARG_ANYTHING, }; #endif #if IS_ENABLED(CONFIG_INET) || IS_ENABLED(CONFIG_IPV6) static int bpf_fib_set_fwd_params(struct bpf_fib_lookup *params, u32 mtu) { params->h_vlan_TCI = 0; params->h_vlan_proto = 0; if (mtu) params->mtu_result = mtu; /* union with tot_len */ return 0; } #endif #if IS_ENABLED(CONFIG_INET) static int bpf_ipv4_fib_lookup(struct net *net, struct bpf_fib_lookup *params, u32 flags, bool check_mtu) { struct fib_nh_common *nhc; struct in_device *in_dev; struct neighbour *neigh; struct net_device *dev; struct fib_result res; struct flowi4 fl4; u32 mtu = 0; int err; dev = dev_get_by_index_rcu(net, params->ifindex); if (unlikely(!dev)) return -ENODEV; /* verify forwarding is enabled on this interface */ in_dev = __in_dev_get_rcu(dev); if (unlikely(!in_dev || !IN_DEV_FORWARD(in_dev))) return BPF_FIB_LKUP_RET_FWD_DISABLED; if (flags & BPF_FIB_LOOKUP_OUTPUT) { fl4.flowi4_iif = 1; fl4.flowi4_oif = params->ifindex; } else { fl4.flowi4_iif = params->ifindex; fl4.flowi4_oif = 0; } fl4.flowi4_dscp = inet_dsfield_to_dscp(params->tos); fl4.flowi4_scope = RT_SCOPE_UNIVERSE; fl4.flowi4_flags = 0; fl4.flowi4_proto = params->l4_protocol; fl4.daddr = params->ipv4_dst; fl4.saddr = params->ipv4_src; fl4.fl4_sport = params->sport; fl4.fl4_dport = params->dport; fl4.flowi4_multipath_hash = 0; if (flags & BPF_FIB_LOOKUP_DIRECT) { u32 tbid = l3mdev_fib_table_rcu(dev) ? 
: RT_TABLE_MAIN; struct fib_table *tb; if (flags & BPF_FIB_LOOKUP_TBID) { tbid = params->tbid; /* zero out for vlan output */ params->tbid = 0; } tb = fib_get_table(net, tbid); if (unlikely(!tb)) return BPF_FIB_LKUP_RET_NOT_FWDED; err = fib_table_lookup(tb, &fl4, &res, FIB_LOOKUP_NOREF); } else { if (flags & BPF_FIB_LOOKUP_MARK) fl4.flowi4_mark = params->mark; else fl4.flowi4_mark = 0; fl4.flowi4_secid = 0; fl4.flowi4_tun_key.tun_id = 0; fl4.flowi4_uid = sock_net_uid(net, NULL); err = fib_lookup(net, &fl4, &res, FIB_LOOKUP_NOREF); } if (err) { /* map fib lookup errors to RTN_ type */ if (err == -EINVAL) return BPF_FIB_LKUP_RET_BLACKHOLE; if (err == -EHOSTUNREACH) return BPF_FIB_LKUP_RET_UNREACHABLE; if (err == -EACCES) return BPF_FIB_LKUP_RET_PROHIBIT; return BPF_FIB_LKUP_RET_NOT_FWDED; } if (res.type != RTN_UNICAST) return BPF_FIB_LKUP_RET_NOT_FWDED; if (fib_info_num_path(res.fi) > 1) fib_select_path(net, &res, &fl4, NULL); if (check_mtu) { mtu = ip_mtu_from_fib_result(&res, params->ipv4_dst); if (params->tot_len > mtu) { params->mtu_result = mtu; /* union with tot_len */ return BPF_FIB_LKUP_RET_FRAG_NEEDED; } } nhc = res.nhc; /* do not handle lwt encaps right now */ if (nhc->nhc_lwtstate) return BPF_FIB_LKUP_RET_UNSUPP_LWT; dev = nhc->nhc_dev; params->rt_metric = res.fi->fib_priority; params->ifindex = dev->ifindex; if (flags & BPF_FIB_LOOKUP_SRC) params->ipv4_src = fib_result_prefsrc(net, &res); /* xdp and cls_bpf programs are run in RCU-bh so * rcu_read_lock_bh is not needed here */ if (likely(nhc->nhc_gw_family != AF_INET6)) { if (nhc->nhc_gw_family) params->ipv4_dst = nhc->nhc_gw.ipv4; } else { struct in6_addr *dst = (struct in6_addr *)params->ipv6_dst; params->family = AF_INET6; *dst = nhc->nhc_gw.ipv6; } if (flags & BPF_FIB_LOOKUP_SKIP_NEIGH) goto set_fwd_params; if (likely(nhc->nhc_gw_family != AF_INET6)) neigh = __ipv4_neigh_lookup_noref(dev, (__force u32)params->ipv4_dst); else neigh = __ipv6_neigh_lookup_noref_stub(dev, params->ipv6_dst); if (!neigh || !(READ_ONCE(neigh->nud_state) & NUD_VALID)) return BPF_FIB_LKUP_RET_NO_NEIGH; memcpy(params->dmac, neigh->ha, ETH_ALEN); memcpy(params->smac, dev->dev_addr, ETH_ALEN); set_fwd_params: return bpf_fib_set_fwd_params(params, mtu); } #endif #if IS_ENABLED(CONFIG_IPV6) static int bpf_ipv6_fib_lookup(struct net *net, struct bpf_fib_lookup *params, u32 flags, bool check_mtu) { struct in6_addr *src = (struct in6_addr *) params->ipv6_src; struct in6_addr *dst = (struct in6_addr *) params->ipv6_dst; struct fib6_result res = {}; struct neighbour *neigh; struct net_device *dev; struct inet6_dev *idev; struct flowi6 fl6; int strict = 0; int oif, err; u32 mtu = 0; /* link local addresses are never forwarded */ if (rt6_need_strict(dst) || rt6_need_strict(src)) return BPF_FIB_LKUP_RET_NOT_FWDED; dev = dev_get_by_index_rcu(net, params->ifindex); if (unlikely(!dev)) return -ENODEV; idev = __in6_dev_get_safely(dev); if (unlikely(!idev || !READ_ONCE(idev->cnf.forwarding))) return BPF_FIB_LKUP_RET_FWD_DISABLED; if (flags & BPF_FIB_LOOKUP_OUTPUT) { fl6.flowi6_iif = 1; oif = fl6.flowi6_oif = params->ifindex; } else { oif = fl6.flowi6_iif = params->ifindex; fl6.flowi6_oif = 0; strict = RT6_LOOKUP_F_HAS_SADDR; } fl6.flowlabel = params->flowinfo; fl6.flowi6_scope = 0; fl6.flowi6_flags = 0; fl6.mp_hash = 0; fl6.flowi6_proto = params->l4_protocol; fl6.daddr = *dst; fl6.saddr = *src; fl6.fl6_sport = params->sport; fl6.fl6_dport = params->dport; if (flags & BPF_FIB_LOOKUP_DIRECT) { u32 tbid = l3mdev_fib_table_rcu(dev) ? 
: RT_TABLE_MAIN; struct fib6_table *tb; if (flags & BPF_FIB_LOOKUP_TBID) { tbid = params->tbid; /* zero out for vlan output */ params->tbid = 0; } tb = ipv6_stub->fib6_get_table(net, tbid); if (unlikely(!tb)) return BPF_FIB_LKUP_RET_NOT_FWDED; err = ipv6_stub->fib6_table_lookup(net, tb, oif, &fl6, &res, strict); } else { if (flags & BPF_FIB_LOOKUP_MARK) fl6.flowi6_mark = params->mark; else fl6.flowi6_mark = 0; fl6.flowi6_secid = 0; fl6.flowi6_tun_key.tun_id = 0; fl6.flowi6_uid = sock_net_uid(net, NULL); err = ipv6_stub->fib6_lookup(net, oif, &fl6, &res, strict); } if (unlikely(err || IS_ERR_OR_NULL(res.f6i) || res.f6i == net->ipv6.fib6_null_entry)) return BPF_FIB_LKUP_RET_NOT_FWDED; switch (res.fib6_type) { /* only unicast is forwarded */ case RTN_UNICAST: break; case RTN_BLACKHOLE: return BPF_FIB_LKUP_RET_BLACKHOLE; case RTN_UNREACHABLE: return BPF_FIB_LKUP_RET_UNREACHABLE; case RTN_PROHIBIT: return BPF_FIB_LKUP_RET_PROHIBIT; default: return BPF_FIB_LKUP_RET_NOT_FWDED; } ipv6_stub->fib6_select_path(net, &res, &fl6, fl6.flowi6_oif, fl6.flowi6_oif != 0, NULL, strict); if (check_mtu) { mtu = ipv6_stub->ip6_mtu_from_fib6(&res, dst, src); if (params->tot_len > mtu) { params->mtu_result = mtu; /* union with tot_len */ return BPF_FIB_LKUP_RET_FRAG_NEEDED; } } if (res.nh->fib_nh_lws) return BPF_FIB_LKUP_RET_UNSUPP_LWT; if (res.nh->fib_nh_gw_family) *dst = res.nh->fib_nh_gw6; dev = res.nh->fib_nh_dev; params->rt_metric = res.f6i->fib6_metric; params->ifindex = dev->ifindex; if (flags & BPF_FIB_LOOKUP_SRC) { if (res.f6i->fib6_prefsrc.plen) { *src = res.f6i->fib6_prefsrc.addr; } else { err = ipv6_bpf_stub->ipv6_dev_get_saddr(net, dev, &fl6.daddr, 0, src); if (err) return BPF_FIB_LKUP_RET_NO_SRC_ADDR; } } if (flags & BPF_FIB_LOOKUP_SKIP_NEIGH) goto set_fwd_params; /* xdp and cls_bpf programs are run in RCU-bh so rcu_read_lock_bh is * not needed here. 
*/ neigh = __ipv6_neigh_lookup_noref_stub(dev, dst); if (!neigh || !(READ_ONCE(neigh->nud_state) & NUD_VALID)) return BPF_FIB_LKUP_RET_NO_NEIGH; memcpy(params->dmac, neigh->ha, ETH_ALEN); memcpy(params->smac, dev->dev_addr, ETH_ALEN); set_fwd_params: return bpf_fib_set_fwd_params(params, mtu); } #endif #define BPF_FIB_LOOKUP_MASK (BPF_FIB_LOOKUP_DIRECT | BPF_FIB_LOOKUP_OUTPUT | \ BPF_FIB_LOOKUP_SKIP_NEIGH | BPF_FIB_LOOKUP_TBID | \ BPF_FIB_LOOKUP_SRC | BPF_FIB_LOOKUP_MARK) BPF_CALL_4(bpf_xdp_fib_lookup, struct xdp_buff *, ctx, struct bpf_fib_lookup *, params, int, plen, u32, flags) { if (plen < sizeof(*params)) return -EINVAL; if (flags & ~BPF_FIB_LOOKUP_MASK) return -EINVAL; switch (params->family) { #if IS_ENABLED(CONFIG_INET) case AF_INET: return bpf_ipv4_fib_lookup(dev_net(ctx->rxq->dev), params, flags, true); #endif #if IS_ENABLED(CONFIG_IPV6) case AF_INET6: return bpf_ipv6_fib_lookup(dev_net(ctx->rxq->dev), params, flags, true); #endif } return -EAFNOSUPPORT; } static const struct bpf_func_proto bpf_xdp_fib_lookup_proto = { .func = bpf_xdp_fib_lookup, .gpl_only = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_PTR_TO_MEM, .arg3_type = ARG_CONST_SIZE, .arg4_type = ARG_ANYTHING, }; BPF_CALL_4(bpf_skb_fib_lookup, struct sk_buff *, skb, struct bpf_fib_lookup *, params, int, plen, u32, flags) { struct net *net = dev_net(skb->dev); int rc = -EAFNOSUPPORT; bool check_mtu = false; if (plen < sizeof(*params)) return -EINVAL; if (flags & ~BPF_FIB_LOOKUP_MASK) return -EINVAL; if (params->tot_len) check_mtu = true; switch (params->family) { #if IS_ENABLED(CONFIG_INET) case AF_INET: rc = bpf_ipv4_fib_lookup(net, params, flags, check_mtu); break; #endif #if IS_ENABLED(CONFIG_IPV6) case AF_INET6: rc = bpf_ipv6_fib_lookup(net, params, flags, check_mtu); break; #endif } if (rc == BPF_FIB_LKUP_RET_SUCCESS && !check_mtu) { struct net_device *dev; /* When tot_len isn't provided by user, check skb * against MTU of FIB lookup resulting net_device */ dev = dev_get_by_index_rcu(net, params->ifindex); if (!is_skb_forwardable(dev, skb)) rc = BPF_FIB_LKUP_RET_FRAG_NEEDED; params->mtu_result = dev->mtu; /* union with tot_len */ } return rc; } static const struct bpf_func_proto bpf_skb_fib_lookup_proto = { .func = bpf_skb_fib_lookup, .gpl_only = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_PTR_TO_MEM, .arg3_type = ARG_CONST_SIZE, .arg4_type = ARG_ANYTHING, }; static struct net_device *__dev_via_ifindex(struct net_device *dev_curr, u32 ifindex) { struct net *netns = dev_net(dev_curr); /* Non-redirect use-cases can use ifindex=0 and save ifindex lookup */ if (ifindex == 0) return dev_curr; return dev_get_by_index_rcu(netns, ifindex); } BPF_CALL_5(bpf_skb_check_mtu, struct sk_buff *, skb, u32, ifindex, u32 *, mtu_len, s32, len_diff, u64, flags) { int ret = BPF_MTU_CHK_RET_FRAG_NEEDED; struct net_device *dev = skb->dev; int mtu, dev_len, skb_len; if (unlikely(flags & ~(BPF_MTU_CHK_SEGS))) return -EINVAL; if (unlikely(flags & BPF_MTU_CHK_SEGS && (len_diff || *mtu_len))) return -EINVAL; dev = __dev_via_ifindex(dev, ifindex); if (unlikely(!dev)) return -ENODEV; mtu = READ_ONCE(dev->mtu); dev_len = mtu + dev->hard_header_len; /* If set use *mtu_len as input, L3 as iph->tot_len (like fib_lookup) */ skb_len = *mtu_len ? 
*mtu_len + dev->hard_header_len : skb->len; skb_len += len_diff; /* minus result pass check */ if (skb_len <= dev_len) { ret = BPF_MTU_CHK_RET_SUCCESS; goto out; } /* At this point, skb->len exceed MTU, but as it include length of all * segments, it can still be below MTU. The SKB can possibly get * re-segmented in transmit path (see validate_xmit_skb). Thus, user * must choose if segs are to be MTU checked. */ if (skb_is_gso(skb)) { ret = BPF_MTU_CHK_RET_SUCCESS; if (flags & BPF_MTU_CHK_SEGS) { if (!skb_transport_header_was_set(skb)) return -EINVAL; if (!skb_gso_validate_network_len(skb, mtu)) ret = BPF_MTU_CHK_RET_SEGS_TOOBIG; } } out: *mtu_len = mtu; return ret; } BPF_CALL_5(bpf_xdp_check_mtu, struct xdp_buff *, xdp, u32, ifindex, u32 *, mtu_len, s32, len_diff, u64, flags) { struct net_device *dev = xdp->rxq->dev; int xdp_len = xdp->data_end - xdp->data; int ret = BPF_MTU_CHK_RET_SUCCESS; int mtu, dev_len; /* XDP variant doesn't support multi-buffer segment check (yet) */ if (unlikely(flags)) return -EINVAL; dev = __dev_via_ifindex(dev, ifindex); if (unlikely(!dev)) return -ENODEV; mtu = READ_ONCE(dev->mtu); dev_len = mtu + dev->hard_header_len; /* Use *mtu_len as input, L3 as iph->tot_len (like fib_lookup) */ if (*mtu_len) xdp_len = *mtu_len + dev->hard_header_len; xdp_len += len_diff; /* minus result pass check */ if (xdp_len > dev_len) ret = BPF_MTU_CHK_RET_FRAG_NEEDED; *mtu_len = mtu; return ret; } static const struct bpf_func_proto bpf_skb_check_mtu_proto = { .func = bpf_skb_check_mtu, .gpl_only = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_PTR_TO_FIXED_SIZE_MEM | MEM_WRITE | MEM_ALIGNED, .arg3_size = sizeof(u32), .arg4_type = ARG_ANYTHING, .arg5_type = ARG_ANYTHING, }; static const struct bpf_func_proto bpf_xdp_check_mtu_proto = { .func = bpf_xdp_check_mtu, .gpl_only = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_PTR_TO_FIXED_SIZE_MEM | MEM_WRITE | MEM_ALIGNED, .arg3_size = sizeof(u32), .arg4_type = ARG_ANYTHING, .arg5_type = ARG_ANYTHING, }; #if IS_ENABLED(CONFIG_IPV6_SEG6_BPF) static int bpf_push_seg6_encap(struct sk_buff *skb, u32 type, void *hdr, u32 len) { int err; struct ipv6_sr_hdr *srh = (struct ipv6_sr_hdr *)hdr; if (!seg6_validate_srh(srh, len, false)) return -EINVAL; switch (type) { case BPF_LWT_ENCAP_SEG6_INLINE: if (skb->protocol != htons(ETH_P_IPV6)) return -EBADMSG; err = seg6_do_srh_inline(skb, srh); break; case BPF_LWT_ENCAP_SEG6: skb_reset_inner_headers(skb); skb->encapsulation = 1; err = seg6_do_srh_encap(skb, srh, IPPROTO_IPV6); break; default: return -EINVAL; } bpf_compute_data_pointers(skb); if (err) return err; skb_set_transport_header(skb, sizeof(struct ipv6hdr)); return seg6_lookup_nexthop(skb, NULL, 0); } #endif /* CONFIG_IPV6_SEG6_BPF */ #if IS_ENABLED(CONFIG_LWTUNNEL_BPF) static int bpf_push_ip_encap(struct sk_buff *skb, void *hdr, u32 len, bool ingress) { return bpf_lwt_push_ip_encap(skb, hdr, len, ingress); } #endif BPF_CALL_4(bpf_lwt_in_push_encap, struct sk_buff *, skb, u32, type, void *, hdr, u32, len) { switch (type) { #if IS_ENABLED(CONFIG_IPV6_SEG6_BPF) case BPF_LWT_ENCAP_SEG6: case BPF_LWT_ENCAP_SEG6_INLINE: return bpf_push_seg6_encap(skb, type, hdr, len); #endif #if IS_ENABLED(CONFIG_LWTUNNEL_BPF) case BPF_LWT_ENCAP_IP: return bpf_push_ip_encap(skb, hdr, len, true /* ingress */); #endif default: return -EINVAL; } } BPF_CALL_4(bpf_lwt_xmit_push_encap, struct sk_buff *, skb, u32, type, void *, hdr, u32, len) { switch (type) { 
#if IS_ENABLED(CONFIG_LWTUNNEL_BPF) case BPF_LWT_ENCAP_IP: return bpf_push_ip_encap(skb, hdr, len, false /* egress */); #endif default: return -EINVAL; } } static const struct bpf_func_proto bpf_lwt_in_push_encap_proto = { .func = bpf_lwt_in_push_encap, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg4_type = ARG_CONST_SIZE }; static const struct bpf_func_proto bpf_lwt_xmit_push_encap_proto = { .func = bpf_lwt_xmit_push_encap, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg4_type = ARG_CONST_SIZE }; #if IS_ENABLED(CONFIG_IPV6_SEG6_BPF) BPF_CALL_4(bpf_lwt_seg6_store_bytes, struct sk_buff *, skb, u32, offset, const void *, from, u32, len) { struct seg6_bpf_srh_state *srh_state = this_cpu_ptr(&seg6_bpf_srh_states); struct ipv6_sr_hdr *srh = srh_state->srh; void *srh_tlvs, *srh_end, *ptr; int srhoff = 0; lockdep_assert_held(&srh_state->bh_lock); if (srh == NULL) return -EINVAL; srh_tlvs = (void *)((char *)srh + ((srh->first_segment + 1) << 4)); srh_end = (void *)((char *)srh + sizeof(*srh) + srh_state->hdrlen); ptr = skb->data + offset; if (ptr >= srh_tlvs && ptr + len <= srh_end) srh_state->valid = false; else if (ptr < (void *)&srh->flags || ptr + len > (void *)&srh->segments) return -EFAULT; if (unlikely(bpf_try_make_writable(skb, offset + len))) return -EFAULT; if (ipv6_find_hdr(skb, &srhoff, IPPROTO_ROUTING, NULL, NULL) < 0) return -EINVAL; srh_state->srh = (struct ipv6_sr_hdr *)(skb->data + srhoff); memcpy(skb->data + offset, from, len); return 0; } static const struct bpf_func_proto bpf_lwt_seg6_store_bytes_proto = { .func = bpf_lwt_seg6_store_bytes, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg4_type = ARG_CONST_SIZE }; static void bpf_update_srh_state(struct sk_buff *skb) { struct seg6_bpf_srh_state *srh_state = this_cpu_ptr(&seg6_bpf_srh_states); int srhoff = 0; if (ipv6_find_hdr(skb, &srhoff, IPPROTO_ROUTING, NULL, NULL) < 0) { srh_state->srh = NULL; } else { srh_state->srh = (struct ipv6_sr_hdr *)(skb->data + srhoff); srh_state->hdrlen = srh_state->srh->hdrlen << 3; srh_state->valid = true; } } BPF_CALL_4(bpf_lwt_seg6_action, struct sk_buff *, skb, u32, action, void *, param, u32, param_len) { struct seg6_bpf_srh_state *srh_state = this_cpu_ptr(&seg6_bpf_srh_states); int hdroff = 0; int err; lockdep_assert_held(&srh_state->bh_lock); switch (action) { case SEG6_LOCAL_ACTION_END_X: if (!seg6_bpf_has_valid_srh(skb)) return -EBADMSG; if (param_len != sizeof(struct in6_addr)) return -EINVAL; return seg6_lookup_nexthop(skb, (struct in6_addr *)param, 0); case SEG6_LOCAL_ACTION_END_T: if (!seg6_bpf_has_valid_srh(skb)) return -EBADMSG; if (param_len != sizeof(int)) return -EINVAL; return seg6_lookup_nexthop(skb, NULL, *(int *)param); case SEG6_LOCAL_ACTION_END_DT6: if (!seg6_bpf_has_valid_srh(skb)) return -EBADMSG; if (param_len != sizeof(int)) return -EINVAL; if (ipv6_find_hdr(skb, &hdroff, IPPROTO_IPV6, NULL, NULL) < 0) return -EBADMSG; if (!pskb_pull(skb, hdroff)) return -EBADMSG; skb_postpull_rcsum(skb, skb_network_header(skb), hdroff); skb_reset_network_header(skb); skb_reset_transport_header(skb); skb->encapsulation = 0; bpf_compute_data_pointers(skb); bpf_update_srh_state(skb); return seg6_lookup_nexthop(skb, NULL, *(int *)param); case SEG6_LOCAL_ACTION_END_B6: if (srh_state->srh && 
!seg6_bpf_has_valid_srh(skb)) return -EBADMSG; err = bpf_push_seg6_encap(skb, BPF_LWT_ENCAP_SEG6_INLINE, param, param_len); if (!err) bpf_update_srh_state(skb); return err; case SEG6_LOCAL_ACTION_END_B6_ENCAP: if (srh_state->srh && !seg6_bpf_has_valid_srh(skb)) return -EBADMSG; err = bpf_push_seg6_encap(skb, BPF_LWT_ENCAP_SEG6, param, param_len); if (!err) bpf_update_srh_state(skb); return err; default: return -EINVAL; } } static const struct bpf_func_proto bpf_lwt_seg6_action_proto = { .func = bpf_lwt_seg6_action, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg4_type = ARG_CONST_SIZE }; BPF_CALL_3(bpf_lwt_seg6_adjust_srh, struct sk_buff *, skb, u32, offset, s32, len) { struct seg6_bpf_srh_state *srh_state = this_cpu_ptr(&seg6_bpf_srh_states); struct ipv6_sr_hdr *srh = srh_state->srh; void *srh_end, *srh_tlvs, *ptr; struct ipv6hdr *hdr; int srhoff = 0; int ret; lockdep_assert_held(&srh_state->bh_lock); if (unlikely(srh == NULL)) return -EINVAL; srh_tlvs = (void *)((unsigned char *)srh + sizeof(*srh) + ((srh->first_segment + 1) << 4)); srh_end = (void *)((unsigned char *)srh + sizeof(*srh) + srh_state->hdrlen); ptr = skb->data + offset; if (unlikely(ptr < srh_tlvs || ptr > srh_end)) return -EFAULT; if (unlikely(len < 0 && (void *)((char *)ptr - len) > srh_end)) return -EFAULT; if (len > 0) { ret = skb_cow_head(skb, len); if (unlikely(ret < 0)) return ret; ret = bpf_skb_net_hdr_push(skb, offset, len); } else { ret = bpf_skb_net_hdr_pop(skb, offset, -1 * len); } bpf_compute_data_pointers(skb); if (unlikely(ret < 0)) return ret; hdr = (struct ipv6hdr *)skb->data; hdr->payload_len = htons(skb->len - sizeof(struct ipv6hdr)); if (ipv6_find_hdr(skb, &srhoff, IPPROTO_ROUTING, NULL, NULL) < 0) return -EINVAL; srh_state->srh = (struct ipv6_sr_hdr *)(skb->data + srhoff); srh_state->hdrlen += len; srh_state->valid = false; return 0; } static const struct bpf_func_proto bpf_lwt_seg6_adjust_srh_proto = { .func = bpf_lwt_seg6_adjust_srh, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_ANYTHING, }; #endif /* CONFIG_IPV6_SEG6_BPF */ #ifdef CONFIG_INET static struct sock *sk_lookup(struct net *net, struct bpf_sock_tuple *tuple, int dif, int sdif, u8 family, u8 proto) { bool refcounted = false; struct sock *sk = NULL; if (family == AF_INET) { __be32 src4 = tuple->ipv4.saddr; __be32 dst4 = tuple->ipv4.daddr; if (proto == IPPROTO_TCP) sk = __inet_lookup(net, NULL, 0, src4, tuple->ipv4.sport, dst4, tuple->ipv4.dport, dif, sdif, &refcounted); else sk = __udp4_lib_lookup(net, src4, tuple->ipv4.sport, dst4, tuple->ipv4.dport, dif, sdif, net->ipv4.udp_table, NULL); #if IS_ENABLED(CONFIG_IPV6) } else { struct in6_addr *src6 = (struct in6_addr *)&tuple->ipv6.saddr; struct in6_addr *dst6 = (struct in6_addr *)&tuple->ipv6.daddr; if (proto == IPPROTO_TCP) sk = __inet6_lookup(net, NULL, 0, src6, tuple->ipv6.sport, dst6, ntohs(tuple->ipv6.dport), dif, sdif, &refcounted); else if (likely(ipv6_bpf_stub)) sk = ipv6_bpf_stub->udp6_lib_lookup(net, src6, tuple->ipv6.sport, dst6, tuple->ipv6.dport, dif, sdif, net->ipv4.udp_table, NULL); #endif } if (unlikely(sk && !refcounted && !sock_flag(sk, SOCK_RCU_FREE))) { WARN_ONCE(1, "Found non-RCU, unreferenced socket!"); sk = NULL; } return sk; } /* bpf_skc_lookup performs the core lookup for different types of sockets, * taking a reference on the socket if it doesn't have the flag SOCK_RCU_FREE. 
*/ static struct sock * __bpf_skc_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len, struct net *caller_net, u32 ifindex, u8 proto, u64 netns_id, u64 flags, int sdif) { struct sock *sk = NULL; struct net *net; u8 family; if (len == sizeof(tuple->ipv4)) family = AF_INET; else if (len == sizeof(tuple->ipv6)) family = AF_INET6; else return NULL; if (unlikely(flags || !((s32)netns_id < 0 || netns_id <= S32_MAX))) goto out; if (sdif < 0) { if (family == AF_INET) sdif = inet_sdif(skb); else sdif = inet6_sdif(skb); } if ((s32)netns_id < 0) { net = caller_net; sk = sk_lookup(net, tuple, ifindex, sdif, family, proto); } else { net = get_net_ns_by_id(caller_net, netns_id); if (unlikely(!net)) goto out; sk = sk_lookup(net, tuple, ifindex, sdif, family, proto); put_net(net); } out: return sk; } static struct sock * __bpf_sk_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len, struct net *caller_net, u32 ifindex, u8 proto, u64 netns_id, u64 flags, int sdif) { struct sock *sk = __bpf_skc_lookup(skb, tuple, len, caller_net, ifindex, proto, netns_id, flags, sdif); if (sk) { struct sock *sk2 = sk_to_full_sk(sk); /* sk_to_full_sk() may return (sk)->rsk_listener, so make sure the original sk * sock refcnt is decremented to prevent a request_sock leak. */ if (sk2 != sk) { sock_gen_put(sk); /* Ensure there is no need to bump sk2 refcnt */ if (unlikely(sk2 && !sock_flag(sk2, SOCK_RCU_FREE))) { WARN_ONCE(1, "Found non-RCU, unreferenced socket!"); return NULL; } sk = sk2; } } return sk; } static struct sock * bpf_skc_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len, u8 proto, u64 netns_id, u64 flags) { struct net *caller_net; int ifindex; if (skb->dev) { caller_net = dev_net(skb->dev); ifindex = skb->dev->ifindex; } else { caller_net = sock_net(skb->sk); ifindex = 0; } return __bpf_skc_lookup(skb, tuple, len, caller_net, ifindex, proto, netns_id, flags, -1); } static struct sock * bpf_sk_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len, u8 proto, u64 netns_id, u64 flags) { struct sock *sk = bpf_skc_lookup(skb, tuple, len, proto, netns_id, flags); if (sk) { struct sock *sk2 = sk_to_full_sk(sk); /* sk_to_full_sk() may return (sk)->rsk_listener, so make sure the original sk * sock refcnt is decremented to prevent a request_sock leak. 
*/ if (sk2 != sk) { sock_gen_put(sk); /* Ensure there is no need to bump sk2 refcnt */ if (unlikely(sk2 && !sock_flag(sk2, SOCK_RCU_FREE))) { WARN_ONCE(1, "Found non-RCU, unreferenced socket!"); return NULL; } sk = sk2; } } return sk; } BPF_CALL_5(bpf_skc_lookup_tcp, struct sk_buff *, skb, struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags) { return (unsigned long)bpf_skc_lookup(skb, tuple, len, IPPROTO_TCP, netns_id, flags); } static const struct bpf_func_proto bpf_skc_lookup_tcp_proto = { .func = bpf_skc_lookup_tcp, .gpl_only = false, .pkt_access = true, .ret_type = RET_PTR_TO_SOCK_COMMON_OR_NULL, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE_OR_ZERO, .arg4_type = ARG_ANYTHING, .arg5_type = ARG_ANYTHING, }; BPF_CALL_5(bpf_sk_lookup_tcp, struct sk_buff *, skb, struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags) { return (unsigned long)bpf_sk_lookup(skb, tuple, len, IPPROTO_TCP, netns_id, flags); } static const struct bpf_func_proto bpf_sk_lookup_tcp_proto = { .func = bpf_sk_lookup_tcp, .gpl_only = false, .pkt_access = true, .ret_type = RET_PTR_TO_SOCKET_OR_NULL, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE_OR_ZERO, .arg4_type = ARG_ANYTHING, .arg5_type = ARG_ANYTHING, }; BPF_CALL_5(bpf_sk_lookup_udp, struct sk_buff *, skb, struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags) { return (unsigned long)bpf_sk_lookup(skb, tuple, len, IPPROTO_UDP, netns_id, flags); } static const struct bpf_func_proto bpf_sk_lookup_udp_proto = { .func = bpf_sk_lookup_udp, .gpl_only = false, .pkt_access = true, .ret_type = RET_PTR_TO_SOCKET_OR_NULL, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE_OR_ZERO, .arg4_type = ARG_ANYTHING, .arg5_type = ARG_ANYTHING, }; BPF_CALL_5(bpf_tc_skc_lookup_tcp, struct sk_buff *, skb, struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags) { struct net_device *dev = skb->dev; int ifindex = dev->ifindex, sdif = dev_sdif(dev); struct net *caller_net = dev_net(dev); return (unsigned long)__bpf_skc_lookup(skb, tuple, len, caller_net, ifindex, IPPROTO_TCP, netns_id, flags, sdif); } static const struct bpf_func_proto bpf_tc_skc_lookup_tcp_proto = { .func = bpf_tc_skc_lookup_tcp, .gpl_only = false, .pkt_access = true, .ret_type = RET_PTR_TO_SOCK_COMMON_OR_NULL, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE_OR_ZERO, .arg4_type = ARG_ANYTHING, .arg5_type = ARG_ANYTHING, }; BPF_CALL_5(bpf_tc_sk_lookup_tcp, struct sk_buff *, skb, struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags) { struct net_device *dev = skb->dev; int ifindex = dev->ifindex, sdif = dev_sdif(dev); struct net *caller_net = dev_net(dev); return (unsigned long)__bpf_sk_lookup(skb, tuple, len, caller_net, ifindex, IPPROTO_TCP, netns_id, flags, sdif); } static const struct bpf_func_proto bpf_tc_sk_lookup_tcp_proto = { .func = bpf_tc_sk_lookup_tcp, .gpl_only = false, .pkt_access = true, .ret_type = RET_PTR_TO_SOCKET_OR_NULL, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE_OR_ZERO, .arg4_type = ARG_ANYTHING, .arg5_type = ARG_ANYTHING, }; BPF_CALL_5(bpf_tc_sk_lookup_udp, struct sk_buff *, skb, struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags) { struct net_device *dev = skb->dev; int ifindex = dev->ifindex, sdif = dev_sdif(dev); struct net *caller_net = dev_net(dev); return 
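/* Illustrative sketch (not part of this file): from a tc BPF program, the
 * bpf_sk_lookup_tcp()/bpf_sk_release() pair wired up through the protos
 * around here is typically used like this; the tuple values are placeholders
 * that would normally be filled from the parsed packet headers:
 *
 *	struct bpf_sock_tuple tuple = {};
 *	struct bpf_sock *sk;
 *
 *	tuple.ipv4.daddr = dst_ip;	// hypothetical parsed values
 *	tuple.ipv4.dport = dst_port;
 *	sk = bpf_sk_lookup_tcp(skb, &tuple, sizeof(tuple.ipv4),
 *			       BPF_F_CURRENT_NETNS, 0);
 *	if (sk)
 *		bpf_sk_release(sk);	// required for refcounted sockets
 */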
(unsigned long)__bpf_sk_lookup(skb, tuple, len, caller_net, ifindex, IPPROTO_UDP, netns_id, flags, sdif); } static const struct bpf_func_proto bpf_tc_sk_lookup_udp_proto = { .func = bpf_tc_sk_lookup_udp, .gpl_only = false, .pkt_access = true, .ret_type = RET_PTR_TO_SOCKET_OR_NULL, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE_OR_ZERO, .arg4_type = ARG_ANYTHING, .arg5_type = ARG_ANYTHING, }; BPF_CALL_1(bpf_sk_release, struct sock *, sk) { if (sk && sk_is_refcounted(sk)) sock_gen_put(sk); return 0; } static const struct bpf_func_proto bpf_sk_release_proto = { .func = bpf_sk_release, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON | OBJ_RELEASE, }; BPF_CALL_5(bpf_xdp_sk_lookup_udp, struct xdp_buff *, ctx, struct bpf_sock_tuple *, tuple, u32, len, u32, netns_id, u64, flags) { struct net_device *dev = ctx->rxq->dev; int ifindex = dev->ifindex, sdif = dev_sdif(dev); struct net *caller_net = dev_net(dev); return (unsigned long)__bpf_sk_lookup(NULL, tuple, len, caller_net, ifindex, IPPROTO_UDP, netns_id, flags, sdif); } static const struct bpf_func_proto bpf_xdp_sk_lookup_udp_proto = { .func = bpf_xdp_sk_lookup_udp, .gpl_only = false, .pkt_access = true, .ret_type = RET_PTR_TO_SOCKET_OR_NULL, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE_OR_ZERO, .arg4_type = ARG_ANYTHING, .arg5_type = ARG_ANYTHING, }; BPF_CALL_5(bpf_xdp_skc_lookup_tcp, struct xdp_buff *, ctx, struct bpf_sock_tuple *, tuple, u32, len, u32, netns_id, u64, flags) { struct net_device *dev = ctx->rxq->dev; int ifindex = dev->ifindex, sdif = dev_sdif(dev); struct net *caller_net = dev_net(dev); return (unsigned long)__bpf_skc_lookup(NULL, tuple, len, caller_net, ifindex, IPPROTO_TCP, netns_id, flags, sdif); } static const struct bpf_func_proto bpf_xdp_skc_lookup_tcp_proto = { .func = bpf_xdp_skc_lookup_tcp, .gpl_only = false, .pkt_access = true, .ret_type = RET_PTR_TO_SOCK_COMMON_OR_NULL, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE_OR_ZERO, .arg4_type = ARG_ANYTHING, .arg5_type = ARG_ANYTHING, }; BPF_CALL_5(bpf_xdp_sk_lookup_tcp, struct xdp_buff *, ctx, struct bpf_sock_tuple *, tuple, u32, len, u32, netns_id, u64, flags) { struct net_device *dev = ctx->rxq->dev; int ifindex = dev->ifindex, sdif = dev_sdif(dev); struct net *caller_net = dev_net(dev); return (unsigned long)__bpf_sk_lookup(NULL, tuple, len, caller_net, ifindex, IPPROTO_TCP, netns_id, flags, sdif); } static const struct bpf_func_proto bpf_xdp_sk_lookup_tcp_proto = { .func = bpf_xdp_sk_lookup_tcp, .gpl_only = false, .pkt_access = true, .ret_type = RET_PTR_TO_SOCKET_OR_NULL, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE_OR_ZERO, .arg4_type = ARG_ANYTHING, .arg5_type = ARG_ANYTHING, }; BPF_CALL_5(bpf_sock_addr_skc_lookup_tcp, struct bpf_sock_addr_kern *, ctx, struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags) { return (unsigned long)__bpf_skc_lookup(NULL, tuple, len, sock_net(ctx->sk), 0, IPPROTO_TCP, netns_id, flags, -1); } static const struct bpf_func_proto bpf_sock_addr_skc_lookup_tcp_proto = { .func = bpf_sock_addr_skc_lookup_tcp, .gpl_only = false, .ret_type = RET_PTR_TO_SOCK_COMMON_OR_NULL, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE_OR_ZERO, .arg4_type = ARG_ANYTHING, .arg5_type = ARG_ANYTHING, }; BPF_CALL_5(bpf_sock_addr_sk_lookup_tcp, struct 
bpf_sock_addr_kern *, ctx, struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags) { return (unsigned long)__bpf_sk_lookup(NULL, tuple, len, sock_net(ctx->sk), 0, IPPROTO_TCP, netns_id, flags, -1); } static const struct bpf_func_proto bpf_sock_addr_sk_lookup_tcp_proto = { .func = bpf_sock_addr_sk_lookup_tcp, .gpl_only = false, .ret_type = RET_PTR_TO_SOCKET_OR_NULL, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE_OR_ZERO, .arg4_type = ARG_ANYTHING, .arg5_type = ARG_ANYTHING, }; BPF_CALL_5(bpf_sock_addr_sk_lookup_udp, struct bpf_sock_addr_kern *, ctx, struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags) { return (unsigned long)__bpf_sk_lookup(NULL, tuple, len, sock_net(ctx->sk), 0, IPPROTO_UDP, netns_id, flags, -1); } static const struct bpf_func_proto bpf_sock_addr_sk_lookup_udp_proto = { .func = bpf_sock_addr_sk_lookup_udp, .gpl_only = false, .ret_type = RET_PTR_TO_SOCKET_OR_NULL, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE_OR_ZERO, .arg4_type = ARG_ANYTHING, .arg5_type = ARG_ANYTHING, }; bool bpf_tcp_sock_is_valid_access(int off, int size, enum bpf_access_type type, struct bpf_insn_access_aux *info) { if (off < 0 || off >= offsetofend(struct bpf_tcp_sock, icsk_retransmits)) return false; if (off % size != 0) return false; switch (off) { case offsetof(struct bpf_tcp_sock, bytes_received): case offsetof(struct bpf_tcp_sock, bytes_acked): return size == sizeof(__u64); default: return size == sizeof(__u32); } } u32 bpf_tcp_sock_convert_ctx_access(enum bpf_access_type type, const struct bpf_insn *si, struct bpf_insn *insn_buf, struct bpf_prog *prog, u32 *target_size) { struct bpf_insn *insn = insn_buf; #define BPF_TCP_SOCK_GET_COMMON(FIELD) \ do { \ BUILD_BUG_ON(sizeof_field(struct tcp_sock, FIELD) > \ sizeof_field(struct bpf_tcp_sock, FIELD)); \ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct tcp_sock, FIELD),\ si->dst_reg, si->src_reg, \ offsetof(struct tcp_sock, FIELD)); \ } while (0) #define BPF_INET_SOCK_GET_COMMON(FIELD) \ do { \ BUILD_BUG_ON(sizeof_field(struct inet_connection_sock, \ FIELD) > \ sizeof_field(struct bpf_tcp_sock, FIELD)); \ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( \ struct inet_connection_sock, \ FIELD), \ si->dst_reg, si->src_reg, \ offsetof( \ struct inet_connection_sock, \ FIELD)); \ } while (0) BTF_TYPE_EMIT(struct bpf_tcp_sock); switch (si->off) { case offsetof(struct bpf_tcp_sock, rtt_min): BUILD_BUG_ON(sizeof_field(struct tcp_sock, rtt_min) != sizeof(struct minmax)); BUILD_BUG_ON(sizeof(struct minmax) < sizeof(struct minmax_sample)); *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, offsetof(struct tcp_sock, rtt_min) + offsetof(struct minmax_sample, v)); break; case offsetof(struct bpf_tcp_sock, snd_cwnd): BPF_TCP_SOCK_GET_COMMON(snd_cwnd); break; case offsetof(struct bpf_tcp_sock, srtt_us): BPF_TCP_SOCK_GET_COMMON(srtt_us); break; case offsetof(struct bpf_tcp_sock, snd_ssthresh): BPF_TCP_SOCK_GET_COMMON(snd_ssthresh); break; case offsetof(struct bpf_tcp_sock, rcv_nxt): BPF_TCP_SOCK_GET_COMMON(rcv_nxt); break; case offsetof(struct bpf_tcp_sock, snd_nxt): BPF_TCP_SOCK_GET_COMMON(snd_nxt); break; case offsetof(struct bpf_tcp_sock, snd_una): BPF_TCP_SOCK_GET_COMMON(snd_una); break; case offsetof(struct bpf_tcp_sock, mss_cache): BPF_TCP_SOCK_GET_COMMON(mss_cache); break; case offsetof(struct bpf_tcp_sock, ecn_flags): BPF_TCP_SOCK_GET_COMMON(ecn_flags); break; case offsetof(struct bpf_tcp_sock, rate_delivered): 
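/* Illustrative sketch (not part of this file): the field loads rewritten by
 * the bpf_tcp_sock accessors in this switch are what a program performs when
 * it dereferences a struct bpf_tcp_sock pointer, e.g. in a cgroup_skb
 * program (return 1 allows the packet; the cwnd threshold is illustrative):
 *
 *	struct bpf_sock *sk = skb->sk;
 *	struct bpf_tcp_sock *tp;
 *
 *	if (!sk)
 *		return 1;
 *	sk = bpf_sk_fullsock(sk);
 *	if (!sk || sk->protocol != IPPROTO_TCP)
 *		return 1;
 *	tp = bpf_tcp_sock(sk);
 *	if (tp && tp->snd_cwnd < 10)
 *		bpf_printk("small cwnd %u", tp->snd_cwnd);
 *	return 1;
 */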
BPF_TCP_SOCK_GET_COMMON(rate_delivered); break; case offsetof(struct bpf_tcp_sock, rate_interval_us): BPF_TCP_SOCK_GET_COMMON(rate_interval_us); break; case offsetof(struct bpf_tcp_sock, packets_out): BPF_TCP_SOCK_GET_COMMON(packets_out); break; case offsetof(struct bpf_tcp_sock, retrans_out): BPF_TCP_SOCK_GET_COMMON(retrans_out); break; case offsetof(struct bpf_tcp_sock, total_retrans): BPF_TCP_SOCK_GET_COMMON(total_retrans); break; case offsetof(struct bpf_tcp_sock, segs_in): BPF_TCP_SOCK_GET_COMMON(segs_in); break; case offsetof(struct bpf_tcp_sock, data_segs_in): BPF_TCP_SOCK_GET_COMMON(data_segs_in); break; case offsetof(struct bpf_tcp_sock, segs_out): BPF_TCP_SOCK_GET_COMMON(segs_out); break; case offsetof(struct bpf_tcp_sock, data_segs_out): BPF_TCP_SOCK_GET_COMMON(data_segs_out); break; case offsetof(struct bpf_tcp_sock, lost_out): BPF_TCP_SOCK_GET_COMMON(lost_out); break; case offsetof(struct bpf_tcp_sock, sacked_out): BPF_TCP_SOCK_GET_COMMON(sacked_out); break; case offsetof(struct bpf_tcp_sock, bytes_received): BPF_TCP_SOCK_GET_COMMON(bytes_received); break; case offsetof(struct bpf_tcp_sock, bytes_acked): BPF_TCP_SOCK_GET_COMMON(bytes_acked); break; case offsetof(struct bpf_tcp_sock, dsack_dups): BPF_TCP_SOCK_GET_COMMON(dsack_dups); break; case offsetof(struct bpf_tcp_sock, delivered): BPF_TCP_SOCK_GET_COMMON(delivered); break; case offsetof(struct bpf_tcp_sock, delivered_ce): BPF_TCP_SOCK_GET_COMMON(delivered_ce); break; case offsetof(struct bpf_tcp_sock, icsk_retransmits): BPF_INET_SOCK_GET_COMMON(icsk_retransmits); break; } return insn - insn_buf; } BPF_CALL_1(bpf_tcp_sock, struct sock *, sk) { if (sk_fullsock(sk) && sk->sk_protocol == IPPROTO_TCP) return (unsigned long)sk; return (unsigned long)NULL; } const struct bpf_func_proto bpf_tcp_sock_proto = { .func = bpf_tcp_sock, .gpl_only = false, .ret_type = RET_PTR_TO_TCP_SOCK_OR_NULL, .arg1_type = ARG_PTR_TO_SOCK_COMMON, }; BPF_CALL_1(bpf_get_listener_sock, struct sock *, sk) { sk = sk_to_full_sk(sk); if (sk && sk->sk_state == TCP_LISTEN && sock_flag(sk, SOCK_RCU_FREE)) return (unsigned long)sk; return (unsigned long)NULL; } static const struct bpf_func_proto bpf_get_listener_sock_proto = { .func = bpf_get_listener_sock, .gpl_only = false, .ret_type = RET_PTR_TO_SOCKET_OR_NULL, .arg1_type = ARG_PTR_TO_SOCK_COMMON, }; BPF_CALL_1(bpf_skb_ecn_set_ce, struct sk_buff *, skb) { unsigned int iphdr_len; switch (skb_protocol(skb, true)) { case cpu_to_be16(ETH_P_IP): iphdr_len = sizeof(struct iphdr); break; case cpu_to_be16(ETH_P_IPV6): iphdr_len = sizeof(struct ipv6hdr); break; default: return 0; } if (skb_headlen(skb) < iphdr_len) return 0; if (skb_cloned(skb) && !skb_clone_writable(skb, iphdr_len)) return 0; return INET_ECN_set_ce(skb); } bool bpf_xdp_sock_is_valid_access(int off, int size, enum bpf_access_type type, struct bpf_insn_access_aux *info) { if (off < 0 || off >= offsetofend(struct bpf_xdp_sock, queue_id)) return false; if (off % size != 0) return false; switch (off) { default: return size == sizeof(__u32); } } u32 bpf_xdp_sock_convert_ctx_access(enum bpf_access_type type, const struct bpf_insn *si, struct bpf_insn *insn_buf, struct bpf_prog *prog, u32 *target_size) { struct bpf_insn *insn = insn_buf; #define BPF_XDP_SOCK_GET(FIELD) \ do { \ BUILD_BUG_ON(sizeof_field(struct xdp_sock, FIELD) > \ sizeof_field(struct bpf_xdp_sock, FIELD)); \ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_sock, FIELD),\ si->dst_reg, si->src_reg, \ offsetof(struct xdp_sock, FIELD)); \ } while (0) BTF_TYPE_EMIT(struct bpf_xdp_sock); switch 
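/* Illustrative sketch (not part of this file): bpf_skb_ecn_set_ce(), defined
 * above, is typically called from a cgroup_skb/egress program to mark an ECT
 * packet with CE when the program decides the flow is congested; the
 * congestion condition below is purely a placeholder:
 *
 *	if (bytes_in_flight > congestion_threshold)
 *		bpf_skb_ecn_set_ce(skb);
 *	return 1;
 */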
(si->off) { case offsetof(struct bpf_xdp_sock, queue_id): BPF_XDP_SOCK_GET(queue_id); break; } return insn - insn_buf; } static const struct bpf_func_proto bpf_skb_ecn_set_ce_proto = { .func = bpf_skb_ecn_set_ce, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, }; BPF_CALL_5(bpf_tcp_check_syncookie, struct sock *, sk, void *, iph, u32, iph_len, struct tcphdr *, th, u32, th_len) { #ifdef CONFIG_SYN_COOKIES int ret; if (unlikely(!sk || th_len < sizeof(*th))) return -EINVAL; /* sk_listener() allows TCP_NEW_SYN_RECV, which makes no sense here. */ if (sk->sk_protocol != IPPROTO_TCP || sk->sk_state != TCP_LISTEN) return -EINVAL; if (!READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_syncookies)) return -EINVAL; if (!th->ack || th->rst || th->syn) return -ENOENT; if (unlikely(iph_len < sizeof(struct iphdr))) return -EINVAL; if (tcp_synq_no_recent_overflow(sk)) return -ENOENT; /* Both struct iphdr and struct ipv6hdr have the version field at the * same offset so we can cast to the shorter header (struct iphdr). */ switch (((struct iphdr *)iph)->version) { case 4: if (sk->sk_family == AF_INET6 && ipv6_only_sock(sk)) return -EINVAL; ret = __cookie_v4_check((struct iphdr *)iph, th); break; #if IS_BUILTIN(CONFIG_IPV6) case 6: if (unlikely(iph_len < sizeof(struct ipv6hdr))) return -EINVAL; if (sk->sk_family != AF_INET6) return -EINVAL; ret = __cookie_v6_check((struct ipv6hdr *)iph, th); break; #endif /* CONFIG_IPV6 */ default: return -EPROTONOSUPPORT; } if (ret > 0) return 0; return -ENOENT; #else return -ENOTSUPP; #endif } static const struct bpf_func_proto bpf_tcp_check_syncookie_proto = { .func = bpf_tcp_check_syncookie, .gpl_only = true, .pkt_access = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON, .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE, .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg5_type = ARG_CONST_SIZE, }; BPF_CALL_5(bpf_tcp_gen_syncookie, struct sock *, sk, void *, iph, u32, iph_len, struct tcphdr *, th, u32, th_len) { #ifdef CONFIG_SYN_COOKIES u32 cookie; u16 mss; if (unlikely(!sk || th_len < sizeof(*th) || th_len != th->doff * 4)) return -EINVAL; if (sk->sk_protocol != IPPROTO_TCP || sk->sk_state != TCP_LISTEN) return -EINVAL; if (!READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_syncookies)) return -ENOENT; if (!th->syn || th->ack || th->fin || th->rst) return -EINVAL; if (unlikely(iph_len < sizeof(struct iphdr))) return -EINVAL; /* Both struct iphdr and struct ipv6hdr have the version field at the * same offset so we can cast to the shorter header (struct iphdr). 
*/ switch (((struct iphdr *)iph)->version) { case 4: if (sk->sk_family == AF_INET6 && ipv6_only_sock(sk)) return -EINVAL; mss = tcp_v4_get_syncookie(sk, iph, th, &cookie); break; #if IS_BUILTIN(CONFIG_IPV6) case 6: if (unlikely(iph_len < sizeof(struct ipv6hdr))) return -EINVAL; if (sk->sk_family != AF_INET6) return -EINVAL; mss = tcp_v6_get_syncookie(sk, iph, th, &cookie); break; #endif /* CONFIG_IPV6 */ default: return -EPROTONOSUPPORT; } if (mss == 0) return -ENOENT; return cookie | ((u64)mss << 32); #else return -EOPNOTSUPP; #endif /* CONFIG_SYN_COOKIES */ } static const struct bpf_func_proto bpf_tcp_gen_syncookie_proto = { .func = bpf_tcp_gen_syncookie, .gpl_only = true, /* __cookie_v*_init_sequence() is GPL */ .pkt_access = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON, .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE, .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg5_type = ARG_CONST_SIZE, }; BPF_CALL_3(bpf_sk_assign, struct sk_buff *, skb, struct sock *, sk, u64, flags) { if (!sk || flags != 0) return -EINVAL; if (!skb_at_tc_ingress(skb)) return -EOPNOTSUPP; if (unlikely(dev_net(skb->dev) != sock_net(sk))) return -ENETUNREACH; if (sk_unhashed(sk)) return -EOPNOTSUPP; if (sk_is_refcounted(sk) && unlikely(!refcount_inc_not_zero(&sk->sk_refcnt))) return -ENOENT; skb_orphan(skb); skb->sk = sk; skb->destructor = sock_pfree; return 0; } static const struct bpf_func_proto bpf_sk_assign_proto = { .func = bpf_sk_assign, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON, .arg3_type = ARG_ANYTHING, }; static const u8 *bpf_search_tcp_opt(const u8 *op, const u8 *opend, u8 search_kind, const u8 *magic, u8 magic_len, bool *eol) { u8 kind, kind_len; *eol = false; while (op < opend) { kind = op[0]; if (kind == TCPOPT_EOL) { *eol = true; return ERR_PTR(-ENOMSG); } else if (kind == TCPOPT_NOP) { op++; continue; } if (opend - op < 2 || opend - op < op[1] || op[1] < 2) /* Something is wrong in the received header. * Follow the TCP stack's tcp_parse_options() * and just bail here. */ return ERR_PTR(-EFAULT); kind_len = op[1]; if (search_kind == kind) { if (!magic_len) return op; if (magic_len > kind_len - 2) return ERR_PTR(-ENOMSG); if (!memcmp(&op[2], magic, magic_len)) return op; } op += kind_len; } return ERR_PTR(-ENOMSG); } BPF_CALL_4(bpf_sock_ops_load_hdr_opt, struct bpf_sock_ops_kern *, bpf_sock, void *, search_res, u32, len, u64, flags) { bool eol, load_syn = flags & BPF_LOAD_HDR_OPT_TCP_SYN; const u8 *op, *opend, *magic, *search = search_res; u8 search_kind, search_len, copy_len, magic_len; int ret; if (!is_locked_tcp_sock_ops(bpf_sock)) return -EOPNOTSUPP; /* 2 bytes is the minimal option len, except for TCPOPT_NOP and * TCPOPT_EOL, which are useless for the bpf prog to learn * and which this helper also disallows loading. */ if (len < 2 || flags & ~BPF_LOAD_HDR_OPT_TCP_SYN) return -EINVAL; search_kind = search[0]; search_len = search[1]; if (search_len > len || search_kind == TCPOPT_NOP || search_kind == TCPOPT_EOL) return -EINVAL; if (search_kind == TCPOPT_EXP || search_kind == 253) { /* 16 or 32 bit magic.
+2 for kind and kind length */ if (search_len != 4 && search_len != 6) return -EINVAL; magic = &search[2]; magic_len = search_len - 2; } else { if (search_len) return -EINVAL; magic = NULL; magic_len = 0; } if (load_syn) { ret = bpf_sock_ops_get_syn(bpf_sock, TCP_BPF_SYN, &op); if (ret < 0) return ret; opend = op + ret; op += sizeof(struct tcphdr); } else { if (!bpf_sock->skb || bpf_sock->op == BPF_SOCK_OPS_HDR_OPT_LEN_CB) /* This bpf_sock->op cannot call this helper */ return -EPERM; opend = bpf_sock->skb_data_end; op = bpf_sock->skb->data + sizeof(struct tcphdr); } op = bpf_search_tcp_opt(op, opend, search_kind, magic, magic_len, &eol); if (IS_ERR(op)) return PTR_ERR(op); copy_len = op[1]; ret = copy_len; if (copy_len > len) { ret = -ENOSPC; copy_len = len; } memcpy(search_res, op, copy_len); return ret; } static const struct bpf_func_proto bpf_sock_ops_load_hdr_opt_proto = { .func = bpf_sock_ops_load_hdr_opt, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_PTR_TO_MEM | MEM_WRITE, .arg3_type = ARG_CONST_SIZE, .arg4_type = ARG_ANYTHING, }; BPF_CALL_4(bpf_sock_ops_store_hdr_opt, struct bpf_sock_ops_kern *, bpf_sock, const void *, from, u32, len, u64, flags) { u8 new_kind, new_kind_len, magic_len = 0, *opend; const u8 *op, *new_op, *magic = NULL; struct sk_buff *skb; bool eol; if (bpf_sock->op != BPF_SOCK_OPS_WRITE_HDR_OPT_CB) return -EPERM; if (len < 2 || flags) return -EINVAL; new_op = from; new_kind = new_op[0]; new_kind_len = new_op[1]; if (new_kind_len > len || new_kind == TCPOPT_NOP || new_kind == TCPOPT_EOL) return -EINVAL; if (new_kind_len > bpf_sock->remaining_opt_len) return -ENOSPC; /* 253 is another experimental kind */ if (new_kind == TCPOPT_EXP || new_kind == 253) { if (new_kind_len < 4) return -EINVAL; /* Match for the 2 byte magic also. * RFC 6994: the magic could be 2 or 4 bytes. * Hence, matching by 2 byte only is on the * conservative side but it is the right * thing to do for the 'search-for-duplication' * purpose. */ magic = &new_op[2]; magic_len = 2; } /* Check for duplication */ skb = bpf_sock->skb; op = skb->data + sizeof(struct tcphdr); opend = bpf_sock->skb_data_end; op = bpf_search_tcp_opt(op, opend, new_kind, magic, magic_len, &eol); if (!IS_ERR(op)) return -EEXIST; if (PTR_ERR(op) != -ENOMSG) return PTR_ERR(op); if (eol) /* The option has been ended. Treat it as no more * header option can be written. */ return -ENOSPC; /* No duplication found. Store the header option. 
*/ memcpy(opend, from, new_kind_len); bpf_sock->remaining_opt_len -= new_kind_len; bpf_sock->skb_data_end += new_kind_len; return 0; } static const struct bpf_func_proto bpf_sock_ops_store_hdr_opt_proto = { .func = bpf_sock_ops_store_hdr_opt, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE, .arg4_type = ARG_ANYTHING, }; BPF_CALL_3(bpf_sock_ops_reserve_hdr_opt, struct bpf_sock_ops_kern *, bpf_sock, u32, len, u64, flags) { if (bpf_sock->op != BPF_SOCK_OPS_HDR_OPT_LEN_CB) return -EPERM; if (flags || len < 2) return -EINVAL; if (len > bpf_sock->remaining_opt_len) return -ENOSPC; bpf_sock->remaining_opt_len -= len; return 0; } static const struct bpf_func_proto bpf_sock_ops_reserve_hdr_opt_proto = { .func = bpf_sock_ops_reserve_hdr_opt, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_ANYTHING, }; BPF_CALL_3(bpf_skb_set_tstamp, struct sk_buff *, skb, u64, tstamp, u32, tstamp_type) { /* skb_clear_delivery_time() is done for inet protocol */ if (skb->protocol != htons(ETH_P_IP) && skb->protocol != htons(ETH_P_IPV6)) return -EOPNOTSUPP; switch (tstamp_type) { case BPF_SKB_CLOCK_REALTIME: skb->tstamp = tstamp; skb->tstamp_type = SKB_CLOCK_REALTIME; break; case BPF_SKB_CLOCK_MONOTONIC: if (!tstamp) return -EINVAL; skb->tstamp = tstamp; skb->tstamp_type = SKB_CLOCK_MONOTONIC; break; case BPF_SKB_CLOCK_TAI: if (!tstamp) return -EINVAL; skb->tstamp = tstamp; skb->tstamp_type = SKB_CLOCK_TAI; break; default: return -EINVAL; } return 0; } static const struct bpf_func_proto bpf_skb_set_tstamp_proto = { .func = bpf_skb_set_tstamp, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_ANYTHING, }; #ifdef CONFIG_SYN_COOKIES BPF_CALL_3(bpf_tcp_raw_gen_syncookie_ipv4, struct iphdr *, iph, struct tcphdr *, th, u32, th_len) { u32 cookie; u16 mss; if (unlikely(th_len < sizeof(*th) || th_len != th->doff * 4)) return -EINVAL; mss = tcp_parse_mss_option(th, 0) ?: TCP_MSS_DEFAULT; cookie = __cookie_v4_init_sequence(iph, th, &mss); return cookie | ((u64)mss << 32); } static const struct bpf_func_proto bpf_tcp_raw_gen_syncookie_ipv4_proto = { .func = bpf_tcp_raw_gen_syncookie_ipv4, .gpl_only = true, /* __cookie_v4_init_sequence() is GPL */ .pkt_access = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_FIXED_SIZE_MEM, .arg1_size = sizeof(struct iphdr), .arg2_type = ARG_PTR_TO_MEM, .arg3_type = ARG_CONST_SIZE_OR_ZERO, }; BPF_CALL_3(bpf_tcp_raw_gen_syncookie_ipv6, struct ipv6hdr *, iph, struct tcphdr *, th, u32, th_len) { #if IS_BUILTIN(CONFIG_IPV6) const u16 mss_clamp = IPV6_MIN_MTU - sizeof(struct tcphdr) - sizeof(struct ipv6hdr); u32 cookie; u16 mss; if (unlikely(th_len < sizeof(*th) || th_len != th->doff * 4)) return -EINVAL; mss = tcp_parse_mss_option(th, 0) ?: mss_clamp; cookie = __cookie_v6_init_sequence(iph, th, &mss); return cookie | ((u64)mss << 32); #else return -EPROTONOSUPPORT; #endif } static const struct bpf_func_proto bpf_tcp_raw_gen_syncookie_ipv6_proto = { .func = bpf_tcp_raw_gen_syncookie_ipv6, .gpl_only = true, /* __cookie_v6_init_sequence() is GPL */ .pkt_access = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_FIXED_SIZE_MEM, .arg1_size = sizeof(struct ipv6hdr), .arg2_type = ARG_PTR_TO_MEM, .arg3_type = ARG_CONST_SIZE_OR_ZERO, }; BPF_CALL_2(bpf_tcp_raw_check_syncookie_ipv4, struct iphdr *, iph, struct tcphdr *, th) { if (__cookie_v4_check(iph, th) > 0) return 0; 
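/* Illustrative sketch (not part of this file): from a sockops program, the
 * reserve/store pair implemented above is driven from two callbacks (the
 * write callback must first be enabled via bpf_sock_ops_cb_flags_set(),
 * which this sketch omits). Kind 254 (TCPOPT_EXP) with a 2-byte ExID follows
 * the RFC 6994 layout that the duplication check above expects; the ExID
 * value is a placeholder:
 *
 *	struct { __u8 kind; __u8 len; __u16 exid; } opt = {
 *		.kind = 254, .len = 4, .exid = my_exid,
 *	};
 *
 *	switch (skops->op) {
 *	case BPF_SOCK_OPS_HDR_OPT_LEN_CB:
 *		bpf_reserve_hdr_opt(skops, sizeof(opt), 0);
 *		break;
 *	case BPF_SOCK_OPS_WRITE_HDR_OPT_CB:
 *		bpf_store_hdr_opt(skops, &opt, sizeof(opt), 0);
 *		break;
 *	}
 */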
return -EACCES; } static const struct bpf_func_proto bpf_tcp_raw_check_syncookie_ipv4_proto = { .func = bpf_tcp_raw_check_syncookie_ipv4, .gpl_only = true, /* __cookie_v4_check is GPL */ .pkt_access = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_FIXED_SIZE_MEM, .arg1_size = sizeof(struct iphdr), .arg2_type = ARG_PTR_TO_FIXED_SIZE_MEM, .arg2_size = sizeof(struct tcphdr), }; BPF_CALL_2(bpf_tcp_raw_check_syncookie_ipv6, struct ipv6hdr *, iph, struct tcphdr *, th) { #if IS_BUILTIN(CONFIG_IPV6) if (__cookie_v6_check(iph, th) > 0) return 0; return -EACCES; #else return -EPROTONOSUPPORT; #endif } static const struct bpf_func_proto bpf_tcp_raw_check_syncookie_ipv6_proto = { .func = bpf_tcp_raw_check_syncookie_ipv6, .gpl_only = true, /* __cookie_v6_check is GPL */ .pkt_access = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_FIXED_SIZE_MEM, .arg1_size = sizeof(struct ipv6hdr), .arg2_type = ARG_PTR_TO_FIXED_SIZE_MEM, .arg2_size = sizeof(struct tcphdr), }; #endif /* CONFIG_SYN_COOKIES */ #endif /* CONFIG_INET */ bool bpf_helper_changes_pkt_data(enum bpf_func_id func_id) { switch (func_id) { case BPF_FUNC_clone_redirect: case BPF_FUNC_l3_csum_replace: case BPF_FUNC_l4_csum_replace: case BPF_FUNC_lwt_push_encap: case BPF_FUNC_lwt_seg6_action: case BPF_FUNC_lwt_seg6_adjust_srh: case BPF_FUNC_lwt_seg6_store_bytes: case BPF_FUNC_msg_pop_data: case BPF_FUNC_msg_pull_data: case BPF_FUNC_msg_push_data: case BPF_FUNC_skb_adjust_room: case BPF_FUNC_skb_change_head: case BPF_FUNC_skb_change_proto: case BPF_FUNC_skb_change_tail: case BPF_FUNC_skb_pull_data: case BPF_FUNC_skb_store_bytes: case BPF_FUNC_skb_vlan_pop: case BPF_FUNC_skb_vlan_push: case BPF_FUNC_store_hdr_opt: case BPF_FUNC_xdp_adjust_head: case BPF_FUNC_xdp_adjust_meta: case BPF_FUNC_xdp_adjust_tail: /* tail-called program could call any of the above */ case BPF_FUNC_tail_call: return true; default: return false; } } const struct bpf_func_proto bpf_event_output_data_proto __weak; const struct bpf_func_proto bpf_sk_storage_get_cg_sock_proto __weak; static const struct bpf_func_proto * sock_filter_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { const struct bpf_func_proto *func_proto; func_proto = cgroup_common_func_proto(func_id, prog); if (func_proto) return func_proto; switch (func_id) { case BPF_FUNC_get_socket_cookie: return &bpf_get_socket_cookie_sock_proto; case BPF_FUNC_get_netns_cookie: return &bpf_get_netns_cookie_sock_proto; case BPF_FUNC_perf_event_output: return &bpf_event_output_data_proto; case BPF_FUNC_sk_storage_get: return &bpf_sk_storage_get_cg_sock_proto; case BPF_FUNC_ktime_get_coarse_ns: return &bpf_ktime_get_coarse_ns_proto; case BPF_FUNC_setsockopt: switch (prog->expected_attach_type) { case BPF_CGROUP_INET_SOCK_CREATE: return &bpf_sock_create_setsockopt_proto; default: return NULL; } case BPF_FUNC_getsockopt: switch (prog->expected_attach_type) { case BPF_CGROUP_INET_SOCK_CREATE: return &bpf_sock_create_getsockopt_proto; default: return NULL; } default: return bpf_base_func_proto(func_id, prog); } } static const struct bpf_func_proto * sock_addr_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { const struct bpf_func_proto *func_proto; func_proto = cgroup_common_func_proto(func_id, prog); if (func_proto) return func_proto; switch (func_id) { case BPF_FUNC_bind: switch (prog->expected_attach_type) { case BPF_CGROUP_INET4_CONNECT: case BPF_CGROUP_INET6_CONNECT: return &bpf_bind_proto; default: return NULL; } case BPF_FUNC_get_socket_cookie: return 
&bpf_get_socket_cookie_sock_addr_proto; case BPF_FUNC_get_netns_cookie: return &bpf_get_netns_cookie_sock_addr_proto; case BPF_FUNC_perf_event_output: return &bpf_event_output_data_proto; #ifdef CONFIG_INET case BPF_FUNC_sk_lookup_tcp: return &bpf_sock_addr_sk_lookup_tcp_proto; case BPF_FUNC_sk_lookup_udp: return &bpf_sock_addr_sk_lookup_udp_proto; case BPF_FUNC_sk_release: return &bpf_sk_release_proto; case BPF_FUNC_skc_lookup_tcp: return &bpf_sock_addr_skc_lookup_tcp_proto; #endif /* CONFIG_INET */ case BPF_FUNC_sk_storage_get: return &bpf_sk_storage_get_proto; case BPF_FUNC_sk_storage_delete: return &bpf_sk_storage_delete_proto; case BPF_FUNC_setsockopt: switch (prog->expected_attach_type) { case BPF_CGROUP_INET4_BIND: case BPF_CGROUP_INET6_BIND: case BPF_CGROUP_INET4_CONNECT: case BPF_CGROUP_INET6_CONNECT: case BPF_CGROUP_UNIX_CONNECT: case BPF_CGROUP_UDP4_RECVMSG: case BPF_CGROUP_UDP6_RECVMSG: case BPF_CGROUP_UNIX_RECVMSG: case BPF_CGROUP_UDP4_SENDMSG: case BPF_CGROUP_UDP6_SENDMSG: case BPF_CGROUP_UNIX_SENDMSG: case BPF_CGROUP_INET4_GETPEERNAME: case BPF_CGROUP_INET6_GETPEERNAME: case BPF_CGROUP_UNIX_GETPEERNAME: case BPF_CGROUP_INET4_GETSOCKNAME: case BPF_CGROUP_INET6_GETSOCKNAME: case BPF_CGROUP_UNIX_GETSOCKNAME: return &bpf_sock_addr_setsockopt_proto; default: return NULL; } case BPF_FUNC_getsockopt: switch (prog->expected_attach_type) { case BPF_CGROUP_INET4_BIND: case BPF_CGROUP_INET6_BIND: case BPF_CGROUP_INET4_CONNECT: case BPF_CGROUP_INET6_CONNECT: case BPF_CGROUP_UNIX_CONNECT: case BPF_CGROUP_UDP4_RECVMSG: case BPF_CGROUP_UDP6_RECVMSG: case BPF_CGROUP_UNIX_RECVMSG: case BPF_CGROUP_UDP4_SENDMSG: case BPF_CGROUP_UDP6_SENDMSG: case BPF_CGROUP_UNIX_SENDMSG: case BPF_CGROUP_INET4_GETPEERNAME: case BPF_CGROUP_INET6_GETPEERNAME: case BPF_CGROUP_UNIX_GETPEERNAME: case BPF_CGROUP_INET4_GETSOCKNAME: case BPF_CGROUP_INET6_GETSOCKNAME: case BPF_CGROUP_UNIX_GETSOCKNAME: return &bpf_sock_addr_getsockopt_proto; default: return NULL; } default: return bpf_sk_base_func_proto(func_id, prog); } } static const struct bpf_func_proto * sk_filter_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { case BPF_FUNC_skb_load_bytes: return &bpf_skb_load_bytes_proto; case BPF_FUNC_skb_load_bytes_relative: return &bpf_skb_load_bytes_relative_proto; case BPF_FUNC_get_socket_cookie: return &bpf_get_socket_cookie_proto; case BPF_FUNC_get_netns_cookie: return &bpf_get_netns_cookie_proto; case BPF_FUNC_get_socket_uid: return &bpf_get_socket_uid_proto; case BPF_FUNC_perf_event_output: return &bpf_skb_event_output_proto; default: return bpf_sk_base_func_proto(func_id, prog); } } const struct bpf_func_proto bpf_sk_storage_get_proto __weak; const struct bpf_func_proto bpf_sk_storage_delete_proto __weak; static const struct bpf_func_proto * cg_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { const struct bpf_func_proto *func_proto; func_proto = cgroup_common_func_proto(func_id, prog); if (func_proto) return func_proto; switch (func_id) { case BPF_FUNC_sk_fullsock: return &bpf_sk_fullsock_proto; case BPF_FUNC_sk_storage_get: return &bpf_sk_storage_get_proto; case BPF_FUNC_sk_storage_delete: return &bpf_sk_storage_delete_proto; case BPF_FUNC_perf_event_output: return &bpf_skb_event_output_proto; #ifdef CONFIG_SOCK_CGROUP_DATA case BPF_FUNC_skb_cgroup_id: return &bpf_skb_cgroup_id_proto; case BPF_FUNC_skb_ancestor_cgroup_id: return &bpf_skb_ancestor_cgroup_id_proto; case BPF_FUNC_sk_cgroup_id: return &bpf_sk_cgroup_id_proto; case BPF_FUNC_sk_ancestor_cgroup_id: 
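/* Illustrative sketch (not part of this file): the cgroup-id helpers exposed
 * to cgroup_skb programs just above can be used for per-cgroup filtering;
 * ALLOWED_CGROUP_ID is a placeholder the program would define itself:
 *
 *	__u64 cgid = bpf_skb_cgroup_id(skb);
 *
 *	if (cgid != ALLOWED_CGROUP_ID)
 *		return 0;	// drop
 *	return 1;		// allow
 */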
return &bpf_sk_ancestor_cgroup_id_proto; #endif #ifdef CONFIG_INET case BPF_FUNC_sk_lookup_tcp: return &bpf_sk_lookup_tcp_proto; case BPF_FUNC_sk_lookup_udp: return &bpf_sk_lookup_udp_proto; case BPF_FUNC_sk_release: return &bpf_sk_release_proto; case BPF_FUNC_skc_lookup_tcp: return &bpf_skc_lookup_tcp_proto; case BPF_FUNC_tcp_sock: return &bpf_tcp_sock_proto; case BPF_FUNC_get_listener_sock: return &bpf_get_listener_sock_proto; case BPF_FUNC_skb_ecn_set_ce: return &bpf_skb_ecn_set_ce_proto; #endif default: return sk_filter_func_proto(func_id, prog); } } static const struct bpf_func_proto * tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { case BPF_FUNC_skb_store_bytes: return &bpf_skb_store_bytes_proto; case BPF_FUNC_skb_load_bytes: return &bpf_skb_load_bytes_proto; case BPF_FUNC_skb_load_bytes_relative: return &bpf_skb_load_bytes_relative_proto; case BPF_FUNC_skb_pull_data: return &bpf_skb_pull_data_proto; case BPF_FUNC_csum_diff: return &bpf_csum_diff_proto; case BPF_FUNC_csum_update: return &bpf_csum_update_proto; case BPF_FUNC_csum_level: return &bpf_csum_level_proto; case BPF_FUNC_l3_csum_replace: return &bpf_l3_csum_replace_proto; case BPF_FUNC_l4_csum_replace: return &bpf_l4_csum_replace_proto; case BPF_FUNC_clone_redirect: return &bpf_clone_redirect_proto; case BPF_FUNC_get_cgroup_classid: return &bpf_get_cgroup_classid_proto; case BPF_FUNC_skb_vlan_push: return &bpf_skb_vlan_push_proto; case BPF_FUNC_skb_vlan_pop: return &bpf_skb_vlan_pop_proto; case BPF_FUNC_skb_change_proto: return &bpf_skb_change_proto_proto; case BPF_FUNC_skb_change_type: return &bpf_skb_change_type_proto; case BPF_FUNC_skb_adjust_room: return &bpf_skb_adjust_room_proto; case BPF_FUNC_skb_change_tail: return &bpf_skb_change_tail_proto; case BPF_FUNC_skb_change_head: return &bpf_skb_change_head_proto; case BPF_FUNC_skb_get_tunnel_key: return &bpf_skb_get_tunnel_key_proto; case BPF_FUNC_skb_set_tunnel_key: return bpf_get_skb_set_tunnel_proto(func_id); case BPF_FUNC_skb_get_tunnel_opt: return &bpf_skb_get_tunnel_opt_proto; case BPF_FUNC_skb_set_tunnel_opt: return bpf_get_skb_set_tunnel_proto(func_id); case BPF_FUNC_redirect: return &bpf_redirect_proto; case BPF_FUNC_redirect_neigh: return &bpf_redirect_neigh_proto; case BPF_FUNC_redirect_peer: return &bpf_redirect_peer_proto; case BPF_FUNC_get_route_realm: return &bpf_get_route_realm_proto; case BPF_FUNC_get_hash_recalc: return &bpf_get_hash_recalc_proto; case BPF_FUNC_set_hash_invalid: return &bpf_set_hash_invalid_proto; case BPF_FUNC_set_hash: return &bpf_set_hash_proto; case BPF_FUNC_perf_event_output: return &bpf_skb_event_output_proto; case BPF_FUNC_get_smp_processor_id: return &bpf_get_smp_processor_id_proto; case BPF_FUNC_skb_under_cgroup: return &bpf_skb_under_cgroup_proto; case BPF_FUNC_get_socket_cookie: return &bpf_get_socket_cookie_proto; case BPF_FUNC_get_netns_cookie: return &bpf_get_netns_cookie_proto; case BPF_FUNC_get_socket_uid: return &bpf_get_socket_uid_proto; case BPF_FUNC_fib_lookup: return &bpf_skb_fib_lookup_proto; case BPF_FUNC_check_mtu: return &bpf_skb_check_mtu_proto; case BPF_FUNC_sk_fullsock: return &bpf_sk_fullsock_proto; case BPF_FUNC_sk_storage_get: return &bpf_sk_storage_get_proto; case BPF_FUNC_sk_storage_delete: return &bpf_sk_storage_delete_proto; #ifdef CONFIG_XFRM case BPF_FUNC_skb_get_xfrm_state: return &bpf_skb_get_xfrm_state_proto; #endif #ifdef CONFIG_CGROUP_NET_CLASSID case BPF_FUNC_skb_cgroup_classid: return &bpf_skb_cgroup_classid_proto; #endif #ifdef 
CONFIG_SOCK_CGROUP_DATA case BPF_FUNC_skb_cgroup_id: return &bpf_skb_cgroup_id_proto; case BPF_FUNC_skb_ancestor_cgroup_id: return &bpf_skb_ancestor_cgroup_id_proto; #endif #ifdef CONFIG_INET case BPF_FUNC_sk_lookup_tcp: return &bpf_tc_sk_lookup_tcp_proto; case BPF_FUNC_sk_lookup_udp: return &bpf_tc_sk_lookup_udp_proto; case BPF_FUNC_sk_release: return &bpf_sk_release_proto; case BPF_FUNC_tcp_sock: return &bpf_tcp_sock_proto; case BPF_FUNC_get_listener_sock: return &bpf_get_listener_sock_proto; case BPF_FUNC_skc_lookup_tcp: return &bpf_tc_skc_lookup_tcp_proto; case BPF_FUNC_tcp_check_syncookie: return &bpf_tcp_check_syncookie_proto; case BPF_FUNC_skb_ecn_set_ce: return &bpf_skb_ecn_set_ce_proto; case BPF_FUNC_tcp_gen_syncookie: return &bpf_tcp_gen_syncookie_proto; case BPF_FUNC_sk_assign: return &bpf_sk_assign_proto; case BPF_FUNC_skb_set_tstamp: return &bpf_skb_set_tstamp_proto; #ifdef CONFIG_SYN_COOKIES case BPF_FUNC_tcp_raw_gen_syncookie_ipv4: return &bpf_tcp_raw_gen_syncookie_ipv4_proto; case BPF_FUNC_tcp_raw_gen_syncookie_ipv6: return &bpf_tcp_raw_gen_syncookie_ipv6_proto; case BPF_FUNC_tcp_raw_check_syncookie_ipv4: return &bpf_tcp_raw_check_syncookie_ipv4_proto; case BPF_FUNC_tcp_raw_check_syncookie_ipv6: return &bpf_tcp_raw_check_syncookie_ipv6_proto; #endif #endif default: return bpf_sk_base_func_proto(func_id, prog); } } static const struct bpf_func_proto * xdp_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { case BPF_FUNC_perf_event_output: return &bpf_xdp_event_output_proto; case BPF_FUNC_get_smp_processor_id: return &bpf_get_smp_processor_id_proto; case BPF_FUNC_csum_diff: return &bpf_csum_diff_proto; case BPF_FUNC_xdp_adjust_head: return &bpf_xdp_adjust_head_proto; case BPF_FUNC_xdp_adjust_meta: return &bpf_xdp_adjust_meta_proto; case BPF_FUNC_redirect: return &bpf_xdp_redirect_proto; case BPF_FUNC_redirect_map: return &bpf_xdp_redirect_map_proto; case BPF_FUNC_xdp_adjust_tail: return &bpf_xdp_adjust_tail_proto; case BPF_FUNC_xdp_get_buff_len: return &bpf_xdp_get_buff_len_proto; case BPF_FUNC_xdp_load_bytes: return &bpf_xdp_load_bytes_proto; case BPF_FUNC_xdp_store_bytes: return &bpf_xdp_store_bytes_proto; case BPF_FUNC_fib_lookup: return &bpf_xdp_fib_lookup_proto; case BPF_FUNC_check_mtu: return &bpf_xdp_check_mtu_proto; #ifdef CONFIG_INET case BPF_FUNC_sk_lookup_udp: return &bpf_xdp_sk_lookup_udp_proto; case BPF_FUNC_sk_lookup_tcp: return &bpf_xdp_sk_lookup_tcp_proto; case BPF_FUNC_sk_release: return &bpf_sk_release_proto; case BPF_FUNC_skc_lookup_tcp: return &bpf_xdp_skc_lookup_tcp_proto; case BPF_FUNC_tcp_check_syncookie: return &bpf_tcp_check_syncookie_proto; case BPF_FUNC_tcp_gen_syncookie: return &bpf_tcp_gen_syncookie_proto; #ifdef CONFIG_SYN_COOKIES case BPF_FUNC_tcp_raw_gen_syncookie_ipv4: return &bpf_tcp_raw_gen_syncookie_ipv4_proto; case BPF_FUNC_tcp_raw_gen_syncookie_ipv6: return &bpf_tcp_raw_gen_syncookie_ipv6_proto; case BPF_FUNC_tcp_raw_check_syncookie_ipv4: return &bpf_tcp_raw_check_syncookie_ipv4_proto; case BPF_FUNC_tcp_raw_check_syncookie_ipv6: return &bpf_tcp_raw_check_syncookie_ipv6_proto; #endif #endif default: return bpf_sk_base_func_proto(func_id, prog); } #if IS_MODULE(CONFIG_NF_CONNTRACK) && IS_ENABLED(CONFIG_DEBUG_INFO_BTF_MODULES) /* The nf_conn___init type is used in the NF_CONNTRACK kfuncs. The * kfuncs are defined in two different modules, and we want to be able * to use them interchangeably with the same BTF type ID. 
Because modules * can't de-duplicate BTF IDs between each other, we need the type to be * referenced in the vmlinux BTF or the verifier will get confused about * the different types. So we add this dummy type reference which will * be included in vmlinux BTF, allowing both modules to refer to the * same type ID. */ BTF_TYPE_EMIT(struct nf_conn___init); #endif } const struct bpf_func_proto bpf_sock_map_update_proto __weak; const struct bpf_func_proto bpf_sock_hash_update_proto __weak; static const struct bpf_func_proto * sock_ops_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { const struct bpf_func_proto *func_proto; func_proto = cgroup_common_func_proto(func_id, prog); if (func_proto) return func_proto; switch (func_id) { case BPF_FUNC_setsockopt: return &bpf_sock_ops_setsockopt_proto; case BPF_FUNC_getsockopt: return &bpf_sock_ops_getsockopt_proto; case BPF_FUNC_sock_ops_cb_flags_set: return &bpf_sock_ops_cb_flags_set_proto; case BPF_FUNC_sock_map_update: return &bpf_sock_map_update_proto; case BPF_FUNC_sock_hash_update: return &bpf_sock_hash_update_proto; case BPF_FUNC_get_socket_cookie: return &bpf_get_socket_cookie_sock_ops_proto; case BPF_FUNC_perf_event_output: return &bpf_event_output_data_proto; case BPF_FUNC_sk_storage_get: return &bpf_sk_storage_get_proto; case BPF_FUNC_sk_storage_delete: return &bpf_sk_storage_delete_proto; case BPF_FUNC_get_netns_cookie: return &bpf_get_netns_cookie_sock_ops_proto; #ifdef CONFIG_INET case BPF_FUNC_load_hdr_opt: return &bpf_sock_ops_load_hdr_opt_proto; case BPF_FUNC_store_hdr_opt: return &bpf_sock_ops_store_hdr_opt_proto; case BPF_FUNC_reserve_hdr_opt: return &bpf_sock_ops_reserve_hdr_opt_proto; case BPF_FUNC_tcp_sock: return &bpf_tcp_sock_proto; #endif /* CONFIG_INET */ default: return bpf_sk_base_func_proto(func_id, prog); } } const struct bpf_func_proto bpf_msg_redirect_map_proto __weak; const struct bpf_func_proto bpf_msg_redirect_hash_proto __weak; static const struct bpf_func_proto * sk_msg_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { case BPF_FUNC_msg_redirect_map: return &bpf_msg_redirect_map_proto; case BPF_FUNC_msg_redirect_hash: return &bpf_msg_redirect_hash_proto; case BPF_FUNC_msg_apply_bytes: return &bpf_msg_apply_bytes_proto; case BPF_FUNC_msg_cork_bytes: return &bpf_msg_cork_bytes_proto; case BPF_FUNC_msg_pull_data: return &bpf_msg_pull_data_proto; case BPF_FUNC_msg_push_data: return &bpf_msg_push_data_proto; case BPF_FUNC_msg_pop_data: return &bpf_msg_pop_data_proto; case BPF_FUNC_perf_event_output: return &bpf_event_output_data_proto; case BPF_FUNC_sk_storage_get: return &bpf_sk_storage_get_proto; case BPF_FUNC_sk_storage_delete: return &bpf_sk_storage_delete_proto; case BPF_FUNC_get_netns_cookie: return &bpf_get_netns_cookie_sk_msg_proto; default: return bpf_sk_base_func_proto(func_id, prog); } } const struct bpf_func_proto bpf_sk_redirect_map_proto __weak; const struct bpf_func_proto bpf_sk_redirect_hash_proto __weak; static const struct bpf_func_proto * sk_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { case BPF_FUNC_skb_store_bytes: return &bpf_skb_store_bytes_proto; case BPF_FUNC_skb_load_bytes: return &bpf_skb_load_bytes_proto; case BPF_FUNC_skb_pull_data: return &sk_skb_pull_data_proto; case BPF_FUNC_skb_change_tail: return &sk_skb_change_tail_proto; case BPF_FUNC_skb_change_head: return &sk_skb_change_head_proto; case BPF_FUNC_skb_adjust_room: return &sk_skb_adjust_room_proto; case BPF_FUNC_get_socket_cookie: return 
&bpf_get_socket_cookie_proto; case BPF_FUNC_get_socket_uid: return &bpf_get_socket_uid_proto; case BPF_FUNC_sk_redirect_map: return &bpf_sk_redirect_map_proto; case BPF_FUNC_sk_redirect_hash: return &bpf_sk_redirect_hash_proto; case BPF_FUNC_perf_event_output: return &bpf_skb_event_output_proto; #ifdef CONFIG_INET case BPF_FUNC_sk_lookup_tcp: return &bpf_sk_lookup_tcp_proto; case BPF_FUNC_sk_lookup_udp: return &bpf_sk_lookup_udp_proto; case BPF_FUNC_sk_release: return &bpf_sk_release_proto; case BPF_FUNC_skc_lookup_tcp: return &bpf_skc_lookup_tcp_proto; #endif default: return bpf_sk_base_func_proto(func_id, prog); } } static const struct bpf_func_proto * flow_dissector_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { case BPF_FUNC_skb_load_bytes: return &bpf_flow_dissector_load_bytes_proto; default: return bpf_sk_base_func_proto(func_id, prog); } } static const struct bpf_func_proto * lwt_out_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { case BPF_FUNC_skb_load_bytes: return &bpf_skb_load_bytes_proto; case BPF_FUNC_skb_pull_data: return &bpf_skb_pull_data_proto; case BPF_FUNC_csum_diff: return &bpf_csum_diff_proto; case BPF_FUNC_get_cgroup_classid: return &bpf_get_cgroup_classid_proto; case BPF_FUNC_get_route_realm: return &bpf_get_route_realm_proto; case BPF_FUNC_get_hash_recalc: return &bpf_get_hash_recalc_proto; case BPF_FUNC_perf_event_output: return &bpf_skb_event_output_proto; case BPF_FUNC_get_smp_processor_id: return &bpf_get_smp_processor_id_proto; case BPF_FUNC_skb_under_cgroup: return &bpf_skb_under_cgroup_proto; default: return bpf_sk_base_func_proto(func_id, prog); } } static const struct bpf_func_proto * lwt_in_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { case BPF_FUNC_lwt_push_encap: return &bpf_lwt_in_push_encap_proto; default: return lwt_out_func_proto(func_id, prog); } } static const struct bpf_func_proto * lwt_xmit_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { case BPF_FUNC_skb_get_tunnel_key: return &bpf_skb_get_tunnel_key_proto; case BPF_FUNC_skb_set_tunnel_key: return bpf_get_skb_set_tunnel_proto(func_id); case BPF_FUNC_skb_get_tunnel_opt: return &bpf_skb_get_tunnel_opt_proto; case BPF_FUNC_skb_set_tunnel_opt: return bpf_get_skb_set_tunnel_proto(func_id); case BPF_FUNC_redirect: return &bpf_redirect_proto; case BPF_FUNC_clone_redirect: return &bpf_clone_redirect_proto; case BPF_FUNC_skb_change_tail: return &bpf_skb_change_tail_proto; case BPF_FUNC_skb_change_head: return &bpf_skb_change_head_proto; case BPF_FUNC_skb_store_bytes: return &bpf_skb_store_bytes_proto; case BPF_FUNC_csum_update: return &bpf_csum_update_proto; case BPF_FUNC_csum_level: return &bpf_csum_level_proto; case BPF_FUNC_l3_csum_replace: return &bpf_l3_csum_replace_proto; case BPF_FUNC_l4_csum_replace: return &bpf_l4_csum_replace_proto; case BPF_FUNC_set_hash_invalid: return &bpf_set_hash_invalid_proto; case BPF_FUNC_lwt_push_encap: return &bpf_lwt_xmit_push_encap_proto; default: return lwt_out_func_proto(func_id, prog); } } static const struct bpf_func_proto * lwt_seg6local_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { #if IS_ENABLED(CONFIG_IPV6_SEG6_BPF) case BPF_FUNC_lwt_seg6_store_bytes: return &bpf_lwt_seg6_store_bytes_proto; case BPF_FUNC_lwt_seg6_action: return &bpf_lwt_seg6_action_proto; case BPF_FUNC_lwt_seg6_adjust_srh: return &bpf_lwt_seg6_adjust_srh_proto; #endif default: return 
lwt_out_func_proto(func_id, prog); } } static bool bpf_skb_is_valid_access(int off, int size, enum bpf_access_type type, const struct bpf_prog *prog, struct bpf_insn_access_aux *info) { const int size_default = sizeof(__u32); if (off < 0 || off >= sizeof(struct __sk_buff)) return false; /* The verifier guarantees that size > 0. */ if (off % size != 0) return false; switch (off) { case bpf_ctx_range_till(struct __sk_buff, cb[0], cb[4]): if (off + size > offsetofend(struct __sk_buff, cb[4])) return false; break; case bpf_ctx_range(struct __sk_buff, data): case bpf_ctx_range(struct __sk_buff, data_meta): case bpf_ctx_range(struct __sk_buff, data_end): if (info->is_ldsx || size != size_default) return false; break; case bpf_ctx_range_till(struct __sk_buff, remote_ip6[0], remote_ip6[3]): case bpf_ctx_range_till(struct __sk_buff, local_ip6[0], local_ip6[3]): case bpf_ctx_range_till(struct __sk_buff, remote_ip4, remote_ip4): case bpf_ctx_range_till(struct __sk_buff, local_ip4, local_ip4): if (size != size_default) return false; break; case bpf_ctx_range_ptr(struct __sk_buff, flow_keys): return false; case bpf_ctx_range(struct __sk_buff, hwtstamp): if (type == BPF_WRITE || size != sizeof(__u64)) return false; break; case bpf_ctx_range(struct __sk_buff, tstamp): if (size != sizeof(__u64)) return false; break; case bpf_ctx_range_ptr(struct __sk_buff, sk): if (type == BPF_WRITE || size != sizeof(__u64)) return false; info->reg_type = PTR_TO_SOCK_COMMON_OR_NULL; break; case offsetof(struct __sk_buff, tstamp_type): return false; case offsetofend(struct __sk_buff, tstamp_type) ... offsetof(struct __sk_buff, hwtstamp) - 1: /* Explicitly prohibit access to padding in __sk_buff. */ return false; default: /* Only narrow read access allowed for now. */ if (type == BPF_WRITE) { if (size != size_default) return false; } else { bpf_ctx_record_field_size(info, size_default); if (!bpf_ctx_narrow_access_ok(off, size, size_default)) return false; } } return true; } static bool sk_filter_is_valid_access(int off, int size, enum bpf_access_type type, const struct bpf_prog *prog, struct bpf_insn_access_aux *info) { switch (off) { case bpf_ctx_range(struct __sk_buff, tc_classid): case bpf_ctx_range(struct __sk_buff, data): case bpf_ctx_range(struct __sk_buff, data_meta): case bpf_ctx_range(struct __sk_buff, data_end): case bpf_ctx_range_till(struct __sk_buff, family, local_port): case bpf_ctx_range(struct __sk_buff, tstamp): case bpf_ctx_range(struct __sk_buff, wire_len): case bpf_ctx_range(struct __sk_buff, hwtstamp): return false; } if (type == BPF_WRITE) { switch (off) { case bpf_ctx_range_till(struct __sk_buff, cb[0], cb[4]): break; default: return false; } } return bpf_skb_is_valid_access(off, size, type, prog, info); } static bool cg_skb_is_valid_access(int off, int size, enum bpf_access_type type, const struct bpf_prog *prog, struct bpf_insn_access_aux *info) { switch (off) { case bpf_ctx_range(struct __sk_buff, tc_classid): case bpf_ctx_range(struct __sk_buff, data_meta): case bpf_ctx_range(struct __sk_buff, wire_len): return false; case bpf_ctx_range(struct __sk_buff, data): case bpf_ctx_range(struct __sk_buff, data_end): if (!bpf_token_capable(prog->aux->token, CAP_BPF)) return false; break; } if (type == BPF_WRITE) { switch (off) { case bpf_ctx_range(struct __sk_buff, mark): case bpf_ctx_range(struct __sk_buff, priority): case bpf_ctx_range_till(struct __sk_buff, cb[0], cb[4]): break; case bpf_ctx_range(struct __sk_buff, tstamp): if (!bpf_token_capable(prog->aux->token, CAP_BPF)) return false; break; default: 
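/* Illustrative sketch (not part of this file): the PTR_TO_PACKET /
 * PTR_TO_PACKET_END reg_types set by the *_is_valid_access() callbacks here
 * are what make the usual bounds-check pattern verifiable in programs that
 * read packet data directly (for cgroup_skb this additionally requires
 * CAP_BPF, per the bpf_token_capable() check in cg_skb_is_valid_access()).
 * Assuming an IPv4 header at the start of the linear data:
 *
 *	void *data = (void *)(long)skb->data;
 *	void *data_end = (void *)(long)skb->data_end;
 *	struct iphdr *iph = data;
 *
 *	if ((void *)(iph + 1) > data_end)
 *		return 1;	// too short, let it pass
 *	// iph->saddr etc. may be read safely after the check
 */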
return false; } } switch (off) { case bpf_ctx_range(struct __sk_buff, data): info->reg_type = PTR_TO_PACKET; break; case bpf_ctx_range(struct __sk_buff, data_end): info->reg_type = PTR_TO_PACKET_END; break; } return bpf_skb_is_valid_access(off, size, type, prog, info); } static bool lwt_is_valid_access(int off, int size, enum bpf_access_type type, const struct bpf_prog *prog, struct bpf_insn_access_aux *info) { switch (off) { case bpf_ctx_range(struct __sk_buff, tc_classid): case bpf_ctx_range_till(struct __sk_buff, family, local_port): case bpf_ctx_range(struct __sk_buff, data_meta): case bpf_ctx_range(struct __sk_buff, tstamp): case bpf_ctx_range(struct __sk_buff, wire_len): case bpf_ctx_range(struct __sk_buff, hwtstamp): return false; } if (type == BPF_WRITE) { switch (off) { case bpf_ctx_range(struct __sk_buff, mark): case bpf_ctx_range(struct __sk_buff, priority): case bpf_ctx_range_till(struct __sk_buff, cb[0], cb[4]): break; default: return false; } } switch (off) { case bpf_ctx_range(struct __sk_buff, data): info->reg_type = PTR_TO_PACKET; break; case bpf_ctx_range(struct __sk_buff, data_end): info->reg_type = PTR_TO_PACKET_END; break; } return bpf_skb_is_valid_access(off, size, type, prog, info); } /* Attach type specific accesses */ static bool __sock_filter_check_attach_type(int off, enum bpf_access_type access_type, enum bpf_attach_type attach_type) { switch (off) { case offsetof(struct bpf_sock, bound_dev_if): case offsetof(struct bpf_sock, mark): case offsetof(struct bpf_sock, priority): switch (attach_type) { case BPF_CGROUP_INET_SOCK_CREATE: case BPF_CGROUP_INET_SOCK_RELEASE: goto full_access; default: return false; } case bpf_ctx_range(struct bpf_sock, src_ip4): switch (attach_type) { case BPF_CGROUP_INET4_POST_BIND: goto read_only; default: return false; } case bpf_ctx_range_till(struct bpf_sock, src_ip6[0], src_ip6[3]): switch (attach_type) { case BPF_CGROUP_INET6_POST_BIND: goto read_only; default: return false; } case bpf_ctx_range(struct bpf_sock, src_port): switch (attach_type) { case BPF_CGROUP_INET4_POST_BIND: case BPF_CGROUP_INET6_POST_BIND: goto read_only; default: return false; } } read_only: return access_type == BPF_READ; full_access: return true; } bool bpf_sock_common_is_valid_access(int off, int size, enum bpf_access_type type, struct bpf_insn_access_aux *info) { switch (off) { case bpf_ctx_range_till(struct bpf_sock, type, priority): return false; default: return bpf_sock_is_valid_access(off, size, type, info); } } bool bpf_sock_is_valid_access(int off, int size, enum bpf_access_type type, struct bpf_insn_access_aux *info) { const int size_default = sizeof(__u32); int field_size; if (off < 0 || off >= sizeof(struct bpf_sock)) return false; if (off % size != 0) return false; switch (off) { case offsetof(struct bpf_sock, state): case offsetof(struct bpf_sock, family): case offsetof(struct bpf_sock, type): case offsetof(struct bpf_sock, protocol): case offsetof(struct bpf_sock, src_port): case offsetof(struct bpf_sock, rx_queue_mapping): case bpf_ctx_range(struct bpf_sock, src_ip4): case bpf_ctx_range_till(struct bpf_sock, src_ip6[0], src_ip6[3]): case bpf_ctx_range(struct bpf_sock, dst_ip4): case bpf_ctx_range_till(struct bpf_sock, dst_ip6[0], dst_ip6[3]): bpf_ctx_record_field_size(info, size_default); return bpf_ctx_narrow_access_ok(off, size, size_default); case bpf_ctx_range(struct bpf_sock, dst_port): field_size = size == size_default ? 
size_default : sizeof_field(struct bpf_sock, dst_port); bpf_ctx_record_field_size(info, field_size); return bpf_ctx_narrow_access_ok(off, size, field_size); case offsetofend(struct bpf_sock, dst_port) ... offsetof(struct bpf_sock, dst_ip4) - 1: return false; } return size == size_default; } static bool sock_filter_is_valid_access(int off, int size, enum bpf_access_type type, const struct bpf_prog *prog, struct bpf_insn_access_aux *info) { if (!bpf_sock_is_valid_access(off, size, type, info)) return false; return __sock_filter_check_attach_type(off, type, prog->expected_attach_type); } static int bpf_noop_prologue(struct bpf_insn *insn_buf, bool direct_write, const struct bpf_prog *prog) { /* Neither direct read nor direct write requires any preliminary * action. */ return 0; } static int bpf_unclone_prologue(struct bpf_insn *insn_buf, bool direct_write, const struct bpf_prog *prog, int drop_verdict) { struct bpf_insn *insn = insn_buf; if (!direct_write) return 0; /* if (!skb->cloned) * goto start; * * (Fast-path, otherwise approximation that we might be * a clone, do the rest in helper.) */ *insn++ = BPF_LDX_MEM(BPF_B, BPF_REG_6, BPF_REG_1, CLONED_OFFSET); *insn++ = BPF_ALU32_IMM(BPF_AND, BPF_REG_6, CLONED_MASK); *insn++ = BPF_JMP_IMM(BPF_JEQ, BPF_REG_6, 0, 7); /* ret = bpf_skb_pull_data(skb, 0); */ *insn++ = BPF_MOV64_REG(BPF_REG_6, BPF_REG_1); *insn++ = BPF_ALU64_REG(BPF_XOR, BPF_REG_2, BPF_REG_2); *insn++ = BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_skb_pull_data); /* if (!ret) * goto restore; * return TC_ACT_SHOT; */ *insn++ = BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2); *insn++ = BPF_ALU32_IMM(BPF_MOV, BPF_REG_0, drop_verdict); *insn++ = BPF_EXIT_INSN(); /* restore: */ *insn++ = BPF_MOV64_REG(BPF_REG_1, BPF_REG_6); /* start: */ *insn++ = prog->insnsi[0]; return insn - insn_buf; } static int bpf_gen_ld_abs(const struct bpf_insn *orig, struct bpf_insn *insn_buf) { bool indirect = BPF_MODE(orig->code) == BPF_IND; struct bpf_insn *insn = insn_buf; if (!indirect) { *insn++ = BPF_MOV64_IMM(BPF_REG_2, orig->imm); } else { *insn++ = BPF_MOV64_REG(BPF_REG_2, orig->src_reg); if (orig->imm) *insn++ = BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, orig->imm); } /* We're guaranteed here that CTX is in R6. 
*/ *insn++ = BPF_MOV64_REG(BPF_REG_1, BPF_REG_CTX); switch (BPF_SIZE(orig->code)) { case BPF_B: *insn++ = BPF_EMIT_CALL(bpf_skb_load_helper_8_no_cache); break; case BPF_H: *insn++ = BPF_EMIT_CALL(bpf_skb_load_helper_16_no_cache); break; case BPF_W: *insn++ = BPF_EMIT_CALL(bpf_skb_load_helper_32_no_cache); break; } *insn++ = BPF_JMP_IMM(BPF_JSGE, BPF_REG_0, 0, 2); *insn++ = BPF_ALU32_REG(BPF_XOR, BPF_REG_0, BPF_REG_0); *insn++ = BPF_EXIT_INSN(); return insn - insn_buf; } static int tc_cls_act_prologue(struct bpf_insn *insn_buf, bool direct_write, const struct bpf_prog *prog) { return bpf_unclone_prologue(insn_buf, direct_write, prog, TC_ACT_SHOT); } static bool tc_cls_act_is_valid_access(int off, int size, enum bpf_access_type type, const struct bpf_prog *prog, struct bpf_insn_access_aux *info) { if (type == BPF_WRITE) { switch (off) { case bpf_ctx_range(struct __sk_buff, mark): case bpf_ctx_range(struct __sk_buff, tc_index): case bpf_ctx_range(struct __sk_buff, priority): case bpf_ctx_range(struct __sk_buff, tc_classid): case bpf_ctx_range_till(struct __sk_buff, cb[0], cb[4]): case bpf_ctx_range(struct __sk_buff, tstamp): case bpf_ctx_range(struct __sk_buff, queue_mapping): break; default: return false; } } switch (off) { case bpf_ctx_range(struct __sk_buff, data): info->reg_type = PTR_TO_PACKET; break; case bpf_ctx_range(struct __sk_buff, data_meta): info->reg_type = PTR_TO_PACKET_META; break; case bpf_ctx_range(struct __sk_buff, data_end): info->reg_type = PTR_TO_PACKET_END; break; case bpf_ctx_range_till(struct __sk_buff, family, local_port): return false; case offsetof(struct __sk_buff, tstamp_type): /* The convert_ctx_access() on reading and writing * __sk_buff->tstamp depends on whether the bpf prog * has used __sk_buff->tstamp_type or not. * Thus, we need to set prog->tstamp_type_access * earlier during is_valid_access() here. 
*/ ((struct bpf_prog *)prog)->tstamp_type_access = 1; return size == sizeof(__u8); } return bpf_skb_is_valid_access(off, size, type, prog, info); } DEFINE_MUTEX(nf_conn_btf_access_lock); EXPORT_SYMBOL_GPL(nf_conn_btf_access_lock); int (*nfct_btf_struct_access)(struct bpf_verifier_log *log, const struct bpf_reg_state *reg, int off, int size); EXPORT_SYMBOL_GPL(nfct_btf_struct_access); static int tc_cls_act_btf_struct_access(struct bpf_verifier_log *log, const struct bpf_reg_state *reg, int off, int size) { int ret = -EACCES; mutex_lock(&nf_conn_btf_access_lock); if (nfct_btf_struct_access) ret = nfct_btf_struct_access(log, reg, off, size); mutex_unlock(&nf_conn_btf_access_lock); return ret; } static bool __is_valid_xdp_access(int off, int size) { if (off < 0 || off >= sizeof(struct xdp_md)) return false; if (off % size != 0) return false; if (size != sizeof(__u32)) return false; return true; } static bool xdp_is_valid_access(int off, int size, enum bpf_access_type type, const struct bpf_prog *prog, struct bpf_insn_access_aux *info) { if (prog->expected_attach_type != BPF_XDP_DEVMAP) { switch (off) { case offsetof(struct xdp_md, egress_ifindex): return false; } } if (type == BPF_WRITE) { if (bpf_prog_is_offloaded(prog->aux)) { switch (off) { case offsetof(struct xdp_md, rx_queue_index): return __is_valid_xdp_access(off, size); } } return false; } else { switch (off) { case offsetof(struct xdp_md, data_meta): case offsetof(struct xdp_md, data): case offsetof(struct xdp_md, data_end): if (info->is_ldsx) return false; } } switch (off) { case offsetof(struct xdp_md, data): info->reg_type = PTR_TO_PACKET; break; case offsetof(struct xdp_md, data_meta): info->reg_type = PTR_TO_PACKET_META; break; case offsetof(struct xdp_md, data_end): info->reg_type = PTR_TO_PACKET_END; break; } return __is_valid_xdp_access(off, size); } void bpf_warn_invalid_xdp_action(const struct net_device *dev, const struct bpf_prog *prog, u32 act) { const u32 act_max = XDP_REDIRECT; pr_warn_once("%s XDP return value %u on prog %s (id %d) dev %s, expect packet loss!\n", act > act_max ? "Illegal" : "Driver unsupported", act, prog->aux->name, prog->aux->id, dev ? dev->name : "N/A"); } EXPORT_SYMBOL_GPL(bpf_warn_invalid_xdp_action); static int xdp_btf_struct_access(struct bpf_verifier_log *log, const struct bpf_reg_state *reg, int off, int size) { int ret = -EACCES; mutex_lock(&nf_conn_btf_access_lock); if (nfct_btf_struct_access) ret = nfct_btf_struct_access(log, reg, off, size); mutex_unlock(&nf_conn_btf_access_lock); return ret; } static bool sock_addr_is_valid_access(int off, int size, enum bpf_access_type type, const struct bpf_prog *prog, struct bpf_insn_access_aux *info) { const int size_default = sizeof(__u32); if (off < 0 || off >= sizeof(struct bpf_sock_addr)) return false; if (off % size != 0) return false; /* Disallow access to fields not belonging to the attach type's address * family. 
*/ switch (off) { case bpf_ctx_range(struct bpf_sock_addr, user_ip4): switch (prog->expected_attach_type) { case BPF_CGROUP_INET4_BIND: case BPF_CGROUP_INET4_CONNECT: case BPF_CGROUP_INET4_GETPEERNAME: case BPF_CGROUP_INET4_GETSOCKNAME: case BPF_CGROUP_UDP4_SENDMSG: case BPF_CGROUP_UDP4_RECVMSG: break; default: return false; } break; case bpf_ctx_range_till(struct bpf_sock_addr, user_ip6[0], user_ip6[3]): switch (prog->expected_attach_type) { case BPF_CGROUP_INET6_BIND: case BPF_CGROUP_INET6_CONNECT: case BPF_CGROUP_INET6_GETPEERNAME: case BPF_CGROUP_INET6_GETSOCKNAME: case BPF_CGROUP_UDP6_SENDMSG: case BPF_CGROUP_UDP6_RECVMSG: break; default: return false; } break; case bpf_ctx_range(struct bpf_sock_addr, msg_src_ip4): switch (prog->expected_attach_type) { case BPF_CGROUP_UDP4_SENDMSG: break; default: return false; } break; case bpf_ctx_range_till(struct bpf_sock_addr, msg_src_ip6[0], msg_src_ip6[3]): switch (prog->expected_attach_type) { case BPF_CGROUP_UDP6_SENDMSG: break; default: return false; } break; } switch (off) { case bpf_ctx_range(struct bpf_sock_addr, user_ip4): case bpf_ctx_range_till(struct bpf_sock_addr, user_ip6[0], user_ip6[3]): case bpf_ctx_range(struct bpf_sock_addr, msg_src_ip4): case bpf_ctx_range_till(struct bpf_sock_addr, msg_src_ip6[0], msg_src_ip6[3]): case bpf_ctx_range(struct bpf_sock_addr, user_port): if (type == BPF_READ) { bpf_ctx_record_field_size(info, size_default); if (bpf_ctx_wide_access_ok(off, size, struct bpf_sock_addr, user_ip6)) return true; if (bpf_ctx_wide_access_ok(off, size, struct bpf_sock_addr, msg_src_ip6)) return true; if (!bpf_ctx_narrow_access_ok(off, size, size_default)) return false; } else { if (bpf_ctx_wide_access_ok(off, size, struct bpf_sock_addr, user_ip6)) return true; if (bpf_ctx_wide_access_ok(off, size, struct bpf_sock_addr, msg_src_ip6)) return true; if (size != size_default) return false; } break; case bpf_ctx_range_ptr(struct bpf_sock_addr, sk): if (type != BPF_READ) return false; if (size != sizeof(__u64)) return false; info->reg_type = PTR_TO_SOCKET; break; case bpf_ctx_range(struct bpf_sock_addr, user_family): case bpf_ctx_range(struct bpf_sock_addr, family): case bpf_ctx_range(struct bpf_sock_addr, type): case bpf_ctx_range(struct bpf_sock_addr, protocol): if (type != BPF_READ) return false; if (size != size_default) return false; break; default: return false; } return true; } static bool sock_ops_is_valid_access(int off, int size, enum bpf_access_type type, const struct bpf_prog *prog, struct bpf_insn_access_aux *info) { const int size_default = sizeof(__u32); if (off < 0 || off >= sizeof(struct bpf_sock_ops)) return false; /* The verifier guarantees that size > 0. 
*/ if (off % size != 0) return false; if (type == BPF_WRITE) { switch (off) { case offsetof(struct bpf_sock_ops, reply): case offsetof(struct bpf_sock_ops, sk_txhash): if (size != size_default) return false; break; default: return false; } } else { switch (off) { case bpf_ctx_range_till(struct bpf_sock_ops, bytes_received, bytes_acked): if (size != sizeof(__u64)) return false; break; case bpf_ctx_range_ptr(struct bpf_sock_ops, sk): if (size != sizeof(__u64)) return false; info->reg_type = PTR_TO_SOCKET_OR_NULL; break; case bpf_ctx_range_ptr(struct bpf_sock_ops, skb_data): if (size != sizeof(__u64)) return false; info->reg_type = PTR_TO_PACKET; break; case bpf_ctx_range_ptr(struct bpf_sock_ops, skb_data_end): if (size != sizeof(__u64)) return false; info->reg_type = PTR_TO_PACKET_END; break; case offsetof(struct bpf_sock_ops, skb_tcp_flags): bpf_ctx_record_field_size(info, size_default); return bpf_ctx_narrow_access_ok(off, size, size_default); case bpf_ctx_range(struct bpf_sock_ops, skb_hwtstamp): if (size != sizeof(__u64)) return false; break; default: if (size != size_default) return false; break; } } return true; } static int sk_skb_prologue(struct bpf_insn *insn_buf, bool direct_write, const struct bpf_prog *prog) { return bpf_unclone_prologue(insn_buf, direct_write, prog, SK_DROP); } static bool sk_skb_is_valid_access(int off, int size, enum bpf_access_type type, const struct bpf_prog *prog, struct bpf_insn_access_aux *info) { switch (off) { case bpf_ctx_range(struct __sk_buff, tc_classid): case bpf_ctx_range(struct __sk_buff, data_meta): case bpf_ctx_range(struct __sk_buff, tstamp): case bpf_ctx_range(struct __sk_buff, wire_len): case bpf_ctx_range(struct __sk_buff, hwtstamp): return false; } if (type == BPF_WRITE) { switch (off) { case bpf_ctx_range(struct __sk_buff, tc_index): case bpf_ctx_range(struct __sk_buff, priority): break; default: return false; } } switch (off) { case bpf_ctx_range(struct __sk_buff, mark): return false; case bpf_ctx_range(struct __sk_buff, data): info->reg_type = PTR_TO_PACKET; break; case bpf_ctx_range(struct __sk_buff, data_end): info->reg_type = PTR_TO_PACKET_END; break; } return bpf_skb_is_valid_access(off, size, type, prog, info); } static bool sk_msg_is_valid_access(int off, int size, enum bpf_access_type type, const struct bpf_prog *prog, struct bpf_insn_access_aux *info) { if (type == BPF_WRITE) return false; if (off % size != 0) return false; switch (off) { case bpf_ctx_range_ptr(struct sk_msg_md, data): info->reg_type = PTR_TO_PACKET; if (size != sizeof(__u64)) return false; break; case bpf_ctx_range_ptr(struct sk_msg_md, data_end): info->reg_type = PTR_TO_PACKET_END; if (size != sizeof(__u64)) return false; break; case bpf_ctx_range_ptr(struct sk_msg_md, sk): if (size != sizeof(__u64)) return false; info->reg_type = PTR_TO_SOCKET; break; case bpf_ctx_range(struct sk_msg_md, family): case bpf_ctx_range(struct sk_msg_md, remote_ip4): case bpf_ctx_range(struct sk_msg_md, local_ip4): case bpf_ctx_range_till(struct sk_msg_md, remote_ip6[0], remote_ip6[3]): case bpf_ctx_range_till(struct sk_msg_md, local_ip6[0], local_ip6[3]): case bpf_ctx_range(struct sk_msg_md, remote_port): case bpf_ctx_range(struct sk_msg_md, local_port): case bpf_ctx_range(struct sk_msg_md, size): if (size != sizeof(__u32)) return false; break; default: return false; } return true; } static bool flow_dissector_is_valid_access(int off, int size, enum bpf_access_type type, const struct bpf_prog *prog, struct bpf_insn_access_aux *info) { const int size_default = sizeof(__u32); if 
(off < 0 || off >= sizeof(struct __sk_buff)) return false; if (off % size != 0) return false; if (type == BPF_WRITE) return false; switch (off) { case bpf_ctx_range(struct __sk_buff, data): if (info->is_ldsx || size != size_default) return false; info->reg_type = PTR_TO_PACKET; return true; case bpf_ctx_range(struct __sk_buff, data_end): if (info->is_ldsx || size != size_default) return false; info->reg_type = PTR_TO_PACKET_END; return true; case bpf_ctx_range_ptr(struct __sk_buff, flow_keys): if (size != sizeof(__u64)) return false; info->reg_type = PTR_TO_FLOW_KEYS; return true; default: return false; } } static u32 flow_dissector_convert_ctx_access(enum bpf_access_type type, const struct bpf_insn *si, struct bpf_insn *insn_buf, struct bpf_prog *prog, u32 *target_size) { struct bpf_insn *insn = insn_buf; switch (si->off) { case offsetof(struct __sk_buff, data): *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_flow_dissector, data), si->dst_reg, si->src_reg, offsetof(struct bpf_flow_dissector, data)); break; case offsetof(struct __sk_buff, data_end): *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_flow_dissector, data_end), si->dst_reg, si->src_reg, offsetof(struct bpf_flow_dissector, data_end)); break; case offsetof(struct __sk_buff, flow_keys): *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_flow_dissector, flow_keys), si->dst_reg, si->src_reg, offsetof(struct bpf_flow_dissector, flow_keys)); break; } return insn - insn_buf; } static struct bpf_insn *bpf_convert_tstamp_type_read(const struct bpf_insn *si, struct bpf_insn *insn) { __u8 value_reg = si->dst_reg; __u8 skb_reg = si->src_reg; BUILD_BUG_ON(__SKB_CLOCK_MAX != (int)BPF_SKB_CLOCK_TAI); BUILD_BUG_ON(SKB_CLOCK_REALTIME != (int)BPF_SKB_CLOCK_REALTIME); BUILD_BUG_ON(SKB_CLOCK_MONOTONIC != (int)BPF_SKB_CLOCK_MONOTONIC); BUILD_BUG_ON(SKB_CLOCK_TAI != (int)BPF_SKB_CLOCK_TAI); *insn++ = BPF_LDX_MEM(BPF_B, value_reg, skb_reg, SKB_BF_MONO_TC_OFFSET); *insn++ = BPF_ALU32_IMM(BPF_AND, value_reg, SKB_TSTAMP_TYPE_MASK); #ifdef __BIG_ENDIAN_BITFIELD *insn++ = BPF_ALU32_IMM(BPF_RSH, value_reg, SKB_TSTAMP_TYPE_RSHIFT); #else BUILD_BUG_ON(!(SKB_TSTAMP_TYPE_MASK & 0x1)); #endif return insn; } static struct bpf_insn *bpf_convert_shinfo_access(__u8 dst_reg, __u8 skb_reg, struct bpf_insn *insn) { /* si->dst_reg = skb_shinfo(SKB); */ #ifdef NET_SKBUFF_DATA_USES_OFFSET *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, end), BPF_REG_AX, skb_reg, offsetof(struct sk_buff, end)); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, head), dst_reg, skb_reg, offsetof(struct sk_buff, head)); *insn++ = BPF_ALU64_REG(BPF_ADD, dst_reg, BPF_REG_AX); #else *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, end), dst_reg, skb_reg, offsetof(struct sk_buff, end)); #endif return insn; } static struct bpf_insn *bpf_convert_tstamp_read(const struct bpf_prog *prog, const struct bpf_insn *si, struct bpf_insn *insn) { __u8 value_reg = si->dst_reg; __u8 skb_reg = si->src_reg; #ifdef CONFIG_NET_XGRESS /* If the tstamp_type is read, * the bpf prog is aware the tstamp could have delivery time. * Thus, read skb->tstamp as is if tstamp_type_access is true. 
*/ if (!prog->tstamp_type_access) { /* AX is needed because src_reg and dst_reg could be the same */ __u8 tmp_reg = BPF_REG_AX; *insn++ = BPF_LDX_MEM(BPF_B, tmp_reg, skb_reg, SKB_BF_MONO_TC_OFFSET); /* check if ingress mask bits is set */ *insn++ = BPF_JMP32_IMM(BPF_JSET, tmp_reg, TC_AT_INGRESS_MASK, 1); *insn++ = BPF_JMP_A(4); *insn++ = BPF_JMP32_IMM(BPF_JSET, tmp_reg, SKB_TSTAMP_TYPE_MASK, 1); *insn++ = BPF_JMP_A(2); /* skb->tc_at_ingress && skb->tstamp_type, * read 0 as the (rcv) timestamp. */ *insn++ = BPF_MOV64_IMM(value_reg, 0); *insn++ = BPF_JMP_A(1); } #endif *insn++ = BPF_LDX_MEM(BPF_DW, value_reg, skb_reg, offsetof(struct sk_buff, tstamp)); return insn; } static struct bpf_insn *bpf_convert_tstamp_write(const struct bpf_prog *prog, const struct bpf_insn *si, struct bpf_insn *insn) { __u8 value_reg = si->src_reg; __u8 skb_reg = si->dst_reg; #ifdef CONFIG_NET_XGRESS /* If the tstamp_type is read, * the bpf prog is aware the tstamp could have delivery time. * Thus, write skb->tstamp as is if tstamp_type_access is true. * Otherwise, writing at ingress will have to clear the * skb->tstamp_type bit also. */ if (!prog->tstamp_type_access) { __u8 tmp_reg = BPF_REG_AX; *insn++ = BPF_LDX_MEM(BPF_B, tmp_reg, skb_reg, SKB_BF_MONO_TC_OFFSET); /* Writing __sk_buff->tstamp as ingress, goto <clear> */ *insn++ = BPF_JMP32_IMM(BPF_JSET, tmp_reg, TC_AT_INGRESS_MASK, 1); /* goto <store> */ *insn++ = BPF_JMP_A(2); /* <clear>: skb->tstamp_type */ *insn++ = BPF_ALU32_IMM(BPF_AND, tmp_reg, ~SKB_TSTAMP_TYPE_MASK); *insn++ = BPF_STX_MEM(BPF_B, skb_reg, tmp_reg, SKB_BF_MONO_TC_OFFSET); } #endif /* <store>: skb->tstamp = tstamp */ *insn++ = BPF_RAW_INSN(BPF_CLASS(si->code) | BPF_DW | BPF_MEM, skb_reg, value_reg, offsetof(struct sk_buff, tstamp), si->imm); return insn; } #define BPF_EMIT_STORE(size, si, off) \ BPF_RAW_INSN(BPF_CLASS((si)->code) | (size) | BPF_MEM, \ (si)->dst_reg, (si)->src_reg, (off), (si)->imm) static u32 bpf_convert_ctx_access(enum bpf_access_type type, const struct bpf_insn *si, struct bpf_insn *insn_buf, struct bpf_prog *prog, u32 *target_size) { struct bpf_insn *insn = insn_buf; int off; switch (si->off) { case offsetof(struct __sk_buff, len): *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, bpf_target_off(struct sk_buff, len, 4, target_size)); break; case offsetof(struct __sk_buff, protocol): *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg, bpf_target_off(struct sk_buff, protocol, 2, target_size)); break; case offsetof(struct __sk_buff, vlan_proto): *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg, bpf_target_off(struct sk_buff, vlan_proto, 2, target_size)); break; case offsetof(struct __sk_buff, priority): if (type == BPF_WRITE) *insn++ = BPF_EMIT_STORE(BPF_W, si, bpf_target_off(struct sk_buff, priority, 4, target_size)); else *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, bpf_target_off(struct sk_buff, priority, 4, target_size)); break; case offsetof(struct __sk_buff, ingress_ifindex): *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, bpf_target_off(struct sk_buff, skb_iif, 4, target_size)); break; case offsetof(struct __sk_buff, ifindex): *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, dev), si->dst_reg, si->src_reg, offsetof(struct sk_buff, dev)); *insn++ = BPF_JMP_IMM(BPF_JEQ, si->dst_reg, 0, 1); *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, bpf_target_off(struct net_device, ifindex, 4, target_size)); break; case offsetof(struct __sk_buff, hash): *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, bpf_target_off(struct sk_buff, 
hash, 4, target_size)); break; case offsetof(struct __sk_buff, mark): if (type == BPF_WRITE) *insn++ = BPF_EMIT_STORE(BPF_W, si, bpf_target_off(struct sk_buff, mark, 4, target_size)); else *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, bpf_target_off(struct sk_buff, mark, 4, target_size)); break; case offsetof(struct __sk_buff, pkt_type): *target_size = 1; *insn++ = BPF_LDX_MEM(BPF_B, si->dst_reg, si->src_reg, PKT_TYPE_OFFSET); *insn++ = BPF_ALU32_IMM(BPF_AND, si->dst_reg, PKT_TYPE_MAX); #ifdef __BIG_ENDIAN_BITFIELD *insn++ = BPF_ALU32_IMM(BPF_RSH, si->dst_reg, 5); #endif break; case offsetof(struct __sk_buff, queue_mapping): if (type == BPF_WRITE) { u32 offset = bpf_target_off(struct sk_buff, queue_mapping, 2, target_size); if (BPF_CLASS(si->code) == BPF_ST && si->imm >= NO_QUEUE_MAPPING) { *insn++ = BPF_JMP_A(0); /* noop */ break; } if (BPF_CLASS(si->code) == BPF_STX) *insn++ = BPF_JMP_IMM(BPF_JGE, si->src_reg, NO_QUEUE_MAPPING, 1); *insn++ = BPF_EMIT_STORE(BPF_H, si, offset); } else { *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg, bpf_target_off(struct sk_buff, queue_mapping, 2, target_size)); } break; case offsetof(struct __sk_buff, vlan_present): *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, bpf_target_off(struct sk_buff, vlan_all, 4, target_size)); *insn++ = BPF_JMP_IMM(BPF_JEQ, si->dst_reg, 0, 1); *insn++ = BPF_ALU32_IMM(BPF_MOV, si->dst_reg, 1); break; case offsetof(struct __sk_buff, vlan_tci): *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg, bpf_target_off(struct sk_buff, vlan_tci, 2, target_size)); break; case offsetof(struct __sk_buff, cb[0]) ... offsetofend(struct __sk_buff, cb[4]) - 1: BUILD_BUG_ON(sizeof_field(struct qdisc_skb_cb, data) < 20); BUILD_BUG_ON((offsetof(struct sk_buff, cb) + offsetof(struct qdisc_skb_cb, data)) % sizeof(__u64)); prog->cb_access = 1; off = si->off; off -= offsetof(struct __sk_buff, cb[0]); off += offsetof(struct sk_buff, cb); off += offsetof(struct qdisc_skb_cb, data); if (type == BPF_WRITE) *insn++ = BPF_EMIT_STORE(BPF_SIZE(si->code), si, off); else *insn++ = BPF_LDX_MEM(BPF_SIZE(si->code), si->dst_reg, si->src_reg, off); break; case offsetof(struct __sk_buff, tc_classid): BUILD_BUG_ON(sizeof_field(struct qdisc_skb_cb, tc_classid) != 2); off = si->off; off -= offsetof(struct __sk_buff, tc_classid); off += offsetof(struct sk_buff, cb); off += offsetof(struct qdisc_skb_cb, tc_classid); *target_size = 2; if (type == BPF_WRITE) *insn++ = BPF_EMIT_STORE(BPF_H, si, off); else *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg, off); break; case offsetof(struct __sk_buff, data): *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, data), si->dst_reg, si->src_reg, offsetof(struct sk_buff, data)); break; case offsetof(struct __sk_buff, data_meta): off = si->off; off -= offsetof(struct __sk_buff, data_meta); off += offsetof(struct sk_buff, cb); off += offsetof(struct bpf_skb_data_end, data_meta); *insn++ = BPF_LDX_MEM(BPF_SIZEOF(void *), si->dst_reg, si->src_reg, off); break; case offsetof(struct __sk_buff, data_end): off = si->off; off -= offsetof(struct __sk_buff, data_end); off += offsetof(struct sk_buff, cb); off += offsetof(struct bpf_skb_data_end, data_end); *insn++ = BPF_LDX_MEM(BPF_SIZEOF(void *), si->dst_reg, si->src_reg, off); break; case offsetof(struct __sk_buff, tc_index): #ifdef CONFIG_NET_SCHED if (type == BPF_WRITE) *insn++ = BPF_EMIT_STORE(BPF_H, si, bpf_target_off(struct sk_buff, tc_index, 2, target_size)); else *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg, bpf_target_off(struct sk_buff, 
tc_index, 2, target_size)); #else *target_size = 2; if (type == BPF_WRITE) *insn++ = BPF_MOV64_REG(si->dst_reg, si->dst_reg); else *insn++ = BPF_MOV64_IMM(si->dst_reg, 0); #endif break; case offsetof(struct __sk_buff, napi_id): #if defined(CONFIG_NET_RX_BUSY_POLL) *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, bpf_target_off(struct sk_buff, napi_id, 4, target_size)); *insn++ = BPF_JMP_IMM(BPF_JGE, si->dst_reg, MIN_NAPI_ID, 1); *insn++ = BPF_MOV64_IMM(si->dst_reg, 0); #else *target_size = 4; *insn++ = BPF_MOV64_IMM(si->dst_reg, 0); #endif break; case offsetof(struct __sk_buff, family): BUILD_BUG_ON(sizeof_field(struct sock_common, skc_family) != 2); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, sk), si->dst_reg, si->src_reg, offsetof(struct sk_buff, sk)); *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg, bpf_target_off(struct sock_common, skc_family, 2, target_size)); break; case offsetof(struct __sk_buff, remote_ip4): BUILD_BUG_ON(sizeof_field(struct sock_common, skc_daddr) != 4); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, sk), si->dst_reg, si->src_reg, offsetof(struct sk_buff, sk)); *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, bpf_target_off(struct sock_common, skc_daddr, 4, target_size)); break; case offsetof(struct __sk_buff, local_ip4): BUILD_BUG_ON(sizeof_field(struct sock_common, skc_rcv_saddr) != 4); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, sk), si->dst_reg, si->src_reg, offsetof(struct sk_buff, sk)); *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, bpf_target_off(struct sock_common, skc_rcv_saddr, 4, target_size)); break; case offsetof(struct __sk_buff, remote_ip6[0]) ... offsetof(struct __sk_buff, remote_ip6[3]): #if IS_ENABLED(CONFIG_IPV6) BUILD_BUG_ON(sizeof_field(struct sock_common, skc_v6_daddr.s6_addr32[0]) != 4); off = si->off; off -= offsetof(struct __sk_buff, remote_ip6[0]); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, sk), si->dst_reg, si->src_reg, offsetof(struct sk_buff, sk)); *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, offsetof(struct sock_common, skc_v6_daddr.s6_addr32[0]) + off); #else *insn++ = BPF_MOV32_IMM(si->dst_reg, 0); #endif break; case offsetof(struct __sk_buff, local_ip6[0]) ... 
offsetof(struct __sk_buff, local_ip6[3]): #if IS_ENABLED(CONFIG_IPV6) BUILD_BUG_ON(sizeof_field(struct sock_common, skc_v6_rcv_saddr.s6_addr32[0]) != 4); off = si->off; off -= offsetof(struct __sk_buff, local_ip6[0]); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, sk), si->dst_reg, si->src_reg, offsetof(struct sk_buff, sk)); *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, offsetof(struct sock_common, skc_v6_rcv_saddr.s6_addr32[0]) + off); #else *insn++ = BPF_MOV32_IMM(si->dst_reg, 0); #endif break; case offsetof(struct __sk_buff, remote_port): BUILD_BUG_ON(sizeof_field(struct sock_common, skc_dport) != 2); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, sk), si->dst_reg, si->src_reg, offsetof(struct sk_buff, sk)); *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg, bpf_target_off(struct sock_common, skc_dport, 2, target_size)); #ifndef __BIG_ENDIAN_BITFIELD *insn++ = BPF_ALU32_IMM(BPF_LSH, si->dst_reg, 16); #endif break; case offsetof(struct __sk_buff, local_port): BUILD_BUG_ON(sizeof_field(struct sock_common, skc_num) != 2); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, sk), si->dst_reg, si->src_reg, offsetof(struct sk_buff, sk)); *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg, bpf_target_off(struct sock_common, skc_num, 2, target_size)); break; case offsetof(struct __sk_buff, tstamp): BUILD_BUG_ON(sizeof_field(struct sk_buff, tstamp) != 8); if (type == BPF_WRITE) insn = bpf_convert_tstamp_write(prog, si, insn); else insn = bpf_convert_tstamp_read(prog, si, insn); break; case offsetof(struct __sk_buff, tstamp_type): insn = bpf_convert_tstamp_type_read(si, insn); break; case offsetof(struct __sk_buff, gso_segs): insn = bpf_convert_shinfo_access(si->dst_reg, si->src_reg, insn); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct skb_shared_info, gso_segs), si->dst_reg, si->dst_reg, bpf_target_off(struct skb_shared_info, gso_segs, 2, target_size)); break; case offsetof(struct __sk_buff, gso_size): insn = bpf_convert_shinfo_access(si->dst_reg, si->src_reg, insn); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct skb_shared_info, gso_size), si->dst_reg, si->dst_reg, bpf_target_off(struct skb_shared_info, gso_size, 2, target_size)); break; case offsetof(struct __sk_buff, wire_len): BUILD_BUG_ON(sizeof_field(struct qdisc_skb_cb, pkt_len) != 4); off = si->off; off -= offsetof(struct __sk_buff, wire_len); off += offsetof(struct sk_buff, cb); off += offsetof(struct qdisc_skb_cb, pkt_len); *target_size = 4; *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, off); break; case offsetof(struct __sk_buff, sk): *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, sk), si->dst_reg, si->src_reg, offsetof(struct sk_buff, sk)); break; case offsetof(struct __sk_buff, hwtstamp): BUILD_BUG_ON(sizeof_field(struct skb_shared_hwtstamps, hwtstamp) != 8); BUILD_BUG_ON(offsetof(struct skb_shared_hwtstamps, hwtstamp) != 0); insn = bpf_convert_shinfo_access(si->dst_reg, si->src_reg, insn); *insn++ = BPF_LDX_MEM(BPF_DW, si->dst_reg, si->dst_reg, bpf_target_off(struct skb_shared_info, hwtstamps, 8, target_size)); break; } return insn - insn_buf; } u32 bpf_sock_convert_ctx_access(enum bpf_access_type type, const struct bpf_insn *si, struct bpf_insn *insn_buf, struct bpf_prog *prog, u32 *target_size) { struct bpf_insn *insn = insn_buf; int off; switch (si->off) { case offsetof(struct bpf_sock, bound_dev_if): BUILD_BUG_ON(sizeof_field(struct sock, sk_bound_dev_if) != 4); if (type == BPF_WRITE) *insn++ = BPF_EMIT_STORE(BPF_W, si, offsetof(struct sock, sk_bound_dev_if)); else *insn++ = 
BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, offsetof(struct sock, sk_bound_dev_if)); break; case offsetof(struct bpf_sock, mark): BUILD_BUG_ON(sizeof_field(struct sock, sk_mark) != 4); if (type == BPF_WRITE) *insn++ = BPF_EMIT_STORE(BPF_W, si, offsetof(struct sock, sk_mark)); else *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, offsetof(struct sock, sk_mark)); break; case offsetof(struct bpf_sock, priority): BUILD_BUG_ON(sizeof_field(struct sock, sk_priority) != 4); if (type == BPF_WRITE) *insn++ = BPF_EMIT_STORE(BPF_W, si, offsetof(struct sock, sk_priority)); else *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, offsetof(struct sock, sk_priority)); break; case offsetof(struct bpf_sock, family): *insn++ = BPF_LDX_MEM( BPF_FIELD_SIZEOF(struct sock_common, skc_family), si->dst_reg, si->src_reg, bpf_target_off(struct sock_common, skc_family, sizeof_field(struct sock_common, skc_family), target_size)); break; case offsetof(struct bpf_sock, type): *insn++ = BPF_LDX_MEM( BPF_FIELD_SIZEOF(struct sock, sk_type), si->dst_reg, si->src_reg, bpf_target_off(struct sock, sk_type, sizeof_field(struct sock, sk_type), target_size)); break; case offsetof(struct bpf_sock, protocol): *insn++ = BPF_LDX_MEM( BPF_FIELD_SIZEOF(struct sock, sk_protocol), si->dst_reg, si->src_reg, bpf_target_off(struct sock, sk_protocol, sizeof_field(struct sock, sk_protocol), target_size)); break; case offsetof(struct bpf_sock, src_ip4): *insn++ = BPF_LDX_MEM( BPF_SIZE(si->code), si->dst_reg, si->src_reg, bpf_target_off(struct sock_common, skc_rcv_saddr, sizeof_field(struct sock_common, skc_rcv_saddr), target_size)); break; case offsetof(struct bpf_sock, dst_ip4): *insn++ = BPF_LDX_MEM( BPF_SIZE(si->code), si->dst_reg, si->src_reg, bpf_target_off(struct sock_common, skc_daddr, sizeof_field(struct sock_common, skc_daddr), target_size)); break; case bpf_ctx_range_till(struct bpf_sock, src_ip6[0], src_ip6[3]): #if IS_ENABLED(CONFIG_IPV6) off = si->off; off -= offsetof(struct bpf_sock, src_ip6[0]); *insn++ = BPF_LDX_MEM( BPF_SIZE(si->code), si->dst_reg, si->src_reg, bpf_target_off( struct sock_common, skc_v6_rcv_saddr.s6_addr32[0], sizeof_field(struct sock_common, skc_v6_rcv_saddr.s6_addr32[0]), target_size) + off); #else (void)off; *insn++ = BPF_MOV32_IMM(si->dst_reg, 0); #endif break; case bpf_ctx_range_till(struct bpf_sock, dst_ip6[0], dst_ip6[3]): #if IS_ENABLED(CONFIG_IPV6) off = si->off; off -= offsetof(struct bpf_sock, dst_ip6[0]); *insn++ = BPF_LDX_MEM( BPF_SIZE(si->code), si->dst_reg, si->src_reg, bpf_target_off(struct sock_common, skc_v6_daddr.s6_addr32[0], sizeof_field(struct sock_common, skc_v6_daddr.s6_addr32[0]), target_size) + off); #else *insn++ = BPF_MOV32_IMM(si->dst_reg, 0); *target_size = 4; #endif break; case offsetof(struct bpf_sock, src_port): *insn++ = BPF_LDX_MEM( BPF_FIELD_SIZEOF(struct sock_common, skc_num), si->dst_reg, si->src_reg, bpf_target_off(struct sock_common, skc_num, sizeof_field(struct sock_common, skc_num), target_size)); break; case offsetof(struct bpf_sock, dst_port): *insn++ = BPF_LDX_MEM( BPF_FIELD_SIZEOF(struct sock_common, skc_dport), si->dst_reg, si->src_reg, bpf_target_off(struct sock_common, skc_dport, sizeof_field(struct sock_common, skc_dport), target_size)); break; case offsetof(struct bpf_sock, state): *insn++ = BPF_LDX_MEM( BPF_FIELD_SIZEOF(struct sock_common, skc_state), si->dst_reg, si->src_reg, bpf_target_off(struct sock_common, skc_state, sizeof_field(struct sock_common, skc_state), target_size)); break; case offsetof(struct bpf_sock, rx_queue_mapping): #ifdef 
CONFIG_SOCK_RX_QUEUE_MAPPING *insn++ = BPF_LDX_MEM( BPF_FIELD_SIZEOF(struct sock, sk_rx_queue_mapping), si->dst_reg, si->src_reg, bpf_target_off(struct sock, sk_rx_queue_mapping, sizeof_field(struct sock, sk_rx_queue_mapping), target_size)); *insn++ = BPF_JMP_IMM(BPF_JNE, si->dst_reg, NO_QUEUE_MAPPING, 1); *insn++ = BPF_MOV64_IMM(si->dst_reg, -1); #else *insn++ = BPF_MOV64_IMM(si->dst_reg, -1); *target_size = 2; #endif break; } return insn - insn_buf; } static u32 tc_cls_act_convert_ctx_access(enum bpf_access_type type, const struct bpf_insn *si, struct bpf_insn *insn_buf, struct bpf_prog *prog, u32 *target_size) { struct bpf_insn *insn = insn_buf; switch (si->off) { case offsetof(struct __sk_buff, ifindex): *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, dev), si->dst_reg, si->src_reg, offsetof(struct sk_buff, dev)); *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, bpf_target_off(struct net_device, ifindex, 4, target_size)); break; default: return bpf_convert_ctx_access(type, si, insn_buf, prog, target_size); } return insn - insn_buf; } static u32 xdp_convert_ctx_access(enum bpf_access_type type, const struct bpf_insn *si, struct bpf_insn *insn_buf, struct bpf_prog *prog, u32 *target_size) { struct bpf_insn *insn = insn_buf; switch (si->off) { case offsetof(struct xdp_md, data): *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, data), si->dst_reg, si->src_reg, offsetof(struct xdp_buff, data)); break; case offsetof(struct xdp_md, data_meta): *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, data_meta), si->dst_reg, si->src_reg, offsetof(struct xdp_buff, data_meta)); break; case offsetof(struct xdp_md, data_end): *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, data_end), si->dst_reg, si->src_reg, offsetof(struct xdp_buff, data_end)); break; case offsetof(struct xdp_md, ingress_ifindex): *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, rxq), si->dst_reg, si->src_reg, offsetof(struct xdp_buff, rxq)); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_rxq_info, dev), si->dst_reg, si->dst_reg, offsetof(struct xdp_rxq_info, dev)); *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, offsetof(struct net_device, ifindex)); break; case offsetof(struct xdp_md, rx_queue_index): *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, rxq), si->dst_reg, si->src_reg, offsetof(struct xdp_buff, rxq)); *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, offsetof(struct xdp_rxq_info, queue_index)); break; case offsetof(struct xdp_md, egress_ifindex): *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, txq), si->dst_reg, si->src_reg, offsetof(struct xdp_buff, txq)); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_txq_info, dev), si->dst_reg, si->dst_reg, offsetof(struct xdp_txq_info, dev)); *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, offsetof(struct net_device, ifindex)); break; } return insn - insn_buf; } /* SOCK_ADDR_LOAD_NESTED_FIELD() loads Nested Field S.F.NF where S is type of * context Structure, F is Field in context structure that contains a pointer * to Nested Structure of type NS that has the field NF. * * SIZE encodes the load size (BPF_B, BPF_H, etc). It's up to caller to make * sure that SIZE is not greater than actual size of S.F.NF. * * If offset OFF is provided, the load happens from that offset relative to * offset of NF. 
*/ #define SOCK_ADDR_LOAD_NESTED_FIELD_SIZE_OFF(S, NS, F, NF, SIZE, OFF) \ do { \ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(S, F), si->dst_reg, \ si->src_reg, offsetof(S, F)); \ *insn++ = BPF_LDX_MEM( \ SIZE, si->dst_reg, si->dst_reg, \ bpf_target_off(NS, NF, sizeof_field(NS, NF), \ target_size) \ + OFF); \ } while (0) #define SOCK_ADDR_LOAD_NESTED_FIELD(S, NS, F, NF) \ SOCK_ADDR_LOAD_NESTED_FIELD_SIZE_OFF(S, NS, F, NF, \ BPF_FIELD_SIZEOF(NS, NF), 0) /* SOCK_ADDR_STORE_NESTED_FIELD_OFF() has semantic similar to * SOCK_ADDR_LOAD_NESTED_FIELD_SIZE_OFF() but for store operation. * * In addition it uses Temporary Field TF (member of struct S) as the 3rd * "register" since two registers available in convert_ctx_access are not * enough: we can't override neither SRC, since it contains value to store, nor * DST since it contains pointer to context that may be used by later * instructions. But we need a temporary place to save pointer to nested * structure whose field we want to store to. */ #define SOCK_ADDR_STORE_NESTED_FIELD_OFF(S, NS, F, NF, SIZE, OFF, TF) \ do { \ int tmp_reg = BPF_REG_9; \ if (si->src_reg == tmp_reg || si->dst_reg == tmp_reg) \ --tmp_reg; \ if (si->src_reg == tmp_reg || si->dst_reg == tmp_reg) \ --tmp_reg; \ *insn++ = BPF_STX_MEM(BPF_DW, si->dst_reg, tmp_reg, \ offsetof(S, TF)); \ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(S, F), tmp_reg, \ si->dst_reg, offsetof(S, F)); \ *insn++ = BPF_RAW_INSN(SIZE | BPF_MEM | BPF_CLASS(si->code), \ tmp_reg, si->src_reg, \ bpf_target_off(NS, NF, sizeof_field(NS, NF), \ target_size) \ + OFF, \ si->imm); \ *insn++ = BPF_LDX_MEM(BPF_DW, tmp_reg, si->dst_reg, \ offsetof(S, TF)); \ } while (0) #define SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD_SIZE_OFF(S, NS, F, NF, SIZE, OFF, \ TF) \ do { \ if (type == BPF_WRITE) { \ SOCK_ADDR_STORE_NESTED_FIELD_OFF(S, NS, F, NF, SIZE, \ OFF, TF); \ } else { \ SOCK_ADDR_LOAD_NESTED_FIELD_SIZE_OFF( \ S, NS, F, NF, SIZE, OFF); \ } \ } while (0) static u32 sock_addr_convert_ctx_access(enum bpf_access_type type, const struct bpf_insn *si, struct bpf_insn *insn_buf, struct bpf_prog *prog, u32 *target_size) { int off, port_size = sizeof_field(struct sockaddr_in6, sin6_port); struct bpf_insn *insn = insn_buf; switch (si->off) { case offsetof(struct bpf_sock_addr, user_family): SOCK_ADDR_LOAD_NESTED_FIELD(struct bpf_sock_addr_kern, struct sockaddr, uaddr, sa_family); break; case offsetof(struct bpf_sock_addr, user_ip4): SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD_SIZE_OFF( struct bpf_sock_addr_kern, struct sockaddr_in, uaddr, sin_addr, BPF_SIZE(si->code), 0, tmp_reg); break; case bpf_ctx_range_till(struct bpf_sock_addr, user_ip6[0], user_ip6[3]): off = si->off; off -= offsetof(struct bpf_sock_addr, user_ip6[0]); SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD_SIZE_OFF( struct bpf_sock_addr_kern, struct sockaddr_in6, uaddr, sin6_addr.s6_addr32[0], BPF_SIZE(si->code), off, tmp_reg); break; case offsetof(struct bpf_sock_addr, user_port): /* To get port we need to know sa_family first and then treat * sockaddr as either sockaddr_in or sockaddr_in6. * Though we can simplify since port field has same offset and * size in both structures. * Here we check this invariant and use just one of the * structures if it's true. */ BUILD_BUG_ON(offsetof(struct sockaddr_in, sin_port) != offsetof(struct sockaddr_in6, sin6_port)); BUILD_BUG_ON(sizeof_field(struct sockaddr_in, sin_port) != sizeof_field(struct sockaddr_in6, sin6_port)); /* Account for sin6_port being smaller than user_port. 
*/ port_size = min(port_size, BPF_LDST_BYTES(si)); SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD_SIZE_OFF( struct bpf_sock_addr_kern, struct sockaddr_in6, uaddr, sin6_port, bytes_to_bpf_size(port_size), 0, tmp_reg); break; case offsetof(struct bpf_sock_addr, family): SOCK_ADDR_LOAD_NESTED_FIELD(struct bpf_sock_addr_kern, struct sock, sk, sk_family); break; case offsetof(struct bpf_sock_addr, type): SOCK_ADDR_LOAD_NESTED_FIELD(struct bpf_sock_addr_kern, struct sock, sk, sk_type); break; case offsetof(struct bpf_sock_addr, protocol): SOCK_ADDR_LOAD_NESTED_FIELD(struct bpf_sock_addr_kern, struct sock, sk, sk_protocol); break; case offsetof(struct bpf_sock_addr, msg_src_ip4): /* Treat t_ctx as struct in_addr for msg_src_ip4. */ SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD_SIZE_OFF( struct bpf_sock_addr_kern, struct in_addr, t_ctx, s_addr, BPF_SIZE(si->code), 0, tmp_reg); break; case bpf_ctx_range_till(struct bpf_sock_addr, msg_src_ip6[0], msg_src_ip6[3]): off = si->off; off -= offsetof(struct bpf_sock_addr, msg_src_ip6[0]); /* Treat t_ctx as struct in6_addr for msg_src_ip6. */ SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD_SIZE_OFF( struct bpf_sock_addr_kern, struct in6_addr, t_ctx, s6_addr32[0], BPF_SIZE(si->code), off, tmp_reg); break; case offsetof(struct bpf_sock_addr, sk): *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_sock_addr_kern, sk), si->dst_reg, si->src_reg, offsetof(struct bpf_sock_addr_kern, sk)); break; } return insn - insn_buf; } static u32 sock_ops_convert_ctx_access(enum bpf_access_type type, const struct bpf_insn *si, struct bpf_insn *insn_buf, struct bpf_prog *prog, u32 *target_size) { struct bpf_insn *insn = insn_buf; int off; /* Helper macro for adding read access to tcp_sock or sock fields. */ #define SOCK_OPS_GET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ) \ do { \ int fullsock_reg = si->dst_reg, reg = BPF_REG_9, jmp = 2; \ BUILD_BUG_ON(sizeof_field(OBJ, OBJ_FIELD) > \ sizeof_field(struct bpf_sock_ops, BPF_FIELD)); \ if (si->dst_reg == reg || si->src_reg == reg) \ reg--; \ if (si->dst_reg == reg || si->src_reg == reg) \ reg--; \ if (si->dst_reg == si->src_reg) { \ *insn++ = BPF_STX_MEM(BPF_DW, si->src_reg, reg, \ offsetof(struct bpf_sock_ops_kern, \ temp)); \ fullsock_reg = reg; \ jmp += 2; \ } \ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( \ struct bpf_sock_ops_kern, \ is_locked_tcp_sock), \ fullsock_reg, si->src_reg, \ offsetof(struct bpf_sock_ops_kern, \ is_locked_tcp_sock)); \ *insn++ = BPF_JMP_IMM(BPF_JEQ, fullsock_reg, 0, jmp); \ if (si->dst_reg == si->src_reg) \ *insn++ = BPF_LDX_MEM(BPF_DW, reg, si->src_reg, \ offsetof(struct bpf_sock_ops_kern, \ temp)); \ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( \ struct bpf_sock_ops_kern, sk),\ si->dst_reg, si->src_reg, \ offsetof(struct bpf_sock_ops_kern, sk));\ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(OBJ, \ OBJ_FIELD), \ si->dst_reg, si->dst_reg, \ offsetof(OBJ, OBJ_FIELD)); \ if (si->dst_reg == si->src_reg) { \ *insn++ = BPF_JMP_A(1); \ *insn++ = BPF_LDX_MEM(BPF_DW, reg, si->src_reg, \ offsetof(struct bpf_sock_ops_kern, \ temp)); \ } \ } while (0) #define SOCK_OPS_GET_SK() \ do { \ int fullsock_reg = si->dst_reg, reg = BPF_REG_9, jmp = 1; \ if (si->dst_reg == reg || si->src_reg == reg) \ reg--; \ if (si->dst_reg == reg || si->src_reg == reg) \ reg--; \ if (si->dst_reg == si->src_reg) { \ *insn++ = BPF_STX_MEM(BPF_DW, si->src_reg, reg, \ offsetof(struct bpf_sock_ops_kern, \ temp)); \ fullsock_reg = reg; \ jmp += 2; \ } \ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( \ struct bpf_sock_ops_kern, \ is_fullsock), \ fullsock_reg, si->src_reg, \ offsetof(struct 
bpf_sock_ops_kern, \ is_fullsock)); \ *insn++ = BPF_JMP_IMM(BPF_JEQ, fullsock_reg, 0, jmp); \ if (si->dst_reg == si->src_reg) \ *insn++ = BPF_LDX_MEM(BPF_DW, reg, si->src_reg, \ offsetof(struct bpf_sock_ops_kern, \ temp)); \ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( \ struct bpf_sock_ops_kern, sk),\ si->dst_reg, si->src_reg, \ offsetof(struct bpf_sock_ops_kern, sk));\ if (si->dst_reg == si->src_reg) { \ *insn++ = BPF_JMP_A(1); \ *insn++ = BPF_LDX_MEM(BPF_DW, reg, si->src_reg, \ offsetof(struct bpf_sock_ops_kern, \ temp)); \ } \ } while (0) #define SOCK_OPS_GET_TCP_SOCK_FIELD(FIELD) \ SOCK_OPS_GET_FIELD(FIELD, FIELD, struct tcp_sock) /* Helper macro for adding write access to tcp_sock or sock fields. * The macro is called with two registers, dst_reg which contains a pointer * to ctx (context) and src_reg which contains the value that should be * stored. However, we need an additional register since we cannot overwrite * dst_reg because it may be used later in the program. * Instead we "borrow" one of the other register. We first save its value * into a new (temp) field in bpf_sock_ops_kern, use it, and then restore * it at the end of the macro. */ #define SOCK_OPS_SET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ) \ do { \ int reg = BPF_REG_9; \ BUILD_BUG_ON(sizeof_field(OBJ, OBJ_FIELD) > \ sizeof_field(struct bpf_sock_ops, BPF_FIELD)); \ if (si->dst_reg == reg || si->src_reg == reg) \ reg--; \ if (si->dst_reg == reg || si->src_reg == reg) \ reg--; \ *insn++ = BPF_STX_MEM(BPF_DW, si->dst_reg, reg, \ offsetof(struct bpf_sock_ops_kern, \ temp)); \ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( \ struct bpf_sock_ops_kern, \ is_locked_tcp_sock), \ reg, si->dst_reg, \ offsetof(struct bpf_sock_ops_kern, \ is_locked_tcp_sock)); \ *insn++ = BPF_JMP_IMM(BPF_JEQ, reg, 0, 2); \ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( \ struct bpf_sock_ops_kern, sk),\ reg, si->dst_reg, \ offsetof(struct bpf_sock_ops_kern, sk));\ *insn++ = BPF_RAW_INSN(BPF_FIELD_SIZEOF(OBJ, OBJ_FIELD) | \ BPF_MEM | BPF_CLASS(si->code), \ reg, si->src_reg, \ offsetof(OBJ, OBJ_FIELD), \ si->imm); \ *insn++ = BPF_LDX_MEM(BPF_DW, reg, si->dst_reg, \ offsetof(struct bpf_sock_ops_kern, \ temp)); \ } while (0) #define SOCK_OPS_GET_OR_SET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ, TYPE) \ do { \ if (TYPE == BPF_WRITE) \ SOCK_OPS_SET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ); \ else \ SOCK_OPS_GET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ); \ } while (0) switch (si->off) { case offsetof(struct bpf_sock_ops, op): *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_sock_ops_kern, op), si->dst_reg, si->src_reg, offsetof(struct bpf_sock_ops_kern, op)); break; case offsetof(struct bpf_sock_ops, replylong[0]) ... 
offsetof(struct bpf_sock_ops, replylong[3]): BUILD_BUG_ON(sizeof_field(struct bpf_sock_ops, reply) != sizeof_field(struct bpf_sock_ops_kern, reply)); BUILD_BUG_ON(sizeof_field(struct bpf_sock_ops, replylong) != sizeof_field(struct bpf_sock_ops_kern, replylong)); off = si->off; off -= offsetof(struct bpf_sock_ops, replylong[0]); off += offsetof(struct bpf_sock_ops_kern, replylong[0]); if (type == BPF_WRITE) *insn++ = BPF_EMIT_STORE(BPF_W, si, off); else *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, off); break; case offsetof(struct bpf_sock_ops, family): BUILD_BUG_ON(sizeof_field(struct sock_common, skc_family) != 2); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( struct bpf_sock_ops_kern, sk), si->dst_reg, si->src_reg, offsetof(struct bpf_sock_ops_kern, sk)); *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg, offsetof(struct sock_common, skc_family)); break; case offsetof(struct bpf_sock_ops, remote_ip4): BUILD_BUG_ON(sizeof_field(struct sock_common, skc_daddr) != 4); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( struct bpf_sock_ops_kern, sk), si->dst_reg, si->src_reg, offsetof(struct bpf_sock_ops_kern, sk)); *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, offsetof(struct sock_common, skc_daddr)); break; case offsetof(struct bpf_sock_ops, local_ip4): BUILD_BUG_ON(sizeof_field(struct sock_common, skc_rcv_saddr) != 4); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( struct bpf_sock_ops_kern, sk), si->dst_reg, si->src_reg, offsetof(struct bpf_sock_ops_kern, sk)); *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, offsetof(struct sock_common, skc_rcv_saddr)); break; case offsetof(struct bpf_sock_ops, remote_ip6[0]) ... offsetof(struct bpf_sock_ops, remote_ip6[3]): #if IS_ENABLED(CONFIG_IPV6) BUILD_BUG_ON(sizeof_field(struct sock_common, skc_v6_daddr.s6_addr32[0]) != 4); off = si->off; off -= offsetof(struct bpf_sock_ops, remote_ip6[0]); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( struct bpf_sock_ops_kern, sk), si->dst_reg, si->src_reg, offsetof(struct bpf_sock_ops_kern, sk)); *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, offsetof(struct sock_common, skc_v6_daddr.s6_addr32[0]) + off); #else *insn++ = BPF_MOV32_IMM(si->dst_reg, 0); #endif break; case offsetof(struct bpf_sock_ops, local_ip6[0]) ... 
offsetof(struct bpf_sock_ops, local_ip6[3]): #if IS_ENABLED(CONFIG_IPV6) BUILD_BUG_ON(sizeof_field(struct sock_common, skc_v6_rcv_saddr.s6_addr32[0]) != 4); off = si->off; off -= offsetof(struct bpf_sock_ops, local_ip6[0]); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( struct bpf_sock_ops_kern, sk), si->dst_reg, si->src_reg, offsetof(struct bpf_sock_ops_kern, sk)); *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, offsetof(struct sock_common, skc_v6_rcv_saddr.s6_addr32[0]) + off); #else *insn++ = BPF_MOV32_IMM(si->dst_reg, 0); #endif break; case offsetof(struct bpf_sock_ops, remote_port): BUILD_BUG_ON(sizeof_field(struct sock_common, skc_dport) != 2); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( struct bpf_sock_ops_kern, sk), si->dst_reg, si->src_reg, offsetof(struct bpf_sock_ops_kern, sk)); *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg, offsetof(struct sock_common, skc_dport)); #ifndef __BIG_ENDIAN_BITFIELD *insn++ = BPF_ALU32_IMM(BPF_LSH, si->dst_reg, 16); #endif break; case offsetof(struct bpf_sock_ops, local_port): BUILD_BUG_ON(sizeof_field(struct sock_common, skc_num) != 2); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( struct bpf_sock_ops_kern, sk), si->dst_reg, si->src_reg, offsetof(struct bpf_sock_ops_kern, sk)); *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg, offsetof(struct sock_common, skc_num)); break; case offsetof(struct bpf_sock_ops, is_fullsock): *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( struct bpf_sock_ops_kern, is_fullsock), si->dst_reg, si->src_reg, offsetof(struct bpf_sock_ops_kern, is_fullsock)); break; case offsetof(struct bpf_sock_ops, state): BUILD_BUG_ON(sizeof_field(struct sock_common, skc_state) != 1); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( struct bpf_sock_ops_kern, sk), si->dst_reg, si->src_reg, offsetof(struct bpf_sock_ops_kern, sk)); *insn++ = BPF_LDX_MEM(BPF_B, si->dst_reg, si->dst_reg, offsetof(struct sock_common, skc_state)); break; case offsetof(struct bpf_sock_ops, rtt_min): BUILD_BUG_ON(sizeof_field(struct tcp_sock, rtt_min) != sizeof(struct minmax)); BUILD_BUG_ON(sizeof(struct minmax) < sizeof(struct minmax_sample)); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( struct bpf_sock_ops_kern, sk), si->dst_reg, si->src_reg, offsetof(struct bpf_sock_ops_kern, sk)); *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, offsetof(struct tcp_sock, rtt_min) + sizeof_field(struct minmax_sample, t)); break; case offsetof(struct bpf_sock_ops, bpf_sock_ops_cb_flags): SOCK_OPS_GET_FIELD(bpf_sock_ops_cb_flags, bpf_sock_ops_cb_flags, struct tcp_sock); break; case offsetof(struct bpf_sock_ops, sk_txhash): SOCK_OPS_GET_OR_SET_FIELD(sk_txhash, sk_txhash, struct sock, type); break; case offsetof(struct bpf_sock_ops, snd_cwnd): SOCK_OPS_GET_TCP_SOCK_FIELD(snd_cwnd); break; case offsetof(struct bpf_sock_ops, srtt_us): SOCK_OPS_GET_TCP_SOCK_FIELD(srtt_us); break; case offsetof(struct bpf_sock_ops, snd_ssthresh): SOCK_OPS_GET_TCP_SOCK_FIELD(snd_ssthresh); break; case offsetof(struct bpf_sock_ops, rcv_nxt): SOCK_OPS_GET_TCP_SOCK_FIELD(rcv_nxt); break; case offsetof(struct bpf_sock_ops, snd_nxt): SOCK_OPS_GET_TCP_SOCK_FIELD(snd_nxt); break; case offsetof(struct bpf_sock_ops, snd_una): SOCK_OPS_GET_TCP_SOCK_FIELD(snd_una); break; case offsetof(struct bpf_sock_ops, mss_cache): SOCK_OPS_GET_TCP_SOCK_FIELD(mss_cache); break; case offsetof(struct bpf_sock_ops, ecn_flags): SOCK_OPS_GET_TCP_SOCK_FIELD(ecn_flags); break; case offsetof(struct bpf_sock_ops, rate_delivered): SOCK_OPS_GET_TCP_SOCK_FIELD(rate_delivered); break; case offsetof(struct bpf_sock_ops, rate_interval_us): 
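	/* Illustrative sketch (not part of this file): a sockops program
	 * consuming the tcp_sock fields rewritten by the
	 * SOCK_OPS_GET_TCP_SOCK_FIELD() cases around here.  The generated
	 * code tests is_locked_tcp_sock before dereferencing the socket, so
	 * a non-full socket reads back 0 instead of faulting.  Assumes a
	 * libbpf-style build; the program name is made up, and the RTT
	 * callback only fires if BPF_SOCK_OPS_RTT_CB_FLAG was enabled
	 * elsewhere via bpf_sock_ops_cb_flags_set().
	 *
	 *	// SPDX-License-Identifier: GPL-2.0
	 *	#include <linux/bpf.h>
	 *	#include <bpf/bpf_helpers.h>
	 *
	 *	SEC("sockops")
	 *	int sockops_rtt_example(struct bpf_sock_ops *skops)
	 *	{
	 *		__u32 srtt = skops->srtt_us;	// SOCK_OPS_GET_TCP_SOCK_FIELD
	 *		__u32 cwnd = skops->snd_cwnd;	// SOCK_OPS_GET_TCP_SOCK_FIELD
	 *
	 *		if (skops->op == BPF_SOCK_OPS_RTT_CB)
	 *			skops->reply = srtt && cwnd;	// reply is writable
	 *
	 *		return 1;
	 *	}
	 *
	 *	char _license[] SEC("license") = "GPL";
	 */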
SOCK_OPS_GET_TCP_SOCK_FIELD(rate_interval_us); break; case offsetof(struct bpf_sock_ops, packets_out): SOCK_OPS_GET_TCP_SOCK_FIELD(packets_out); break; case offsetof(struct bpf_sock_ops, retrans_out): SOCK_OPS_GET_TCP_SOCK_FIELD(retrans_out); break; case offsetof(struct bpf_sock_ops, total_retrans): SOCK_OPS_GET_TCP_SOCK_FIELD(total_retrans); break; case offsetof(struct bpf_sock_ops, segs_in): SOCK_OPS_GET_TCP_SOCK_FIELD(segs_in); break; case offsetof(struct bpf_sock_ops, data_segs_in): SOCK_OPS_GET_TCP_SOCK_FIELD(data_segs_in); break; case offsetof(struct bpf_sock_ops, segs_out): SOCK_OPS_GET_TCP_SOCK_FIELD(segs_out); break; case offsetof(struct bpf_sock_ops, data_segs_out): SOCK_OPS_GET_TCP_SOCK_FIELD(data_segs_out); break; case offsetof(struct bpf_sock_ops, lost_out): SOCK_OPS_GET_TCP_SOCK_FIELD(lost_out); break; case offsetof(struct bpf_sock_ops, sacked_out): SOCK_OPS_GET_TCP_SOCK_FIELD(sacked_out); break; case offsetof(struct bpf_sock_ops, bytes_received): SOCK_OPS_GET_TCP_SOCK_FIELD(bytes_received); break; case offsetof(struct bpf_sock_ops, bytes_acked): SOCK_OPS_GET_TCP_SOCK_FIELD(bytes_acked); break; case offsetof(struct bpf_sock_ops, sk): SOCK_OPS_GET_SK(); break; case offsetof(struct bpf_sock_ops, skb_data_end): *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_sock_ops_kern, skb_data_end), si->dst_reg, si->src_reg, offsetof(struct bpf_sock_ops_kern, skb_data_end)); break; case offsetof(struct bpf_sock_ops, skb_data): *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_sock_ops_kern, skb), si->dst_reg, si->src_reg, offsetof(struct bpf_sock_ops_kern, skb)); *insn++ = BPF_JMP_IMM(BPF_JEQ, si->dst_reg, 0, 1); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, data), si->dst_reg, si->dst_reg, offsetof(struct sk_buff, data)); break; case offsetof(struct bpf_sock_ops, skb_len): *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_sock_ops_kern, skb), si->dst_reg, si->src_reg, offsetof(struct bpf_sock_ops_kern, skb)); *insn++ = BPF_JMP_IMM(BPF_JEQ, si->dst_reg, 0, 1); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, len), si->dst_reg, si->dst_reg, offsetof(struct sk_buff, len)); break; case offsetof(struct bpf_sock_ops, skb_tcp_flags): off = offsetof(struct sk_buff, cb); off += offsetof(struct tcp_skb_cb, tcp_flags); *target_size = sizeof_field(struct tcp_skb_cb, tcp_flags); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_sock_ops_kern, skb), si->dst_reg, si->src_reg, offsetof(struct bpf_sock_ops_kern, skb)); *insn++ = BPF_JMP_IMM(BPF_JEQ, si->dst_reg, 0, 1); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct tcp_skb_cb, tcp_flags), si->dst_reg, si->dst_reg, off); break; case offsetof(struct bpf_sock_ops, skb_hwtstamp): { struct bpf_insn *jmp_on_null_skb; *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_sock_ops_kern, skb), si->dst_reg, si->src_reg, offsetof(struct bpf_sock_ops_kern, skb)); /* Reserve one insn to test skb == NULL */ jmp_on_null_skb = insn++; insn = bpf_convert_shinfo_access(si->dst_reg, si->dst_reg, insn); *insn++ = BPF_LDX_MEM(BPF_DW, si->dst_reg, si->dst_reg, bpf_target_off(struct skb_shared_info, hwtstamps, 8, target_size)); *jmp_on_null_skb = BPF_JMP_IMM(BPF_JEQ, si->dst_reg, 0, insn - jmp_on_null_skb - 1); break; } } return insn - insn_buf; } /* data_end = skb->data + skb_headlen() */ static struct bpf_insn *bpf_convert_data_end_access(const struct bpf_insn *si, struct bpf_insn *insn) { int reg; int temp_reg_off = offsetof(struct sk_buff, cb) + offsetof(struct sk_skb_cb, temp_reg); if (si->src_reg == si->dst_reg) { /* We need an extra register, choose and save a 
register. */ reg = BPF_REG_9; if (si->src_reg == reg || si->dst_reg == reg) reg--; if (si->src_reg == reg || si->dst_reg == reg) reg--; *insn++ = BPF_STX_MEM(BPF_DW, si->src_reg, reg, temp_reg_off); } else { reg = si->dst_reg; } /* reg = skb->data */ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, data), reg, si->src_reg, offsetof(struct sk_buff, data)); /* AX = skb->len */ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, len), BPF_REG_AX, si->src_reg, offsetof(struct sk_buff, len)); /* reg = skb->data + skb->len */ *insn++ = BPF_ALU64_REG(BPF_ADD, reg, BPF_REG_AX); /* AX = skb->data_len */ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, data_len), BPF_REG_AX, si->src_reg, offsetof(struct sk_buff, data_len)); /* reg = skb->data + skb->len - skb->data_len */ *insn++ = BPF_ALU64_REG(BPF_SUB, reg, BPF_REG_AX); if (si->src_reg == si->dst_reg) { /* Restore the saved register */ *insn++ = BPF_MOV64_REG(BPF_REG_AX, si->src_reg); *insn++ = BPF_MOV64_REG(si->dst_reg, reg); *insn++ = BPF_LDX_MEM(BPF_DW, reg, BPF_REG_AX, temp_reg_off); } return insn; } static u32 sk_skb_convert_ctx_access(enum bpf_access_type type, const struct bpf_insn *si, struct bpf_insn *insn_buf, struct bpf_prog *prog, u32 *target_size) { struct bpf_insn *insn = insn_buf; int off; switch (si->off) { case offsetof(struct __sk_buff, data_end): insn = bpf_convert_data_end_access(si, insn); break; case offsetof(struct __sk_buff, cb[0]) ... offsetofend(struct __sk_buff, cb[4]) - 1: BUILD_BUG_ON(sizeof_field(struct sk_skb_cb, data) < 20); BUILD_BUG_ON((offsetof(struct sk_buff, cb) + offsetof(struct sk_skb_cb, data)) % sizeof(__u64)); prog->cb_access = 1; off = si->off; off -= offsetof(struct __sk_buff, cb[0]); off += offsetof(struct sk_buff, cb); off += offsetof(struct sk_skb_cb, data); if (type == BPF_WRITE) *insn++ = BPF_EMIT_STORE(BPF_SIZE(si->code), si, off); else *insn++ = BPF_LDX_MEM(BPF_SIZE(si->code), si->dst_reg, si->src_reg, off); break; default: return bpf_convert_ctx_access(type, si, insn_buf, prog, target_size); } return insn - insn_buf; } static u32 sk_msg_convert_ctx_access(enum bpf_access_type type, const struct bpf_insn *si, struct bpf_insn *insn_buf, struct bpf_prog *prog, u32 *target_size) { struct bpf_insn *insn = insn_buf; #if IS_ENABLED(CONFIG_IPV6) int off; #endif /* convert ctx uses the fact sg element is first in struct */ BUILD_BUG_ON(offsetof(struct sk_msg, sg) != 0); switch (si->off) { case offsetof(struct sk_msg_md, data): *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_msg, data), si->dst_reg, si->src_reg, offsetof(struct sk_msg, data)); break; case offsetof(struct sk_msg_md, data_end): *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_msg, data_end), si->dst_reg, si->src_reg, offsetof(struct sk_msg, data_end)); break; case offsetof(struct sk_msg_md, family): BUILD_BUG_ON(sizeof_field(struct sock_common, skc_family) != 2); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( struct sk_msg, sk), si->dst_reg, si->src_reg, offsetof(struct sk_msg, sk)); *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg, offsetof(struct sock_common, skc_family)); break; case offsetof(struct sk_msg_md, remote_ip4): BUILD_BUG_ON(sizeof_field(struct sock_common, skc_daddr) != 4); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( struct sk_msg, sk), si->dst_reg, si->src_reg, offsetof(struct sk_msg, sk)); *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, offsetof(struct sock_common, skc_daddr)); break; case offsetof(struct sk_msg_md, local_ip4): BUILD_BUG_ON(sizeof_field(struct sock_common, skc_rcv_saddr) != 4); *insn++ = 
BPF_LDX_MEM(BPF_FIELD_SIZEOF( struct sk_msg, sk), si->dst_reg, si->src_reg, offsetof(struct sk_msg, sk)); *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, offsetof(struct sock_common, skc_rcv_saddr)); break; case offsetof(struct sk_msg_md, remote_ip6[0]) ... offsetof(struct sk_msg_md, remote_ip6[3]): #if IS_ENABLED(CONFIG_IPV6) BUILD_BUG_ON(sizeof_field(struct sock_common, skc_v6_daddr.s6_addr32[0]) != 4); off = si->off; off -= offsetof(struct sk_msg_md, remote_ip6[0]); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( struct sk_msg, sk), si->dst_reg, si->src_reg, offsetof(struct sk_msg, sk)); *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, offsetof(struct sock_common, skc_v6_daddr.s6_addr32[0]) + off); #else *insn++ = BPF_MOV32_IMM(si->dst_reg, 0); #endif break; case offsetof(struct sk_msg_md, local_ip6[0]) ... offsetof(struct sk_msg_md, local_ip6[3]): #if IS_ENABLED(CONFIG_IPV6) BUILD_BUG_ON(sizeof_field(struct sock_common, skc_v6_rcv_saddr.s6_addr32[0]) != 4); off = si->off; off -= offsetof(struct sk_msg_md, local_ip6[0]); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( struct sk_msg, sk), si->dst_reg, si->src_reg, offsetof(struct sk_msg, sk)); *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, offsetof(struct sock_common, skc_v6_rcv_saddr.s6_addr32[0]) + off); #else *insn++ = BPF_MOV32_IMM(si->dst_reg, 0); #endif break; case offsetof(struct sk_msg_md, remote_port): BUILD_BUG_ON(sizeof_field(struct sock_common, skc_dport) != 2); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( struct sk_msg, sk), si->dst_reg, si->src_reg, offsetof(struct sk_msg, sk)); *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg, offsetof(struct sock_common, skc_dport)); #ifndef __BIG_ENDIAN_BITFIELD *insn++ = BPF_ALU32_IMM(BPF_LSH, si->dst_reg, 16); #endif break; case offsetof(struct sk_msg_md, local_port): BUILD_BUG_ON(sizeof_field(struct sock_common, skc_num) != 2); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( struct sk_msg, sk), si->dst_reg, si->src_reg, offsetof(struct sk_msg, sk)); *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg, offsetof(struct sock_common, skc_num)); break; case offsetof(struct sk_msg_md, size): *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_msg_sg, size), si->dst_reg, si->src_reg, offsetof(struct sk_msg_sg, size)); break; case offsetof(struct sk_msg_md, sk): *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_msg, sk), si->dst_reg, si->src_reg, offsetof(struct sk_msg, sk)); break; } return insn - insn_buf; } const struct bpf_verifier_ops sk_filter_verifier_ops = { .get_func_proto = sk_filter_func_proto, .is_valid_access = sk_filter_is_valid_access, .convert_ctx_access = bpf_convert_ctx_access, .gen_ld_abs = bpf_gen_ld_abs, }; const struct bpf_prog_ops sk_filter_prog_ops = { .test_run = bpf_prog_test_run_skb, }; const struct bpf_verifier_ops tc_cls_act_verifier_ops = { .get_func_proto = tc_cls_act_func_proto, .is_valid_access = tc_cls_act_is_valid_access, .convert_ctx_access = tc_cls_act_convert_ctx_access, .gen_prologue = tc_cls_act_prologue, .gen_ld_abs = bpf_gen_ld_abs, .btf_struct_access = tc_cls_act_btf_struct_access, }; const struct bpf_prog_ops tc_cls_act_prog_ops = { .test_run = bpf_prog_test_run_skb, }; const struct bpf_verifier_ops xdp_verifier_ops = { .get_func_proto = xdp_func_proto, .is_valid_access = xdp_is_valid_access, .convert_ctx_access = xdp_convert_ctx_access, .gen_prologue = bpf_noop_prologue, .btf_struct_access = xdp_btf_struct_access, }; const struct bpf_prog_ops xdp_prog_ops = { .test_run = bpf_prog_test_run_xdp, }; const struct bpf_verifier_ops cg_skb_verifier_ops = { 
.get_func_proto = cg_skb_func_proto, .is_valid_access = cg_skb_is_valid_access, .convert_ctx_access = bpf_convert_ctx_access, }; const struct bpf_prog_ops cg_skb_prog_ops = { .test_run = bpf_prog_test_run_skb, }; const struct bpf_verifier_ops lwt_in_verifier_ops = { .get_func_proto = lwt_in_func_proto, .is_valid_access = lwt_is_valid_access, .convert_ctx_access = bpf_convert_ctx_access, }; const struct bpf_prog_ops lwt_in_prog_ops = { .test_run = bpf_prog_test_run_skb, }; const struct bpf_verifier_ops lwt_out_verifier_ops = { .get_func_proto = lwt_out_func_proto, .is_valid_access = lwt_is_valid_access, .convert_ctx_access = bpf_convert_ctx_access, }; const struct bpf_prog_ops lwt_out_prog_ops = { .test_run = bpf_prog_test_run_skb, }; const struct bpf_verifier_ops lwt_xmit_verifier_ops = { .get_func_proto = lwt_xmit_func_proto, .is_valid_access = lwt_is_valid_access, .convert_ctx_access = bpf_convert_ctx_access, .gen_prologue = tc_cls_act_prologue, }; const struct bpf_prog_ops lwt_xmit_prog_ops = { .test_run = bpf_prog_test_run_skb, }; const struct bpf_verifier_ops lwt_seg6local_verifier_ops = { .get_func_proto = lwt_seg6local_func_proto, .is_valid_access = lwt_is_valid_access, .convert_ctx_access = bpf_convert_ctx_access, }; const struct bpf_prog_ops lwt_seg6local_prog_ops = { }; const struct bpf_verifier_ops cg_sock_verifier_ops = { .get_func_proto = sock_filter_func_proto, .is_valid_access = sock_filter_is_valid_access, .convert_ctx_access = bpf_sock_convert_ctx_access, }; const struct bpf_prog_ops cg_sock_prog_ops = { }; const struct bpf_verifier_ops cg_sock_addr_verifier_ops = { .get_func_proto = sock_addr_func_proto, .is_valid_access = sock_addr_is_valid_access, .convert_ctx_access = sock_addr_convert_ctx_access, }; const struct bpf_prog_ops cg_sock_addr_prog_ops = { }; const struct bpf_verifier_ops sock_ops_verifier_ops = { .get_func_proto = sock_ops_func_proto, .is_valid_access = sock_ops_is_valid_access, .convert_ctx_access = sock_ops_convert_ctx_access, }; const struct bpf_prog_ops sock_ops_prog_ops = { }; const struct bpf_verifier_ops sk_skb_verifier_ops = { .get_func_proto = sk_skb_func_proto, .is_valid_access = sk_skb_is_valid_access, .convert_ctx_access = sk_skb_convert_ctx_access, .gen_prologue = sk_skb_prologue, }; const struct bpf_prog_ops sk_skb_prog_ops = { }; const struct bpf_verifier_ops sk_msg_verifier_ops = { .get_func_proto = sk_msg_func_proto, .is_valid_access = sk_msg_is_valid_access, .convert_ctx_access = sk_msg_convert_ctx_access, .gen_prologue = bpf_noop_prologue, }; const struct bpf_prog_ops sk_msg_prog_ops = { }; const struct bpf_verifier_ops flow_dissector_verifier_ops = { .get_func_proto = flow_dissector_func_proto, .is_valid_access = flow_dissector_is_valid_access, .convert_ctx_access = flow_dissector_convert_ctx_access, }; const struct bpf_prog_ops flow_dissector_prog_ops = { .test_run = bpf_prog_test_run_flow_dissector, }; int sk_detach_filter(struct sock *sk) { int ret = -ENOENT; struct sk_filter *filter; if (sock_flag(sk, SOCK_FILTER_LOCKED)) return -EPERM; filter = rcu_dereference_protected(sk->sk_filter, lockdep_sock_is_held(sk)); if (filter) { RCU_INIT_POINTER(sk->sk_filter, NULL); sk_filter_uncharge(sk, filter); ret = 0; } return ret; } EXPORT_SYMBOL_GPL(sk_detach_filter); int sk_get_filter(struct sock *sk, sockptr_t optval, unsigned int len) { struct sock_fprog_kern *fprog; struct sk_filter *filter; int ret = 0; sockopt_lock_sock(sk); filter = rcu_dereference_protected(sk->sk_filter, lockdep_sock_is_held(sk)); if (!filter) goto out; /* We're 
copying the filter that has been originally attached, * so no conversion/decode needed anymore. eBPF programs that * have no original program cannot be dumped through this. */ ret = -EACCES; fprog = filter->prog->orig_prog; if (!fprog) goto out; ret = fprog->len; if (!len) /* User space only enquires number of filter blocks. */ goto out; ret = -EINVAL; if (len < fprog->len) goto out; ret = -EFAULT; if (copy_to_sockptr(optval, fprog->filter, bpf_classic_proglen(fprog))) goto out; /* Instead of bytes, the API requests to return the number * of filter blocks. */ ret = fprog->len; out: sockopt_release_sock(sk); return ret; } #ifdef CONFIG_INET static void bpf_init_reuseport_kern(struct sk_reuseport_kern *reuse_kern, struct sock_reuseport *reuse, struct sock *sk, struct sk_buff *skb, struct sock *migrating_sk, u32 hash) { reuse_kern->skb = skb; reuse_kern->sk = sk; reuse_kern->selected_sk = NULL; reuse_kern->migrating_sk = migrating_sk; reuse_kern->data_end = skb->data + skb_headlen(skb); reuse_kern->hash = hash; reuse_kern->reuseport_id = reuse->reuseport_id; reuse_kern->bind_inany = reuse->bind_inany; } struct sock *bpf_run_sk_reuseport(struct sock_reuseport *reuse, struct sock *sk, struct bpf_prog *prog, struct sk_buff *skb, struct sock *migrating_sk, u32 hash) { struct sk_reuseport_kern reuse_kern; enum sk_action action; bpf_init_reuseport_kern(&reuse_kern, reuse, sk, skb, migrating_sk, hash); action = bpf_prog_run(prog, &reuse_kern); if (action == SK_PASS) return reuse_kern.selected_sk; else return ERR_PTR(-ECONNREFUSED); } BPF_CALL_4(sk_select_reuseport, struct sk_reuseport_kern *, reuse_kern, struct bpf_map *, map, void *, key, u32, flags) { bool is_sockarray = map->map_type == BPF_MAP_TYPE_REUSEPORT_SOCKARRAY; struct sock_reuseport *reuse; struct sock *selected_sk; int err; selected_sk = map->ops->map_lookup_elem(map, key); if (!selected_sk) return -ENOENT; reuse = rcu_dereference(selected_sk->sk_reuseport_cb); if (!reuse) { /* reuseport_array has only sk with non NULL sk_reuseport_cb. * The only (!reuse) case here is - the sk has already been * unhashed (e.g. by close()), so treat it as -ENOENT. * * Other maps (e.g. sock_map) do not provide this guarantee and * the sk may never be in the reuseport group to begin with. */ err = is_sockarray ? -ENOENT : -EINVAL; goto error; } if (unlikely(reuse->reuseport_id != reuse_kern->reuseport_id)) { struct sock *sk = reuse_kern->sk; if (sk->sk_protocol != selected_sk->sk_protocol) { err = -EPROTOTYPE; } else if (sk->sk_family != selected_sk->sk_family) { err = -EAFNOSUPPORT; } else { /* Catch all. Likely bound to a different sockaddr. */ err = -EBADFD; } goto error; } reuse_kern->selected_sk = selected_sk; return 0; error: /* Lookup in sock_map can return TCP ESTABLISHED sockets. 
*/ if (sk_is_refcounted(selected_sk)) sock_put(selected_sk); return err; } static const struct bpf_func_proto sk_select_reuseport_proto = { .func = sk_select_reuseport, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_CONST_MAP_PTR, .arg3_type = ARG_PTR_TO_MAP_KEY, .arg4_type = ARG_ANYTHING, }; BPF_CALL_4(sk_reuseport_load_bytes, const struct sk_reuseport_kern *, reuse_kern, u32, offset, void *, to, u32, len) { return ____bpf_skb_load_bytes(reuse_kern->skb, offset, to, len); } static const struct bpf_func_proto sk_reuseport_load_bytes_proto = { .func = sk_reuseport_load_bytes, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_PTR_TO_UNINIT_MEM, .arg4_type = ARG_CONST_SIZE, }; BPF_CALL_5(sk_reuseport_load_bytes_relative, const struct sk_reuseport_kern *, reuse_kern, u32, offset, void *, to, u32, len, u32, start_header) { return ____bpf_skb_load_bytes_relative(reuse_kern->skb, offset, to, len, start_header); } static const struct bpf_func_proto sk_reuseport_load_bytes_relative_proto = { .func = sk_reuseport_load_bytes_relative, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_PTR_TO_UNINIT_MEM, .arg4_type = ARG_CONST_SIZE, .arg5_type = ARG_ANYTHING, }; static const struct bpf_func_proto * sk_reuseport_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { case BPF_FUNC_sk_select_reuseport: return &sk_select_reuseport_proto; case BPF_FUNC_skb_load_bytes: return &sk_reuseport_load_bytes_proto; case BPF_FUNC_skb_load_bytes_relative: return &sk_reuseport_load_bytes_relative_proto; case BPF_FUNC_get_socket_cookie: return &bpf_get_socket_ptr_cookie_proto; case BPF_FUNC_ktime_get_coarse_ns: return &bpf_ktime_get_coarse_ns_proto; default: return bpf_base_func_proto(func_id, prog); } } static bool sk_reuseport_is_valid_access(int off, int size, enum bpf_access_type type, const struct bpf_prog *prog, struct bpf_insn_access_aux *info) { const u32 size_default = sizeof(__u32); if (off < 0 || off >= sizeof(struct sk_reuseport_md) || off % size || type != BPF_READ) return false; switch (off) { case offsetof(struct sk_reuseport_md, data): info->reg_type = PTR_TO_PACKET; return size == sizeof(__u64); case offsetof(struct sk_reuseport_md, data_end): info->reg_type = PTR_TO_PACKET_END; return size == sizeof(__u64); case offsetof(struct sk_reuseport_md, hash): return size == size_default; case offsetof(struct sk_reuseport_md, sk): info->reg_type = PTR_TO_SOCKET; return size == sizeof(__u64); case offsetof(struct sk_reuseport_md, migrating_sk): info->reg_type = PTR_TO_SOCK_COMMON_OR_NULL; return size == sizeof(__u64); /* Fields that allow narrowing */ case bpf_ctx_range(struct sk_reuseport_md, eth_protocol): if (size < sizeof_field(struct sk_buff, protocol)) return false; fallthrough; case bpf_ctx_range(struct sk_reuseport_md, ip_protocol): case bpf_ctx_range(struct sk_reuseport_md, bind_inany): case bpf_ctx_range(struct sk_reuseport_md, len): bpf_ctx_record_field_size(info, size_default); return bpf_ctx_narrow_access_ok(off, size, size_default); default: return false; } } #define SK_REUSEPORT_LOAD_FIELD(F) ({ \ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_reuseport_kern, F), \ si->dst_reg, si->src_reg, \ bpf_target_off(struct sk_reuseport_kern, F, \ sizeof_field(struct sk_reuseport_kern, F), \ target_size)); \ }) #define SK_REUSEPORT_LOAD_SKB_FIELD(SKB_FIELD) \ SOCK_ADDR_LOAD_NESTED_FIELD(struct 
sk_reuseport_kern, \ struct sk_buff, \ skb, \ SKB_FIELD) #define SK_REUSEPORT_LOAD_SK_FIELD(SK_FIELD) \ SOCK_ADDR_LOAD_NESTED_FIELD(struct sk_reuseport_kern, \ struct sock, \ sk, \ SK_FIELD) static u32 sk_reuseport_convert_ctx_access(enum bpf_access_type type, const struct bpf_insn *si, struct bpf_insn *insn_buf, struct bpf_prog *prog, u32 *target_size) { struct bpf_insn *insn = insn_buf; switch (si->off) { case offsetof(struct sk_reuseport_md, data): SK_REUSEPORT_LOAD_SKB_FIELD(data); break; case offsetof(struct sk_reuseport_md, len): SK_REUSEPORT_LOAD_SKB_FIELD(len); break; case offsetof(struct sk_reuseport_md, eth_protocol): SK_REUSEPORT_LOAD_SKB_FIELD(protocol); break; case offsetof(struct sk_reuseport_md, ip_protocol): SK_REUSEPORT_LOAD_SK_FIELD(sk_protocol); break; case offsetof(struct sk_reuseport_md, data_end): SK_REUSEPORT_LOAD_FIELD(data_end); break; case offsetof(struct sk_reuseport_md, hash): SK_REUSEPORT_LOAD_FIELD(hash); break; case offsetof(struct sk_reuseport_md, bind_inany): SK_REUSEPORT_LOAD_FIELD(bind_inany); break; case offsetof(struct sk_reuseport_md, sk): SK_REUSEPORT_LOAD_FIELD(sk); break; case offsetof(struct sk_reuseport_md, migrating_sk): SK_REUSEPORT_LOAD_FIELD(migrating_sk); break; } return insn - insn_buf; } const struct bpf_verifier_ops sk_reuseport_verifier_ops = { .get_func_proto = sk_reuseport_func_proto, .is_valid_access = sk_reuseport_is_valid_access, .convert_ctx_access = sk_reuseport_convert_ctx_access, }; const struct bpf_prog_ops sk_reuseport_prog_ops = { }; DEFINE_STATIC_KEY_FALSE(bpf_sk_lookup_enabled); EXPORT_SYMBOL(bpf_sk_lookup_enabled); BPF_CALL_3(bpf_sk_lookup_assign, struct bpf_sk_lookup_kern *, ctx, struct sock *, sk, u64, flags) { if (unlikely(flags & ~(BPF_SK_LOOKUP_F_REPLACE | BPF_SK_LOOKUP_F_NO_REUSEPORT))) return -EINVAL; if (unlikely(sk && sk_is_refcounted(sk))) return -ESOCKTNOSUPPORT; /* reject non-RCU freed sockets */ if (unlikely(sk && sk_is_tcp(sk) && sk->sk_state != TCP_LISTEN)) return -ESOCKTNOSUPPORT; /* only accept TCP socket in LISTEN */ if (unlikely(sk && sk_is_udp(sk) && sk->sk_state != TCP_CLOSE)) return -ESOCKTNOSUPPORT; /* only accept UDP socket in CLOSE */ /* Check if socket is suitable for packet L3/L4 protocol */ if (sk && sk->sk_protocol != ctx->protocol) return -EPROTOTYPE; if (sk && sk->sk_family != ctx->family && (sk->sk_family == AF_INET || ipv6_only_sock(sk))) return -EAFNOSUPPORT; if (ctx->selected_sk && !(flags & BPF_SK_LOOKUP_F_REPLACE)) return -EEXIST; /* Select socket as lookup result */ ctx->selected_sk = sk; ctx->no_reuseport = flags & BPF_SK_LOOKUP_F_NO_REUSEPORT; return 0; } static const struct bpf_func_proto bpf_sk_lookup_assign_proto = { .func = bpf_sk_lookup_assign, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_PTR_TO_SOCKET_OR_NULL, .arg3_type = ARG_ANYTHING, }; static const struct bpf_func_proto * sk_lookup_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { case BPF_FUNC_perf_event_output: return &bpf_event_output_data_proto; case BPF_FUNC_sk_assign: return &bpf_sk_lookup_assign_proto; case BPF_FUNC_sk_release: return &bpf_sk_release_proto; default: return bpf_sk_base_func_proto(func_id, prog); } } static bool sk_lookup_is_valid_access(int off, int size, enum bpf_access_type type, const struct bpf_prog *prog, struct bpf_insn_access_aux *info) { if (off < 0 || off >= sizeof(struct bpf_sk_lookup)) return false; if (off % size != 0) return false; if (type != BPF_READ) return false; switch (off) { case 
bpf_ctx_range_ptr(struct bpf_sk_lookup, sk): info->reg_type = PTR_TO_SOCKET_OR_NULL; return size == sizeof(__u64); case bpf_ctx_range(struct bpf_sk_lookup, family): case bpf_ctx_range(struct bpf_sk_lookup, protocol): case bpf_ctx_range(struct bpf_sk_lookup, remote_ip4): case bpf_ctx_range(struct bpf_sk_lookup, local_ip4): case bpf_ctx_range_till(struct bpf_sk_lookup, remote_ip6[0], remote_ip6[3]): case bpf_ctx_range_till(struct bpf_sk_lookup, local_ip6[0], local_ip6[3]): case bpf_ctx_range(struct bpf_sk_lookup, local_port): case bpf_ctx_range(struct bpf_sk_lookup, ingress_ifindex): bpf_ctx_record_field_size(info, sizeof(__u32)); return bpf_ctx_narrow_access_ok(off, size, sizeof(__u32)); case bpf_ctx_range(struct bpf_sk_lookup, remote_port): /* Allow 4-byte access to 2-byte field for backward compatibility */ if (size == sizeof(__u32)) return true; bpf_ctx_record_field_size(info, sizeof(__be16)); return bpf_ctx_narrow_access_ok(off, size, sizeof(__be16)); case offsetofend(struct bpf_sk_lookup, remote_port) ... offsetof(struct bpf_sk_lookup, local_ip4) - 1: /* Allow access to zero padding for backward compatibility */ bpf_ctx_record_field_size(info, sizeof(__u16)); return bpf_ctx_narrow_access_ok(off, size, sizeof(__u16)); default: return false; } } static u32 sk_lookup_convert_ctx_access(enum bpf_access_type type, const struct bpf_insn *si, struct bpf_insn *insn_buf, struct bpf_prog *prog, u32 *target_size) { struct bpf_insn *insn = insn_buf; switch (si->off) { case offsetof(struct bpf_sk_lookup, sk): *insn++ = BPF_LDX_MEM(BPF_SIZEOF(void *), si->dst_reg, si->src_reg, offsetof(struct bpf_sk_lookup_kern, selected_sk)); break; case offsetof(struct bpf_sk_lookup, family): *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg, bpf_target_off(struct bpf_sk_lookup_kern, family, 2, target_size)); break; case offsetof(struct bpf_sk_lookup, protocol): *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg, bpf_target_off(struct bpf_sk_lookup_kern, protocol, 2, target_size)); break; case offsetof(struct bpf_sk_lookup, remote_ip4): *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, bpf_target_off(struct bpf_sk_lookup_kern, v4.saddr, 4, target_size)); break; case offsetof(struct bpf_sk_lookup, local_ip4): *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, bpf_target_off(struct bpf_sk_lookup_kern, v4.daddr, 4, target_size)); break; case bpf_ctx_range_till(struct bpf_sk_lookup, remote_ip6[0], remote_ip6[3]): { #if IS_ENABLED(CONFIG_IPV6) int off = si->off; off -= offsetof(struct bpf_sk_lookup, remote_ip6[0]); off += bpf_target_off(struct in6_addr, s6_addr32[0], 4, target_size); *insn++ = BPF_LDX_MEM(BPF_SIZEOF(void *), si->dst_reg, si->src_reg, offsetof(struct bpf_sk_lookup_kern, v6.saddr)); *insn++ = BPF_JMP_IMM(BPF_JEQ, si->dst_reg, 0, 1); *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, off); #else *insn++ = BPF_MOV32_IMM(si->dst_reg, 0); #endif break; } case bpf_ctx_range_till(struct bpf_sk_lookup, local_ip6[0], local_ip6[3]): { #if IS_ENABLED(CONFIG_IPV6) int off = si->off; off -= offsetof(struct bpf_sk_lookup, local_ip6[0]); off += bpf_target_off(struct in6_addr, s6_addr32[0], 4, target_size); *insn++ = BPF_LDX_MEM(BPF_SIZEOF(void *), si->dst_reg, si->src_reg, offsetof(struct bpf_sk_lookup_kern, v6.daddr)); *insn++ = BPF_JMP_IMM(BPF_JEQ, si->dst_reg, 0, 1); *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, off); #else *insn++ = BPF_MOV32_IMM(si->dst_reg, 0); #endif break; } case offsetof(struct bpf_sk_lookup, remote_port): *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg, 
bpf_target_off(struct bpf_sk_lookup_kern, sport, 2, target_size)); break; case offsetofend(struct bpf_sk_lookup, remote_port): *target_size = 2; *insn++ = BPF_MOV32_IMM(si->dst_reg, 0); break; case offsetof(struct bpf_sk_lookup, local_port): *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg, bpf_target_off(struct bpf_sk_lookup_kern, dport, 2, target_size)); break; case offsetof(struct bpf_sk_lookup, ingress_ifindex): *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, bpf_target_off(struct bpf_sk_lookup_kern, ingress_ifindex, 4, target_size)); break; } return insn - insn_buf; } const struct bpf_prog_ops sk_lookup_prog_ops = { .test_run = bpf_prog_test_run_sk_lookup, }; const struct bpf_verifier_ops sk_lookup_verifier_ops = { .get_func_proto = sk_lookup_func_proto, .is_valid_access = sk_lookup_is_valid_access, .convert_ctx_access = sk_lookup_convert_ctx_access, }; #endif /* CONFIG_INET */ DEFINE_BPF_DISPATCHER(xdp) void bpf_prog_change_xdp(struct bpf_prog *prev_prog, struct bpf_prog *prog) { bpf_dispatcher_change_prog(BPF_DISPATCHER_PTR(xdp), prev_prog, prog); } BTF_ID_LIST_GLOBAL(btf_sock_ids, MAX_BTF_SOCK_TYPE) #define BTF_SOCK_TYPE(name, type) BTF_ID(struct, type) BTF_SOCK_TYPE_xxx #undef BTF_SOCK_TYPE BPF_CALL_1(bpf_skc_to_tcp6_sock, struct sock *, sk) { /* tcp6_sock type is not generated in dwarf and hence btf, * trigger an explicit type generation here. */ BTF_TYPE_EMIT(struct tcp6_sock); if (sk && sk_fullsock(sk) && sk->sk_protocol == IPPROTO_TCP && sk->sk_family == AF_INET6) return (unsigned long)sk; return (unsigned long)NULL; } const struct bpf_func_proto bpf_skc_to_tcp6_sock_proto = { .func = bpf_skc_to_tcp6_sock, .gpl_only = false, .ret_type = RET_PTR_TO_BTF_ID_OR_NULL, .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON, .ret_btf_id = &btf_sock_ids[BTF_SOCK_TYPE_TCP6], }; BPF_CALL_1(bpf_skc_to_tcp_sock, struct sock *, sk) { if (sk && sk_fullsock(sk) && sk->sk_protocol == IPPROTO_TCP) return (unsigned long)sk; return (unsigned long)NULL; } const struct bpf_func_proto bpf_skc_to_tcp_sock_proto = { .func = bpf_skc_to_tcp_sock, .gpl_only = false, .ret_type = RET_PTR_TO_BTF_ID_OR_NULL, .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON, .ret_btf_id = &btf_sock_ids[BTF_SOCK_TYPE_TCP], }; BPF_CALL_1(bpf_skc_to_tcp_timewait_sock, struct sock *, sk) { /* BTF types for tcp_timewait_sock and inet_timewait_sock are not * generated if CONFIG_INET=n. Trigger an explicit generation here. 
*/ BTF_TYPE_EMIT(struct inet_timewait_sock); BTF_TYPE_EMIT(struct tcp_timewait_sock); #ifdef CONFIG_INET if (sk && sk->sk_prot == &tcp_prot && sk->sk_state == TCP_TIME_WAIT) return (unsigned long)sk; #endif #if IS_BUILTIN(CONFIG_IPV6) if (sk && sk->sk_prot == &tcpv6_prot && sk->sk_state == TCP_TIME_WAIT) return (unsigned long)sk; #endif return (unsigned long)NULL; } const struct bpf_func_proto bpf_skc_to_tcp_timewait_sock_proto = { .func = bpf_skc_to_tcp_timewait_sock, .gpl_only = false, .ret_type = RET_PTR_TO_BTF_ID_OR_NULL, .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON, .ret_btf_id = &btf_sock_ids[BTF_SOCK_TYPE_TCP_TW], }; BPF_CALL_1(bpf_skc_to_tcp_request_sock, struct sock *, sk) { #ifdef CONFIG_INET if (sk && sk->sk_prot == &tcp_prot && sk->sk_state == TCP_NEW_SYN_RECV) return (unsigned long)sk; #endif #if IS_BUILTIN(CONFIG_IPV6) if (sk && sk->sk_prot == &tcpv6_prot && sk->sk_state == TCP_NEW_SYN_RECV) return (unsigned long)sk; #endif return (unsigned long)NULL; } const struct bpf_func_proto bpf_skc_to_tcp_request_sock_proto = { .func = bpf_skc_to_tcp_request_sock, .gpl_only = false, .ret_type = RET_PTR_TO_BTF_ID_OR_NULL, .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON, .ret_btf_id = &btf_sock_ids[BTF_SOCK_TYPE_TCP_REQ], }; BPF_CALL_1(bpf_skc_to_udp6_sock, struct sock *, sk) { /* udp6_sock type is not generated in dwarf and hence btf, * trigger an explicit type generation here. */ BTF_TYPE_EMIT(struct udp6_sock); if (sk && sk_fullsock(sk) && sk->sk_protocol == IPPROTO_UDP && sk->sk_type == SOCK_DGRAM && sk->sk_family == AF_INET6) return (unsigned long)sk; return (unsigned long)NULL; } const struct bpf_func_proto bpf_skc_to_udp6_sock_proto = { .func = bpf_skc_to_udp6_sock, .gpl_only = false, .ret_type = RET_PTR_TO_BTF_ID_OR_NULL, .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON, .ret_btf_id = &btf_sock_ids[BTF_SOCK_TYPE_UDP6], }; BPF_CALL_1(bpf_skc_to_unix_sock, struct sock *, sk) { /* unix_sock type is not generated in dwarf and hence btf, * trigger an explicit type generation here. 
*/ BTF_TYPE_EMIT(struct unix_sock); if (sk && sk_fullsock(sk) && sk->sk_family == AF_UNIX) return (unsigned long)sk; return (unsigned long)NULL; } const struct bpf_func_proto bpf_skc_to_unix_sock_proto = { .func = bpf_skc_to_unix_sock, .gpl_only = false, .ret_type = RET_PTR_TO_BTF_ID_OR_NULL, .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON, .ret_btf_id = &btf_sock_ids[BTF_SOCK_TYPE_UNIX], }; BPF_CALL_1(bpf_skc_to_mptcp_sock, struct sock *, sk) { BTF_TYPE_EMIT(struct mptcp_sock); return (unsigned long)bpf_mptcp_sock_from_subflow(sk); } const struct bpf_func_proto bpf_skc_to_mptcp_sock_proto = { .func = bpf_skc_to_mptcp_sock, .gpl_only = false, .ret_type = RET_PTR_TO_BTF_ID_OR_NULL, .arg1_type = ARG_PTR_TO_SOCK_COMMON, .ret_btf_id = &btf_sock_ids[BTF_SOCK_TYPE_MPTCP], }; BPF_CALL_1(bpf_sock_from_file, struct file *, file) { return (unsigned long)sock_from_file(file); } BTF_ID_LIST(bpf_sock_from_file_btf_ids) BTF_ID(struct, socket) BTF_ID(struct, file) const struct bpf_func_proto bpf_sock_from_file_proto = { .func = bpf_sock_from_file, .gpl_only = false, .ret_type = RET_PTR_TO_BTF_ID_OR_NULL, .ret_btf_id = &bpf_sock_from_file_btf_ids[0], .arg1_type = ARG_PTR_TO_BTF_ID, .arg1_btf_id = &bpf_sock_from_file_btf_ids[1], }; static const struct bpf_func_proto * bpf_sk_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { const struct bpf_func_proto *func; switch (func_id) { case BPF_FUNC_skc_to_tcp6_sock: func = &bpf_skc_to_tcp6_sock_proto; break; case BPF_FUNC_skc_to_tcp_sock: func = &bpf_skc_to_tcp_sock_proto; break; case BPF_FUNC_skc_to_tcp_timewait_sock: func = &bpf_skc_to_tcp_timewait_sock_proto; break; case BPF_FUNC_skc_to_tcp_request_sock: func = &bpf_skc_to_tcp_request_sock_proto; break; case BPF_FUNC_skc_to_udp6_sock: func = &bpf_skc_to_udp6_sock_proto; break; case BPF_FUNC_skc_to_unix_sock: func = &bpf_skc_to_unix_sock_proto; break; case BPF_FUNC_skc_to_mptcp_sock: func = &bpf_skc_to_mptcp_sock_proto; break; case BPF_FUNC_ktime_get_coarse_ns: return &bpf_ktime_get_coarse_ns_proto; default: return bpf_base_func_proto(func_id, prog); } if (!bpf_token_capable(prog->aux->token, CAP_PERFMON)) return NULL; return func; } /** * bpf_skb_meta_pointer() - Gets a mutable pointer within the skb metadata area. * @skb: socket buffer carrying the metadata * @offset: offset into the metadata area, must be <= skb_metadata_len() */ void *bpf_skb_meta_pointer(struct sk_buff *skb, u32 offset) { return skb_metadata_end(skb) - skb_metadata_len(skb) + offset; } int __bpf_skb_meta_store_bytes(struct sk_buff *skb, u32 offset, const void *from, u32 len, u64 flags) { if (unlikely(flags)) return -EINVAL; if (unlikely(bpf_try_make_writable(skb, 0))) return -EFAULT; memmove(bpf_skb_meta_pointer(skb, offset), from, len); return 0; } __bpf_kfunc_start_defs(); __bpf_kfunc int bpf_dynptr_from_skb(struct __sk_buff *s, u64 flags, struct bpf_dynptr *ptr__uninit) { struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)ptr__uninit; struct sk_buff *skb = (struct sk_buff *)s; if (flags) { bpf_dynptr_set_null(ptr); return -EINVAL; } bpf_dynptr_init(ptr, skb, BPF_DYNPTR_TYPE_SKB, 0, skb->len); return 0; } /** * bpf_dynptr_from_skb_meta() - Initialize a dynptr to the skb metadata area. * @skb_: socket buffer carrying the metadata * @flags: future use, must be zero * @ptr__uninit: dynptr to initialize * * Set up a dynptr for access to the metadata area earlier allocated from the * XDP context with bpf_xdp_adjust_meta(). Serves as an alternative to * &__sk_buff->data_meta. 
* * Return: * * %0 - dynptr ready to use * * %-EINVAL - invalid flags, dynptr set to null */ __bpf_kfunc int bpf_dynptr_from_skb_meta(struct __sk_buff *skb_, u64 flags, struct bpf_dynptr *ptr__uninit) { struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)ptr__uninit; struct sk_buff *skb = (struct sk_buff *)skb_; if (flags) { bpf_dynptr_set_null(ptr); return -EINVAL; } bpf_dynptr_init(ptr, skb, BPF_DYNPTR_TYPE_SKB_META, 0, skb_metadata_len(skb)); return 0; } __bpf_kfunc int bpf_dynptr_from_xdp(struct xdp_md *x, u64 flags, struct bpf_dynptr *ptr__uninit) { struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)ptr__uninit; struct xdp_buff *xdp = (struct xdp_buff *)x; if (flags) { bpf_dynptr_set_null(ptr); return -EINVAL; } bpf_dynptr_init(ptr, xdp, BPF_DYNPTR_TYPE_XDP, 0, xdp_get_buff_len(xdp)); return 0; } __bpf_kfunc int bpf_sock_addr_set_sun_path(struct bpf_sock_addr_kern *sa_kern, const u8 *sun_path, u32 sun_path__sz) { struct sockaddr_un *un; if (sa_kern->sk->sk_family != AF_UNIX) return -EINVAL; /* We do not allow changing the address to unnamed or larger than the * maximum allowed address size for a unix sockaddr. */ if (sun_path__sz == 0 || sun_path__sz > UNIX_PATH_MAX) return -EINVAL; un = (struct sockaddr_un *)sa_kern->uaddr; memcpy(un->sun_path, sun_path, sun_path__sz); sa_kern->uaddrlen = offsetof(struct sockaddr_un, sun_path) + sun_path__sz; return 0; } __bpf_kfunc int bpf_sk_assign_tcp_reqsk(struct __sk_buff *s, struct sock *sk, struct bpf_tcp_req_attrs *attrs, int attrs__sz) { #if IS_ENABLED(CONFIG_SYN_COOKIES) struct sk_buff *skb = (struct sk_buff *)s; const struct request_sock_ops *ops; struct inet_request_sock *ireq; struct tcp_request_sock *treq; struct request_sock *req; struct net *net; __u16 min_mss; u32 tsoff = 0; if (attrs__sz != sizeof(*attrs) || attrs->reserved[0] || attrs->reserved[1] || attrs->reserved[2]) return -EINVAL; if (!skb_at_tc_ingress(skb)) return -EINVAL; net = dev_net(skb->dev); if (net != sock_net(sk)) return -ENETUNREACH; switch (skb->protocol) { case htons(ETH_P_IP): ops = &tcp_request_sock_ops; min_mss = 536; break; #if IS_BUILTIN(CONFIG_IPV6) case htons(ETH_P_IPV6): ops = &tcp6_request_sock_ops; min_mss = IPV6_MIN_MTU - 60; break; #endif default: return -EINVAL; } if (sk->sk_type != SOCK_STREAM || sk->sk_state != TCP_LISTEN || sk_is_mptcp(sk)) return -EINVAL; if (attrs->mss < min_mss) return -EINVAL; if (attrs->wscale_ok) { if (!READ_ONCE(net->ipv4.sysctl_tcp_window_scaling)) return -EINVAL; if (attrs->snd_wscale > TCP_MAX_WSCALE || attrs->rcv_wscale > TCP_MAX_WSCALE) return -EINVAL; } if (attrs->sack_ok && !READ_ONCE(net->ipv4.sysctl_tcp_sack)) return -EINVAL; if (attrs->tstamp_ok) { if (!READ_ONCE(net->ipv4.sysctl_tcp_timestamps)) return -EINVAL; tsoff = attrs->rcv_tsecr - tcp_ns_to_ts(attrs->usec_ts_ok, tcp_clock_ns()); } req = inet_reqsk_alloc(ops, sk, false); if (!req) return -ENOMEM; ireq = inet_rsk(req); treq = tcp_rsk(req); req->rsk_listener = sk; req->syncookie = 1; req->mss = attrs->mss; req->ts_recent = attrs->rcv_tsval; ireq->snd_wscale = attrs->snd_wscale; ireq->rcv_wscale = attrs->rcv_wscale; ireq->tstamp_ok = !!attrs->tstamp_ok; ireq->sack_ok = !!attrs->sack_ok; ireq->wscale_ok = !!attrs->wscale_ok; ireq->ecn_ok = !!attrs->ecn_ok; treq->req_usec_ts = !!attrs->usec_ts_ok; treq->ts_off = tsoff; skb_orphan(skb); skb->sk = req_to_sk(req); skb->destructor = sock_pfree; return 0; #else return -EOPNOTSUPP; #endif } __bpf_kfunc int bpf_sock_ops_enable_tx_tstamp(struct bpf_sock_ops_kern *skops, u64 flags) { struct sk_buff *skb; if 
(skops->op != BPF_SOCK_OPS_TSTAMP_SENDMSG_CB) return -EOPNOTSUPP; if (flags) return -EINVAL; skb = skops->skb; skb_shinfo(skb)->tx_flags |= SKBTX_BPF; TCP_SKB_CB(skb)->txstamp_ack |= TSTAMP_ACK_BPF; skb_shinfo(skb)->tskey = TCP_SKB_CB(skb)->seq + skb->len - 1; return 0; } /** * bpf_xdp_pull_data() - Pull in non-linear xdp data. * @x: &xdp_md associated with the XDP buffer * @len: length of data to be made directly accessible in the linear part * * Pull in data in case the XDP buffer associated with @x is non-linear and * not all @len are in the linear data area. * * Direct packet access allows reading and writing linear XDP data through * packet pointers (i.e., &xdp_md->data + offsets). The amount of data which * ends up in the linear part of the xdp_buff depends on the NIC and its * configuration. When a frag-capable XDP program wants to directly access * headers that may be in the non-linear area, call this kfunc to make sure * the data is available in the linear area. Alternatively, use dynptr or * bpf_xdp_{load,store}_bytes() to access data without pulling. * * This kfunc can also be used with bpf_xdp_adjust_head() to decapsulate * headers in the non-linear data area. * * A call to this kfunc may reduce headroom. If there is not enough tailroom * in the linear data area, metadata and data will be shifted down. * * A call to this kfunc is susceptible to change the buffer geometry. * Therefore, at load time, all checks on pointers previously done by the * verifier are invalidated and must be performed again, if the kfunc is used * in combination with direct packet access. * * Return: * * %0 - success * * %-EINVAL - invalid len */ __bpf_kfunc int bpf_xdp_pull_data(struct xdp_md *x, u32 len) { struct xdp_buff *xdp = (struct xdp_buff *)x; struct skb_shared_info *sinfo = xdp_get_shared_info_from_buff(xdp); int i, delta, shift, headroom, tailroom, n_frags_free = 0; void *data_hard_end = xdp_data_hard_end(xdp); int data_len = xdp->data_end - xdp->data; void *start; if (len <= data_len) return 0; if (unlikely(len > xdp_get_buff_len(xdp))) return -EINVAL; start = xdp_data_meta_unsupported(xdp) ? 
xdp->data : xdp->data_meta; headroom = start - xdp->data_hard_start - sizeof(struct xdp_frame); tailroom = data_hard_end - xdp->data_end; delta = len - data_len; if (unlikely(delta > tailroom + headroom)) return -EINVAL; shift = delta - tailroom; if (shift > 0) { memmove(start - shift, start, xdp->data_end - start); xdp->data_meta -= shift; xdp->data -= shift; xdp->data_end -= shift; } for (i = 0; i < sinfo->nr_frags && delta; i++) { skb_frag_t *frag = &sinfo->frags[i]; u32 shrink = min_t(u32, delta, skb_frag_size(frag)); memcpy(xdp->data_end, skb_frag_address(frag), shrink); xdp->data_end += shrink; sinfo->xdp_frags_size -= shrink; delta -= shrink; if (bpf_xdp_shrink_data(xdp, frag, shrink, false)) n_frags_free++; } if (unlikely(n_frags_free)) { memmove(sinfo->frags, sinfo->frags + n_frags_free, (sinfo->nr_frags - n_frags_free) * sizeof(skb_frag_t)); sinfo->nr_frags -= n_frags_free; if (!sinfo->nr_frags) { xdp_buff_clear_frags_flag(xdp); xdp_buff_clear_frag_pfmemalloc(xdp); } } return 0; } __bpf_kfunc_end_defs(); int bpf_dynptr_from_skb_rdonly(struct __sk_buff *skb, u64 flags, struct bpf_dynptr *ptr__uninit) { struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)ptr__uninit; int err; err = bpf_dynptr_from_skb(skb, flags, ptr__uninit); if (err) return err; bpf_dynptr_set_rdonly(ptr); return 0; } BTF_KFUNCS_START(bpf_kfunc_check_set_skb) BTF_ID_FLAGS(func, bpf_dynptr_from_skb, KF_TRUSTED_ARGS) BTF_KFUNCS_END(bpf_kfunc_check_set_skb) BTF_KFUNCS_START(bpf_kfunc_check_set_skb_meta) BTF_ID_FLAGS(func, bpf_dynptr_from_skb_meta, KF_TRUSTED_ARGS) BTF_KFUNCS_END(bpf_kfunc_check_set_skb_meta) BTF_KFUNCS_START(bpf_kfunc_check_set_xdp) BTF_ID_FLAGS(func, bpf_dynptr_from_xdp) BTF_ID_FLAGS(func, bpf_xdp_pull_data) BTF_KFUNCS_END(bpf_kfunc_check_set_xdp) BTF_KFUNCS_START(bpf_kfunc_check_set_sock_addr) BTF_ID_FLAGS(func, bpf_sock_addr_set_sun_path) BTF_KFUNCS_END(bpf_kfunc_check_set_sock_addr) BTF_KFUNCS_START(bpf_kfunc_check_set_tcp_reqsk) BTF_ID_FLAGS(func, bpf_sk_assign_tcp_reqsk, KF_TRUSTED_ARGS) BTF_KFUNCS_END(bpf_kfunc_check_set_tcp_reqsk) BTF_KFUNCS_START(bpf_kfunc_check_set_sock_ops) BTF_ID_FLAGS(func, bpf_sock_ops_enable_tx_tstamp, KF_TRUSTED_ARGS) BTF_KFUNCS_END(bpf_kfunc_check_set_sock_ops) static const struct btf_kfunc_id_set bpf_kfunc_set_skb = { .owner = THIS_MODULE, .set = &bpf_kfunc_check_set_skb, }; static const struct btf_kfunc_id_set bpf_kfunc_set_skb_meta = { .owner = THIS_MODULE, .set = &bpf_kfunc_check_set_skb_meta, }; static const struct btf_kfunc_id_set bpf_kfunc_set_xdp = { .owner = THIS_MODULE, .set = &bpf_kfunc_check_set_xdp, }; static const struct btf_kfunc_id_set bpf_kfunc_set_sock_addr = { .owner = THIS_MODULE, .set = &bpf_kfunc_check_set_sock_addr, }; static const struct btf_kfunc_id_set bpf_kfunc_set_tcp_reqsk = { .owner = THIS_MODULE, .set = &bpf_kfunc_check_set_tcp_reqsk, }; static const struct btf_kfunc_id_set bpf_kfunc_set_sock_ops = { .owner = THIS_MODULE, .set = &bpf_kfunc_check_set_sock_ops, }; static int __init bpf_kfunc_init(void) { int ret; ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_CLS, &bpf_kfunc_set_skb); ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_ACT, &bpf_kfunc_set_skb); ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SK_SKB, &bpf_kfunc_set_skb); ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SOCKET_FILTER, &bpf_kfunc_set_skb); ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_CGROUP_SKB, &bpf_kfunc_set_skb); ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_LWT_OUT, &bpf_kfunc_set_skb); ret = ret ?: 
register_btf_kfunc_id_set(BPF_PROG_TYPE_LWT_IN, &bpf_kfunc_set_skb); ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_LWT_XMIT, &bpf_kfunc_set_skb); ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_LWT_SEG6LOCAL, &bpf_kfunc_set_skb); ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_NETFILTER, &bpf_kfunc_set_skb); ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, &bpf_kfunc_set_skb); ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_CLS, &bpf_kfunc_set_skb_meta); ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_ACT, &bpf_kfunc_set_skb_meta); ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_XDP, &bpf_kfunc_set_xdp); ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_CGROUP_SOCK_ADDR, &bpf_kfunc_set_sock_addr); ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_CLS, &bpf_kfunc_set_tcp_reqsk); return ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SOCK_OPS, &bpf_kfunc_set_sock_ops); } late_initcall(bpf_kfunc_init); __bpf_kfunc_start_defs(); /* bpf_sock_destroy: Destroy the given socket with ECONNABORTED error code. * * The function expects a non-NULL pointer to a socket, and invokes the * protocol specific socket destroy handlers. * * The helper can only be called from BPF contexts that have acquired the socket * locks. * * Parameters: * @sock: Pointer to socket to be destroyed * * Return: * On error, may return EPROTONOSUPPORT, EINVAL. * EPROTONOSUPPORT if protocol specific destroy handler is not supported. * 0 otherwise */ __bpf_kfunc int bpf_sock_destroy(struct sock_common *sock) { struct sock *sk = (struct sock *)sock; /* The locking semantics that allow for synchronous execution of the * destroy handlers are only supported for TCP and UDP. * Supporting protocols will need to acquire sock lock in the BPF context * prior to invoking this kfunc. */ if (!sk->sk_prot->diag_destroy || (sk->sk_protocol != IPPROTO_TCP && sk->sk_protocol != IPPROTO_UDP)) return -EOPNOTSUPP; return sk->sk_prot->diag_destroy(sk, ECONNABORTED); } __bpf_kfunc_end_defs(); BTF_KFUNCS_START(bpf_sk_iter_kfunc_ids) BTF_ID_FLAGS(func, bpf_sock_destroy, KF_TRUSTED_ARGS) BTF_KFUNCS_END(bpf_sk_iter_kfunc_ids) static int tracing_iter_filter(const struct bpf_prog *prog, u32 kfunc_id) { if (btf_id_set8_contains(&bpf_sk_iter_kfunc_ids, kfunc_id) && prog->expected_attach_type != BPF_TRACE_ITER) return -EACCES; return 0; } static const struct btf_kfunc_id_set bpf_sk_iter_kfunc_set = { .owner = THIS_MODULE, .set = &bpf_sk_iter_kfunc_ids, .filter = tracing_iter_filter, }; static int init_subsystem(void) { return register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, &bpf_sk_iter_kfunc_set); } late_initcall(init_subsystem);
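/*
 * Illustrative sketch (not part of filter.c): a minimal BPF_PROG_TYPE_SK_LOOKUP
 * program exercising the bpf_sk_lookup context and the sk_assign helper whose
 * kernel side (bpf_sk_lookup_assign) is implemented above. The map name, port
 * number and program name are made up for the example; SK_PASS/SK_DROP,
 * bpf_sk_assign() and bpf_sk_release() are the documented BPF-side interfaces.
 */
#include <linux/bpf.h>
#include <linux/in.h>
#include <bpf/bpf_helpers.h>

struct {
	__uint(type, BPF_MAP_TYPE_SOCKMAP);
	__uint(max_entries, 1);
	__type(key, __u32);
	__type(value, __u64);
} dest_sock SEC(".maps");

SEC("sk_lookup")
int steer_echo_port(struct bpf_sk_lookup *ctx)
{
	const __u32 zero = 0;
	struct bpf_sock *sk;
	long err;

	/* Only steer new TCP connections destined to local port 7777. */
	if (ctx->protocol != IPPROTO_TCP || ctx->local_port != 7777)
		return SK_PASS;

	sk = bpf_map_lookup_elem(&dest_sock, &zero);
	if (!sk)
		return SK_PASS;

	/* Select the stored listener; this fails e.g. if the socket is not
	 * in TCP_LISTEN, mirroring the checks in bpf_sk_lookup_assign(). */
	err = bpf_sk_assign(ctx, sk, 0);
	bpf_sk_release(sk);
	return err ? SK_DROP : SK_PASS;
}

char _license[] SEC("license") = "GPL";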
/* SPDX-License-Identifier: GPL-2.0-or-later */ /* * Scatterlist Cryptographic API. * * Copyright (c) 2002 James Morris <jmorris@intercode.com.au> * Copyright (c) 2002 David S. Miller (davem@redhat.com) * Copyright (c) 2005 Herbert Xu <herbert@gondor.apana.org.au> * * Portions derived from Cryptoapi, by Alexander Kjeldaas <astor@fast.no> * and Nettle, by Niels Möller. */ #ifndef _LINUX_CRYPTO_H #define _LINUX_CRYPTO_H #include <linux/completion.h> #include <linux/errno.h> #include <linux/refcount_types.h> #include <linux/slab.h> #include <linux/types.h> /* * Algorithm masks and types. */ #define CRYPTO_ALG_TYPE_MASK 0x0000000f #define CRYPTO_ALG_TYPE_CIPHER 0x00000001 #define CRYPTO_ALG_TYPE_AEAD 0x00000003 #define CRYPTO_ALG_TYPE_LSKCIPHER 0x00000004 #define CRYPTO_ALG_TYPE_SKCIPHER 0x00000005 #define CRYPTO_ALG_TYPE_AKCIPHER 0x00000006 #define CRYPTO_ALG_TYPE_SIG 0x00000007 #define CRYPTO_ALG_TYPE_KPP 0x00000008 #define CRYPTO_ALG_TYPE_ACOMPRESS 0x0000000a #define CRYPTO_ALG_TYPE_SCOMPRESS 0x0000000b #define CRYPTO_ALG_TYPE_RNG 0x0000000c #define CRYPTO_ALG_TYPE_HASH 0x0000000e #define CRYPTO_ALG_TYPE_SHASH 0x0000000e #define CRYPTO_ALG_TYPE_AHASH 0x0000000f #define CRYPTO_ALG_TYPE_ACOMPRESS_MASK 0x0000000e #define CRYPTO_ALG_LARVAL 0x00000010 #define CRYPTO_ALG_DEAD 0x00000020 #define CRYPTO_ALG_DYING 0x00000040 #define CRYPTO_ALG_ASYNC 0x00000080 /* * Set if the algorithm (or an algorithm which it uses) requires another * algorithm of the same type to handle corner cases.
*/ #define CRYPTO_ALG_NEED_FALLBACK 0x00000100 /* * Set if the algorithm data structure should be duplicated into * kmalloc memory before registration. This is useful for hardware * that can be disconnected at will. Do not use this if the data * structure is embedded into a bigger one. Duplicate the overall * data structure in the driver in that case. */ #define CRYPTO_ALG_DUP_FIRST 0x00000200 /* * Set if the algorithm has passed automated run-time testing. Note that * if there is no run-time testing for a given algorithm it is considered * to have passed. */ #define CRYPTO_ALG_TESTED 0x00000400 /* * Set if the algorithm is an instance that is built from templates. */ #define CRYPTO_ALG_INSTANCE 0x00000800 /* Set this bit if the algorithm provided is hardware accelerated but * not available to userspace via instruction set or so. */ #define CRYPTO_ALG_KERN_DRIVER_ONLY 0x00001000 /* * Mark a cipher as a service implementation only usable by another * cipher and never by a normal user of the kernel crypto API */ #define CRYPTO_ALG_INTERNAL 0x00002000 /* * Set if the algorithm has a ->setkey() method but can be used without * calling it first, i.e. there is a default key. */ #define CRYPTO_ALG_OPTIONAL_KEY 0x00004000 /* * Don't trigger module loading */ #define CRYPTO_NOLOAD 0x00008000 /* * The algorithm may allocate memory during request processing, i.e. during * encryption, decryption, or hashing. Users can request an algorithm with this * flag unset if they can't handle memory allocation failures. * * This flag is currently only implemented for algorithms of type "skcipher", * "aead", "ahash", "shash", and "cipher". Algorithms of other types might not * have this flag set even if they allocate memory. * * In some edge cases, algorithms can allocate memory regardless of this flag. * To avoid these cases, users must obey the following usage constraints: * skcipher: * - The IV buffer and all scatterlist elements must be aligned to the * algorithm's alignmask. * - If the data were to be divided into chunks of size * crypto_skcipher_walksize() (with any remainder going at the end), no * chunk can cross a page boundary or a scatterlist element boundary. * aead: * - The IV buffer and all scatterlist elements must be aligned to the * algorithm's alignmask. * - The first scatterlist element must contain all the associated data, * and its pages must be !PageHighMem. * - If the plaintext/ciphertext were to be divided into chunks of size * crypto_aead_walksize() (with the remainder going at the end), no chunk * can cross a page boundary or a scatterlist element boundary. * ahash: * - crypto_ahash_finup() must not be used unless the algorithm implements * ->finup() natively. */ #define CRYPTO_ALG_ALLOCATES_MEMORY 0x00010000 /* * Mark an algorithm as a service implementation only usable by a * template and never by a normal user of the kernel crypto API. * This is intended to be used by algorithms that are themselves * not FIPS-approved but may instead be used to implement parts of * a FIPS-approved algorithm (e.g., dh vs. ffdhe2048(dh)). */ #define CRYPTO_ALG_FIPS_INTERNAL 0x00020000 /* Set if the algorithm supports virtual addresses. */ #define CRYPTO_ALG_REQ_VIRT 0x00040000 /* Set if the algorithm cannot have a fallback (e.g., phmac). */ #define CRYPTO_ALG_NO_FALLBACK 0x00080000 /* The high bits 0xff000000 are reserved for type-specific flags. */ /* * Transform masks and values (for crt_flags). 
*/ #define CRYPTO_TFM_NEED_KEY 0x00000001 #define CRYPTO_TFM_REQ_MASK 0x000fff00 #define CRYPTO_TFM_REQ_FORBID_WEAK_KEYS 0x00000100 #define CRYPTO_TFM_REQ_MAY_SLEEP 0x00000200 #define CRYPTO_TFM_REQ_MAY_BACKLOG 0x00000400 #define CRYPTO_TFM_REQ_ON_STACK 0x00000800 /* * Miscellaneous stuff. */ #define CRYPTO_MAX_ALG_NAME 128 /* * The macro CRYPTO_MINALIGN_ATTR (along with the void * type in the actual * declaration) is used to ensure that the crypto_tfm context structure is * aligned correctly for the given architecture so that there are no alignment * faults for C data types. On architectures that support non-cache coherent * DMA, such as ARM or arm64, it also takes into account the minimal alignment * that is required to ensure that the context struct member does not share any * cachelines with the rest of the struct. This is needed to ensure that cache * maintenance for non-coherent DMA (cache invalidation in particular) does not * affect data that may be accessed by the CPU concurrently. */ #define CRYPTO_MINALIGN ARCH_KMALLOC_MINALIGN #define CRYPTO_MINALIGN_ATTR __attribute__ ((__aligned__(CRYPTO_MINALIGN))) struct crypto_tfm; struct crypto_type; struct module; typedef void (*crypto_completion_t)(void *req, int err); /** * DOC: Block Cipher Context Data Structures * * These data structures define the operating context for each block cipher * type. */ struct crypto_async_request { struct list_head list; crypto_completion_t complete; void *data; struct crypto_tfm *tfm; u32 flags; }; /** * DOC: Block Cipher Algorithm Definitions * * These data structures define modular crypto algorithm implementations, * managed via crypto_register_alg() and crypto_unregister_alg(). */ /** * struct cipher_alg - single-block symmetric ciphers definition * @cia_min_keysize: Minimum key size supported by the transformation. This is * the smallest key length supported by this transformation * algorithm. This must be set to one of the pre-defined * values as this is not hardware specific. Possible values * for this field can be found via git grep "_MIN_KEY_SIZE" * include/crypto/ * @cia_max_keysize: Maximum key size supported by the transformation. This is * the largest key length supported by this transformation * algorithm. This must be set to one of the pre-defined values * as this is not hardware specific. Possible values for this * field can be found via git grep "_MAX_KEY_SIZE" * include/crypto/ * @cia_setkey: Set key for the transformation. This function is used to either * program a supplied key into the hardware or store the key in the * transformation context for programming it later. Note that this * function does modify the transformation context. This function * can be called multiple times during the existence of the * transformation object, so one must make sure the key is properly * reprogrammed into the hardware. This function is also * responsible for checking the key length for validity. * @cia_encrypt: Encrypt a single block. This function is used to encrypt a * single block of data, which must be @cra_blocksize big. This * always operates on a full @cra_blocksize and it is not possible * to encrypt a block of smaller size. The supplied buffers must * therefore also be at least of @cra_blocksize size. Both the * input and output buffers are always aligned to @cra_alignmask. * In case either of the input or output buffer supplied by user * of the crypto API is not aligned to @cra_alignmask, the crypto * API will re-align the buffers. 
The re-alignment means that a * new buffer will be allocated, the data will be copied into the * new buffer, then the processing will happen on the new buffer, * then the data will be copied back into the original buffer and * finally the new buffer will be freed. In case a software * fallback was put in place in the @cra_init call, this function * might need to use the fallback if the algorithm doesn't support * all of the key sizes. In case the key was stored in * transformation context, the key might need to be re-programmed * into the hardware in this function. This function shall not * modify the transformation context, as this function may be * called in parallel with the same transformation object. * @cia_decrypt: Decrypt a single block. This is a reverse counterpart to * @cia_encrypt, and the conditions are exactly the same. * * All fields are mandatory and must be filled. */ struct cipher_alg { unsigned int cia_min_keysize; unsigned int cia_max_keysize; int (*cia_setkey)(struct crypto_tfm *tfm, const u8 *key, unsigned int keylen); void (*cia_encrypt)(struct crypto_tfm *tfm, u8 *dst, const u8 *src); void (*cia_decrypt)(struct crypto_tfm *tfm, u8 *dst, const u8 *src); }; #define cra_cipher cra_u.cipher /** * struct crypto_alg - definition of a cryptograpic cipher algorithm * @cra_flags: Flags describing this transformation. See include/linux/crypto.h * CRYPTO_ALG_* flags for the flags which go in here. Those are * used for fine-tuning the description of the transformation * algorithm. * @cra_blocksize: Minimum block size of this transformation. The size in bytes * of the smallest possible unit which can be transformed with * this algorithm. The users must respect this value. * In case of HASH transformation, it is possible for a smaller * block than @cra_blocksize to be passed to the crypto API for * transformation, in case of any other transformation type, an * error will be returned upon any attempt to transform smaller * than @cra_blocksize chunks. * @cra_ctxsize: Size of the operational context of the transformation. This * value informs the kernel crypto API about the memory size * needed to be allocated for the transformation context. * @cra_alignmask: For cipher, skcipher, lskcipher, and aead algorithms this is * 1 less than the alignment, in bytes, that the algorithm * implementation requires for input and output buffers. When * the crypto API is invoked with buffers that are not aligned * to this alignment, the crypto API automatically utilizes * appropriately aligned temporary buffers to comply with what * the algorithm needs. (For scatterlists this happens only if * the algorithm uses the skcipher_walk helper functions.) This * misalignment handling carries a performance penalty, so it is * preferred that algorithms do not set a nonzero alignmask. * Also, crypto API users may wish to allocate buffers aligned * to the alignmask of the algorithm being used, in order to * avoid the API having to realign them. Note: the alignmask is * not supported for hash algorithms and is always 0 for them. * @cra_reqsize: Size of the request context for this algorithm. * @cra_priority: Priority of this transformation implementation. In case * multiple transformations with same @cra_name are available to * the Crypto API, the kernel will use the one with highest * @cra_priority. * @cra_name: Generic name (usable by multiple implementations) of the * transformation algorithm. This is the name of the transformation * itself. 
This field is used by the kernel when looking up the * providers of particular transformation. * @cra_driver_name: Unique name of the transformation provider. This is the * name of the provider of the transformation. This can be any * arbitrary value, but in the usual case, this contains the * name of the chip or provider and the name of the * transformation algorithm. * @cra_type: Type of the cryptographic transformation. This is a pointer to * struct crypto_type, which implements callbacks common for all * transformation types. There are multiple options, such as * &crypto_skcipher_type, &crypto_ahash_type, &crypto_rng_type. * This field might be empty. In that case, there are no common * callbacks. This is the case for: cipher. * @cra_u: Callbacks implementing the transformation. This is a union of * multiple structures. Depending on the type of transformation selected * by @cra_type and @cra_flags above, the associated structure must be * filled with callbacks. This field might be empty. This is the case * for ahash, shash. * @cra_init: Deprecated, do not use. * @cra_exit: Deprecated, do not use. * @cra_u.cipher: Union member which contains a single-block symmetric cipher * definition. See @struct @cipher_alg. * @cra_module: Owner of this transformation implementation. Set to THIS_MODULE * @cra_list: internally used * @cra_users: internally used * @cra_refcnt: internally used * @cra_destroy: internally used * * The struct crypto_alg describes a generic Crypto API algorithm and is common * for all of the transformations. Any variable not documented here shall not * be used by a cipher implementation as it is internal to the Crypto API. */ struct crypto_alg { struct list_head cra_list; struct list_head cra_users; u32 cra_flags; unsigned int cra_blocksize; unsigned int cra_ctxsize; unsigned int cra_alignmask; unsigned int cra_reqsize; int cra_priority; refcount_t cra_refcnt; char cra_name[CRYPTO_MAX_ALG_NAME]; char cra_driver_name[CRYPTO_MAX_ALG_NAME]; const struct crypto_type *cra_type; union { struct cipher_alg cipher; } cra_u; int (*cra_init)(struct crypto_tfm *tfm); void (*cra_exit)(struct crypto_tfm *tfm); void (*cra_destroy)(struct crypto_alg *alg); struct module *cra_module; } CRYPTO_MINALIGN_ATTR; /* * A helper struct for waiting for completion of async crypto ops */ struct crypto_wait { struct completion completion; int err; }; /* * Macro for declaring a crypto op async wait object on stack */ #define DECLARE_CRYPTO_WAIT(_wait) \ struct crypto_wait _wait = { \ COMPLETION_INITIALIZER_ONSTACK((_wait).completion), 0 } /* * Async ops completion helper functioons */ void crypto_req_done(void *req, int err); static inline int crypto_wait_req(int err, struct crypto_wait *wait) { switch (err) { case -EINPROGRESS: case -EBUSY: wait_for_completion(&wait->completion); reinit_completion(&wait->completion); err = wait->err; break; } return err; } static inline void crypto_init_wait(struct crypto_wait *wait) { init_completion(&wait->completion); } /* * Algorithm query interface. */ int crypto_has_alg(const char *name, u32 type, u32 mask); /* * Transforms: user-instantiated objects which encapsulate algorithms * and core processing logic. Managed via crypto_alloc_*() and * crypto_free_*(), as well as the various helpers below. */ struct crypto_tfm { refcount_t refcnt; u32 crt_flags; int node; struct crypto_tfm *fb; void (*exit)(struct crypto_tfm *tfm); struct crypto_alg *__crt_alg; void *__crt_ctx[] CRYPTO_MINALIGN_ATTR; }; /* * Transform user interface. 
*/ struct crypto_tfm *crypto_alloc_base(const char *alg_name, u32 type, u32 mask); void crypto_destroy_tfm(void *mem, struct crypto_tfm *tfm); static inline void crypto_free_tfm(struct crypto_tfm *tfm) { return crypto_destroy_tfm(tfm, tfm); } /* * Transform helpers which query the underlying algorithm. */ static inline const char *crypto_tfm_alg_name(struct crypto_tfm *tfm) { return tfm->__crt_alg->cra_name; } static inline const char *crypto_tfm_alg_driver_name(struct crypto_tfm *tfm) { return tfm->__crt_alg->cra_driver_name; } static inline unsigned int crypto_tfm_alg_blocksize(struct crypto_tfm *tfm) { return tfm->__crt_alg->cra_blocksize; } static inline unsigned int crypto_tfm_alg_alignmask(struct crypto_tfm *tfm) { return tfm->__crt_alg->cra_alignmask; } static inline unsigned int crypto_tfm_alg_reqsize(struct crypto_tfm *tfm) { return tfm->__crt_alg->cra_reqsize; } static inline u32 crypto_tfm_get_flags(struct crypto_tfm *tfm) { return tfm->crt_flags; } static inline void crypto_tfm_set_flags(struct crypto_tfm *tfm, u32 flags) { tfm->crt_flags |= flags; } static inline void crypto_tfm_clear_flags(struct crypto_tfm *tfm, u32 flags) { tfm->crt_flags &= ~flags; } static inline unsigned int crypto_tfm_ctx_alignment(void) { struct crypto_tfm *tfm; return __alignof__(tfm->__crt_ctx); } static inline bool crypto_tfm_is_async(struct crypto_tfm *tfm) { return tfm->__crt_alg->cra_flags & CRYPTO_ALG_ASYNC; } static inline bool crypto_req_on_stack(struct crypto_async_request *req) { return req->flags & CRYPTO_TFM_REQ_ON_STACK; } static inline void crypto_request_set_callback( struct crypto_async_request *req, u32 flags, crypto_completion_t compl, void *data) { u32 keep = CRYPTO_TFM_REQ_ON_STACK; req->complete = compl; req->data = data; req->flags &= keep; req->flags |= flags & ~keep; } static inline void crypto_request_set_tfm(struct crypto_async_request *req, struct crypto_tfm *tfm) { req->tfm = tfm; req->flags &= ~CRYPTO_TFM_REQ_ON_STACK; } struct crypto_async_request *crypto_request_clone( struct crypto_async_request *req, size_t total, gfp_t gfp); static inline void crypto_stack_request_init(struct crypto_async_request *req, struct crypto_tfm *tfm) { req->flags = 0; crypto_request_set_tfm(req, tfm); req->flags |= CRYPTO_TFM_REQ_ON_STACK; } #endif /* _LINUX_CRYPTO_H */
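/*
 * Usage sketch (not part of the header above): how a caller might drive an
 * asynchronous transform with the crypto_wait helpers declared here
 * (DECLARE_CRYPTO_WAIT, crypto_req_done, crypto_wait_req). The skcipher API
 * (crypto_alloc_skcipher() and friends) comes from <crypto/skcipher.h>;
 * "cbc(aes)", the key and the buffer below are illustrative placeholders.
 */
#include <crypto/skcipher.h>
#include <linux/scatterlist.h>

static int example_encrypt_buffer(const u8 *key, unsigned int keylen,
				  u8 *buf, unsigned int len, u8 *iv)
{
	DECLARE_CRYPTO_WAIT(wait);		/* on-stack completion + error */
	struct crypto_skcipher *tfm;
	struct skcipher_request *req;
	struct scatterlist sg;
	int err;

	tfm = crypto_alloc_skcipher("cbc(aes)", 0, 0);
	if (IS_ERR(tfm))
		return PTR_ERR(tfm);

	err = crypto_skcipher_setkey(tfm, key, keylen);
	if (err)
		goto out_free_tfm;

	req = skcipher_request_alloc(tfm, GFP_KERNEL);
	if (!req) {
		err = -ENOMEM;
		goto out_free_tfm;
	}

	/* crypto_req_done() completes the wait from the async callback. */
	skcipher_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG |
				      CRYPTO_TFM_REQ_MAY_SLEEP,
				      crypto_req_done, &wait);
	sg_init_one(&sg, buf, len);
	skcipher_request_set_crypt(req, &sg, &sg, len, iv);

	/* Turns -EINPROGRESS/-EBUSY into a sleep until the request completes. */
	err = crypto_wait_req(crypto_skcipher_encrypt(req), &wait);

	skcipher_request_free(req);
out_free_tfm:
	crypto_free_skcipher(tfm);
	return err;
}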
/* SPDX-License-Identifier: GPL-2.0-only */ #ifndef __PSP_PSP_H #define __PSP_PSP_H #include <linux/list.h> #include <linux/lockdep.h> #include <linux/mutex.h> #include <net/netns/generic.h> #include <net/psp.h> #include <net/sock.h> extern struct xarray psp_devs; extern struct mutex psp_devs_lock; void psp_dev_free(struct psp_dev *psd); int psp_dev_check_access(struct psp_dev *psd, struct net *net); void psp_nl_notify_dev(struct psp_dev *psd, u32 cmd); struct psp_assoc *psp_assoc_create(struct psp_dev *psd); struct psp_dev *psp_dev_get_for_sock(struct sock *sk); void psp_dev_tx_key_del(struct psp_dev *psd, struct psp_assoc *pas); int psp_sock_assoc_set_rx(struct sock *sk, struct psp_assoc *pas, struct psp_key_parsed *key, struct netlink_ext_ack *extack); int psp_sock_assoc_set_tx(struct sock *sk, struct psp_dev *psd, u32 version, struct psp_key_parsed *key, struct netlink_ext_ack *extack); void psp_assocs_key_rotated(struct psp_dev *psd); static inline void psp_dev_get(struct psp_dev *psd) { refcount_inc(&psd->refcnt); } static inline bool psp_dev_tryget(struct psp_dev *psd) { return refcount_inc_not_zero(&psd->refcnt); } static inline void psp_dev_put(struct psp_dev *psd) { if (refcount_dec_and_test(&psd->refcnt)) psp_dev_free(psd); } static inline bool psp_dev_is_registered(struct psp_dev *psd) { lockdep_assert_held(&psd->lock); return !!psd->ops; } #endif /* __PSP_PSP_H */
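/*
 * Usage sketch (not part of the header above): the reference-counting
 * pattern the psp_dev_* helpers declared here imply. How the caller
 * obtained @psd (e.g. psp_dev_get_for_sock() or a walk of psp_devs under
 * psp_devs_lock) is left out; this function is a hypothetical example.
 */
static void example_use_psp_dev(struct psp_dev *psd)
{
	/* Take a reference only if the device has not already gone away. */
	if (!psp_dev_tryget(psd))
		return;

	/* ... use psd here ... */

	/* Drop the reference; the final put frees the device. */
	psp_dev_put(psd);
}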
// SPDX-License-Identifier: GPL-2.0 /* Copyright (c) 2017 - 2018 Covalent IO, Inc. 
http://covalent.io */ #include <linux/skmsg.h> #include <linux/filter.h> #include <linux/bpf.h> #include <linux/init.h> #include <linux/wait.h> #include <linux/util_macros.h> #include <net/inet_common.h> #include <net/tls.h> void tcp_eat_skb(struct sock *sk, struct sk_buff *skb) { struct tcp_sock *tcp; int copied; if (!skb || !skb->len || !sk_is_tcp(sk)) return; if (skb_bpf_strparser(skb)) return; tcp = tcp_sk(sk); copied = tcp->copied_seq + skb->len; WRITE_ONCE(tcp->copied_seq, copied); tcp_rcv_space_adjust(sk); __tcp_cleanup_rbuf(sk, skb->len); } static int bpf_tcp_ingress(struct sock *sk, struct sk_psock *psock, struct sk_msg *msg, u32 apply_bytes) { bool apply = apply_bytes; struct scatterlist *sge; u32 size, copied = 0; struct sk_msg *tmp; int i, ret = 0; tmp = kzalloc(sizeof(*tmp), __GFP_NOWARN | GFP_KERNEL); if (unlikely(!tmp)) return -ENOMEM; lock_sock(sk); tmp->sg.start = msg->sg.start; i = msg->sg.start; do { sge = sk_msg_elem(msg, i); size = (apply && apply_bytes < sge->length) ? apply_bytes : sge->length; if (!__sk_rmem_schedule(sk, size, false)) { if (!copied) ret = -ENOMEM; break; } sk_mem_charge(sk, size); atomic_add(size, &sk->sk_rmem_alloc); sk_msg_xfer(tmp, msg, i, size); copied += size; if (sge->length) get_page(sk_msg_page(tmp, i)); sk_msg_iter_var_next(i); tmp->sg.end = i; if (apply) { apply_bytes -= size; if (!apply_bytes) { if (sge->length) sk_msg_iter_var_prev(i); break; } } } while (i != msg->sg.end); if (!ret) { msg->sg.start = i; if (!sk_psock_queue_msg(psock, tmp)) atomic_sub(copied, &sk->sk_rmem_alloc); sk_psock_data_ready(sk, psock); } else { sk_msg_free(sk, tmp); kfree(tmp); } release_sock(sk); return ret; } static int tcp_bpf_push(struct sock *sk, struct sk_msg *msg, u32 apply_bytes, int flags, bool uncharge) { struct msghdr msghdr = {}; bool apply = apply_bytes; struct scatterlist *sge; struct page *page; int size, ret = 0; u32 off; while (1) { struct bio_vec bvec; bool has_tx_ulp; sge = sk_msg_elem(msg, msg->sg.start); size = (apply && apply_bytes < sge->length) ? apply_bytes : sge->length; off = sge->offset; page = sg_page(sge); tcp_rate_check_app_limited(sk); retry: msghdr.msg_flags = flags | MSG_SPLICE_PAGES; has_tx_ulp = tls_sw_has_ctx_tx(sk); if (has_tx_ulp) msghdr.msg_flags |= MSG_SENDPAGE_NOPOLICY; if (size < sge->length && msg->sg.start != msg->sg.end) msghdr.msg_flags |= MSG_MORE; bvec_set_page(&bvec, page, size, off); iov_iter_bvec(&msghdr.msg_iter, ITER_SOURCE, &bvec, 1, size); ret = tcp_sendmsg_locked(sk, &msghdr, size); if (ret <= 0) return ret; if (apply) apply_bytes -= ret; msg->sg.size -= ret; sge->offset += ret; sge->length -= ret; if (uncharge) sk_mem_uncharge(sk, ret); if (ret != size) { size -= ret; off += ret; goto retry; } if (!sge->length) { put_page(page); sk_msg_iter_next(msg, start); sg_init_table(sge, 1); if (msg->sg.start == msg->sg.end) break; } if (apply && !apply_bytes) break; } return 0; } static int tcp_bpf_push_locked(struct sock *sk, struct sk_msg *msg, u32 apply_bytes, int flags, bool uncharge) { int ret; lock_sock(sk); ret = tcp_bpf_push(sk, msg, apply_bytes, flags, uncharge); release_sock(sk); return ret; } int tcp_bpf_sendmsg_redir(struct sock *sk, bool ingress, struct sk_msg *msg, u32 bytes, int flags) { struct sk_psock *psock = sk_psock_get(sk); int ret; if (unlikely(!psock)) return -EPIPE; ret = ingress ? 
bpf_tcp_ingress(sk, psock, msg, bytes) : tcp_bpf_push_locked(sk, msg, bytes, flags, false); sk_psock_put(sk, psock); return ret; } EXPORT_SYMBOL_GPL(tcp_bpf_sendmsg_redir); #ifdef CONFIG_BPF_SYSCALL static int tcp_msg_wait_data(struct sock *sk, struct sk_psock *psock, long timeo) { DEFINE_WAIT_FUNC(wait, woken_wake_function); int ret = 0; if (sk->sk_shutdown & RCV_SHUTDOWN) return 1; if (!timeo) return ret; add_wait_queue(sk_sleep(sk), &wait); sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk); ret = sk_wait_event(sk, &timeo, !list_empty(&psock->ingress_msg) || !skb_queue_empty_lockless(&sk->sk_receive_queue), &wait); sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk); remove_wait_queue(sk_sleep(sk), &wait); return ret; } static bool is_next_msg_fin(struct sk_psock *psock) { struct scatterlist *sge; struct sk_msg *msg_rx; int i; msg_rx = sk_psock_peek_msg(psock); i = msg_rx->sg.start; sge = sk_msg_elem(msg_rx, i); if (!sge->length) { struct sk_buff *skb = msg_rx->skb; if (skb && TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) return true; } return false; } static int tcp_bpf_recvmsg_parser(struct sock *sk, struct msghdr *msg, size_t len, int flags, int *addr_len) { int peek = flags & MSG_PEEK; struct sk_psock *psock; struct tcp_sock *tcp; int copied = 0; u32 seq; if (unlikely(flags & MSG_ERRQUEUE)) return inet_recv_error(sk, msg, len, addr_len); if (!len) return 0; psock = sk_psock_get(sk); if (unlikely(!psock)) return tcp_recvmsg(sk, msg, len, flags, addr_len); lock_sock(sk); tcp = tcp_sk(sk); seq = tcp->copied_seq; /* We may have received data on the sk_receive_queue pre-accept and * then we can not use read_skb in this context because we haven't * assigned a sk_socket yet so have no link to the ops. The work-around * is to check the sk_receive_queue and in these cases read skbs off * queue again. The read_skb hook is not running at this point because * of lock_sock so we avoid having multiple runners in read_skb. */ if (unlikely(!skb_queue_empty(&sk->sk_receive_queue))) { tcp_data_ready(sk); /* This handles the ENOMEM errors if we both receive data * pre accept and are already under memory pressure. At least * let user know to retry. */ if (unlikely(!skb_queue_empty(&sk->sk_receive_queue))) { copied = -EAGAIN; goto out; } } msg_bytes_ready: copied = sk_msg_recvmsg(sk, psock, msg, len, flags); /* The typical case for EFAULT is the socket was gracefully * shutdown with a FIN pkt. So check here the other case is * some error on copy_page_to_iter which would be unexpected. * On fin return correct return code to zero. 
*/ if (copied == -EFAULT) { bool is_fin = is_next_msg_fin(psock); if (is_fin) { copied = 0; seq++; goto out; } } seq += copied; if (!copied) { long timeo; int data; if (sock_flag(sk, SOCK_DONE)) goto out; if (sk->sk_err) { copied = sock_error(sk); goto out; } if (sk->sk_shutdown & RCV_SHUTDOWN) goto out; if (sk->sk_state == TCP_CLOSE) { copied = -ENOTCONN; goto out; } timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT); if (!timeo) { copied = -EAGAIN; goto out; } if (signal_pending(current)) { copied = sock_intr_errno(timeo); goto out; } data = tcp_msg_wait_data(sk, psock, timeo); if (data < 0) { copied = data; goto unlock; } if (data && !sk_psock_queue_empty(psock)) goto msg_bytes_ready; copied = -EAGAIN; } out: if (!peek) WRITE_ONCE(tcp->copied_seq, seq); tcp_rcv_space_adjust(sk); if (copied > 0) __tcp_cleanup_rbuf(sk, copied); unlock: release_sock(sk); sk_psock_put(sk, psock); return copied; } static int tcp_bpf_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags, int *addr_len) { struct sk_psock *psock; int copied, ret; if (unlikely(flags & MSG_ERRQUEUE)) return inet_recv_error(sk, msg, len, addr_len); if (!len) return 0; psock = sk_psock_get(sk); if (unlikely(!psock)) return tcp_recvmsg(sk, msg, len, flags, addr_len); if (!skb_queue_empty(&sk->sk_receive_queue) && sk_psock_queue_empty(psock)) { sk_psock_put(sk, psock); return tcp_recvmsg(sk, msg, len, flags, addr_len); } lock_sock(sk); msg_bytes_ready: copied = sk_msg_recvmsg(sk, psock, msg, len, flags); if (!copied) { long timeo; int data; timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT); data = tcp_msg_wait_data(sk, psock, timeo); if (data < 0) { ret = data; goto unlock; } if (data) { if (!sk_psock_queue_empty(psock)) goto msg_bytes_ready; release_sock(sk); sk_psock_put(sk, psock); return tcp_recvmsg(sk, msg, len, flags, addr_len); } copied = -EAGAIN; } ret = copied; unlock: release_sock(sk); sk_psock_put(sk, psock); return ret; } static int tcp_bpf_send_verdict(struct sock *sk, struct sk_psock *psock, struct sk_msg *msg, int *copied, int flags) { bool cork = false, enospc = sk_msg_full(msg), redir_ingress; struct sock *sk_redir; u32 tosend, origsize, sent, delta = 0; u32 eval; int ret; more_data: if (psock->eval == __SK_NONE) { /* Track delta in msg size to add/subtract it on SK_DROP from * returned to user copied size. This ensures user doesn't * get a positive return code with msg_cut_data and SK_DROP * verdict. */ delta = msg->sg.size; psock->eval = sk_psock_msg_verdict(sk, psock, msg); delta -= msg->sg.size; } if (msg->cork_bytes && msg->cork_bytes > msg->sg.size && !enospc) { psock->cork_bytes = msg->cork_bytes - msg->sg.size; if (!psock->cork) { psock->cork = kzalloc(sizeof(*psock->cork), GFP_ATOMIC | __GFP_NOWARN); if (!psock->cork) { sk_msg_free(sk, msg); *copied = 0; return -ENOMEM; } } memcpy(psock->cork, msg, sizeof(*msg)); return 0; } tosend = msg->sg.size; if (psock->apply_bytes && psock->apply_bytes < tosend) tosend = psock->apply_bytes; eval = __SK_NONE; switch (psock->eval) { case __SK_PASS: ret = tcp_bpf_push(sk, msg, tosend, flags, true); if (unlikely(ret)) { *copied -= sk_msg_free(sk, msg); break; } sk_msg_apply_bytes(psock, tosend); break; case __SK_REDIRECT: redir_ingress = psock->redir_ingress; sk_redir = psock->sk_redir; sk_msg_apply_bytes(psock, tosend); if (!psock->apply_bytes) { /* Clean up before releasing the sock lock. 
*/ eval = psock->eval; psock->eval = __SK_NONE; psock->sk_redir = NULL; } if (psock->cork) { cork = true; psock->cork = NULL; } release_sock(sk); origsize = msg->sg.size; ret = tcp_bpf_sendmsg_redir(sk_redir, redir_ingress, msg, tosend, flags); sent = origsize - msg->sg.size; if (eval == __SK_REDIRECT) sock_put(sk_redir); lock_sock(sk); sk_mem_uncharge(sk, sent); if (unlikely(ret < 0)) { int free = sk_msg_free(sk, msg); if (!cork) *copied -= free; } if (cork) { sk_msg_free(sk, msg); kfree(msg); msg = NULL; ret = 0; } break; case __SK_DROP: default: sk_msg_free(sk, msg); sk_msg_apply_bytes(psock, tosend); *copied -= (tosend + delta); return -EACCES; } if (likely(!ret)) { if (!psock->apply_bytes) { psock->eval = __SK_NONE; if (psock->sk_redir) { sock_put(psock->sk_redir); psock->sk_redir = NULL; } } if (msg && msg->sg.data[msg->sg.start].page_link && msg->sg.data[msg->sg.start].length) goto more_data; } return ret; } static int tcp_bpf_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) { struct sk_msg tmp, *msg_tx = NULL; int copied = 0, err = 0, ret = 0; struct sk_psock *psock; long timeo; int flags; /* Don't let internal flags through */ flags = (msg->msg_flags & ~MSG_SENDPAGE_DECRYPTED); flags |= MSG_NO_SHARED_FRAGS; psock = sk_psock_get(sk); if (unlikely(!psock)) return tcp_sendmsg(sk, msg, size); lock_sock(sk); timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT); while (msg_data_left(msg)) { bool enospc = false; u32 copy, osize; if (sk->sk_err) { err = -sk->sk_err; goto out_err; } copy = msg_data_left(msg); if (!sk_stream_memory_free(sk)) goto wait_for_sndbuf; if (psock->cork) { msg_tx = psock->cork; } else { msg_tx = &tmp; sk_msg_init(msg_tx); } osize = msg_tx->sg.size; err = sk_msg_alloc(sk, msg_tx, msg_tx->sg.size + copy, msg_tx->sg.end - 1); if (err) { if (err != -ENOSPC) goto wait_for_memory; enospc = true; copy = msg_tx->sg.size - osize; } ret = sk_msg_memcopy_from_iter(sk, &msg->msg_iter, msg_tx, copy); if (ret < 0) { sk_msg_trim(sk, msg_tx, osize); goto out_err; } copied += ret; if (psock->cork_bytes) { if (size > psock->cork_bytes) psock->cork_bytes = 0; else psock->cork_bytes -= size; if (psock->cork_bytes && !enospc) goto out_err; /* All cork bytes are accounted, rerun the prog. */ psock->eval = __SK_NONE; psock->cork_bytes = 0; } err = tcp_bpf_send_verdict(sk, psock, msg_tx, &copied, flags); if (unlikely(err < 0)) goto out_err; continue; wait_for_sndbuf: set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); wait_for_memory: err = sk_stream_wait_memory(sk, &timeo); if (err) { if (msg_tx && msg_tx != psock->cork) sk_msg_free(sk, msg_tx); goto out_err; } } out_err: if (err < 0) err = sk_stream_error(sk, msg->msg_flags, err); release_sock(sk); sk_psock_put(sk, psock); return copied > 0 ? 
copied : err; } enum { TCP_BPF_IPV4, TCP_BPF_IPV6, TCP_BPF_NUM_PROTS, }; enum { TCP_BPF_BASE, TCP_BPF_TX, TCP_BPF_RX, TCP_BPF_TXRX, TCP_BPF_NUM_CFGS, }; static struct proto *tcpv6_prot_saved __read_mostly; static DEFINE_SPINLOCK(tcpv6_prot_lock); static struct proto tcp_bpf_prots[TCP_BPF_NUM_PROTS][TCP_BPF_NUM_CFGS]; static void tcp_bpf_rebuild_protos(struct proto prot[TCP_BPF_NUM_CFGS], struct proto *base) { prot[TCP_BPF_BASE] = *base; prot[TCP_BPF_BASE].destroy = sock_map_destroy; prot[TCP_BPF_BASE].close = sock_map_close; prot[TCP_BPF_BASE].recvmsg = tcp_bpf_recvmsg; prot[TCP_BPF_BASE].sock_is_readable = sk_msg_is_readable; prot[TCP_BPF_TX] = prot[TCP_BPF_BASE]; prot[TCP_BPF_TX].sendmsg = tcp_bpf_sendmsg; prot[TCP_BPF_RX] = prot[TCP_BPF_BASE]; prot[TCP_BPF_RX].recvmsg = tcp_bpf_recvmsg_parser; prot[TCP_BPF_TXRX] = prot[TCP_BPF_TX]; prot[TCP_BPF_TXRX].recvmsg = tcp_bpf_recvmsg_parser; } static void tcp_bpf_check_v6_needs_rebuild(struct proto *ops) { if (unlikely(ops != smp_load_acquire(&tcpv6_prot_saved))) { spin_lock_bh(&tcpv6_prot_lock); if (likely(ops != tcpv6_prot_saved)) { tcp_bpf_rebuild_protos(tcp_bpf_prots[TCP_BPF_IPV6], ops); smp_store_release(&tcpv6_prot_saved, ops); } spin_unlock_bh(&tcpv6_prot_lock); } } static int __init tcp_bpf_v4_build_proto(void) { tcp_bpf_rebuild_protos(tcp_bpf_prots[TCP_BPF_IPV4], &tcp_prot); return 0; } late_initcall(tcp_bpf_v4_build_proto); static int tcp_bpf_assert_proto_ops(struct proto *ops) { /* In order to avoid retpoline, we make assumptions when we call * into ops if e.g. a psock is not present. Make sure they are * indeed valid assumptions. */ return ops->recvmsg == tcp_recvmsg && ops->sendmsg == tcp_sendmsg ? 0 : -ENOTSUPP; } #if IS_ENABLED(CONFIG_BPF_STREAM_PARSER) int tcp_bpf_strp_read_sock(struct strparser *strp, read_descriptor_t *desc, sk_read_actor_t recv_actor) { struct sock *sk = strp->sk; struct sk_psock *psock; struct tcp_sock *tp; int copied = 0; tp = tcp_sk(sk); rcu_read_lock(); psock = sk_psock(sk); if (WARN_ON_ONCE(!psock)) { desc->error = -EINVAL; goto out; } psock->ingress_bytes = 0; copied = tcp_read_sock_noack(sk, desc, recv_actor, true, &psock->copied_seq); if (copied < 0) goto out; /* recv_actor may redirect skb to another socket (SK_REDIRECT) or * just put skb into ingress queue of current socket (SK_PASS). * For SK_REDIRECT, we need to ack the frame immediately but for * SK_PASS, we want to delay the ack until tcp_bpf_recvmsg_parser(). */ tp->copied_seq = psock->copied_seq - psock->ingress_bytes; tcp_rcv_space_adjust(sk); __tcp_cleanup_rbuf(sk, copied - psock->ingress_bytes); out: rcu_read_unlock(); return copied; } #endif /* CONFIG_BPF_STREAM_PARSER */ int tcp_bpf_update_proto(struct sock *sk, struct sk_psock *psock, bool restore) { int family = sk->sk_family == AF_INET6 ? TCP_BPF_IPV6 : TCP_BPF_IPV4; int config = psock->progs.msg_parser ? TCP_BPF_TX : TCP_BPF_BASE; if (psock->progs.stream_verdict || psock->progs.skb_verdict) { config = (config == TCP_BPF_TX) ? TCP_BPF_TXRX : TCP_BPF_RX; } if (restore) { if (inet_csk_has_ulp(sk)) { /* TLS does not have an unhash proto in SW cases, * but we need to ensure we stop using the sock_map * unhash routine because the associated psock is being * removed. So use the original unhash handler. 
*/ WRITE_ONCE(sk->sk_prot->unhash, psock->saved_unhash); tcp_update_ulp(sk, psock->sk_proto, psock->saved_write_space); } else { sk->sk_write_space = psock->saved_write_space; /* Pairs with lockless read in sk_clone_lock() */ sock_replace_proto(sk, psock->sk_proto); } return 0; } if (sk->sk_family == AF_INET6) { if (tcp_bpf_assert_proto_ops(psock->sk_proto)) return -EINVAL; tcp_bpf_check_v6_needs_rebuild(psock->sk_proto); } /* Pairs with lockless read in sk_clone_lock() */ sock_replace_proto(sk, &tcp_bpf_prots[family][config]); return 0; } EXPORT_SYMBOL_GPL(tcp_bpf_update_proto); /* If a child got cloned from a listening socket that had tcp_bpf * protocol callbacks installed, we need to restore the callbacks to * the default ones because the child does not inherit the psock state * that tcp_bpf callbacks expect. */ void tcp_bpf_clone(const struct sock *sk, struct sock *newsk) { struct proto *prot = newsk->sk_prot; if (is_insidevar(prot, tcp_bpf_prots)) newsk->sk_prot = sk->sk_prot_creator; } #endif /* CONFIG_BPF_SYSCALL */
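/*
 * Illustration only (it mirrors the selection logic in
 * tcp_bpf_update_proto() above and is not a separate kernel API): which
 * slot of tcp_bpf_prots[family][config] a socket ends up using, depending
 * on the BPF programs attached to its psock.
 */
static int example_tcp_bpf_config(bool has_msg_parser, bool has_verdict)
{
	int config = has_msg_parser ? TCP_BPF_TX : TCP_BPF_BASE;

	/* A stream/skb verdict program upgrades the receive path too. */
	if (has_verdict)
		config = (config == TCP_BPF_TX) ? TCP_BPF_TXRX : TCP_BPF_RX;

	return config;	/* index into tcp_bpf_prots[family][config] */
}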
// SPDX-License-Identifier: GPL-2.0 /* * NETLINK Netlink attributes * * Authors: Thomas Graf <tgraf@suug.ch> * Alexey Kuznetsov <kuznet@ms2.inr.ac.ru> */ #include <linux/export.h> #include <linux/kernel.h> #include <linux/errno.h> #include <linux/jiffies.h> #include <linux/nospec.h> #include <linux/skbuff.h> #include <linux/string.h> #include <linux/types.h> #include <net/netlink.h> /* For these data types, attribute length should be exactly the given * size. However, to maintain compatibility with broken commands, if the * attribute length does not match the expected size a warning is emitted * to the user that the command is sending invalid data and needs to be fixed. */ static const u8 nla_attr_len[NLA_TYPE_MAX+1] = { [NLA_U8] = sizeof(u8), [NLA_U16] = sizeof(u16), [NLA_U32] = sizeof(u32), [NLA_U64] = sizeof(u64), [NLA_S8] = sizeof(s8), [NLA_S16] = sizeof(s16), [NLA_S32] = sizeof(s32), [NLA_S64] = sizeof(s64), [NLA_BE16] = sizeof(__be16), [NLA_BE32] = sizeof(__be32), }; static const u8 nla_attr_minlen[NLA_TYPE_MAX+1] = { [NLA_U8] = sizeof(u8), [NLA_U16] = sizeof(u16), [NLA_U32] = sizeof(u32), [NLA_U64] = sizeof(u64), [NLA_MSECS] = sizeof(u64), [NLA_NESTED] = NLA_HDRLEN, [NLA_S8] = sizeof(s8), [NLA_S16] = sizeof(s16), [NLA_S32] = sizeof(s32), [NLA_S64] = sizeof(s64), [NLA_BE16] = sizeof(__be16), [NLA_BE32] = sizeof(__be32), }; /* * Nested policies might refer back to the original * policy in some cases, and userspace could try to * abuse that and recurse by nesting in the right * ways. Limit recursion to avoid this problem. 
*/ #define MAX_POLICY_RECURSION_DEPTH 10 static int __nla_validate_parse(const struct nlattr *head, int len, int maxtype, const struct nla_policy *policy, unsigned int validate, struct netlink_ext_ack *extack, struct nlattr **tb, unsigned int depth); static int validate_nla_bitfield32(const struct nlattr *nla, const u32 valid_flags_mask) { const struct nla_bitfield32 *bf = nla_data(nla); if (!valid_flags_mask) return -EINVAL; /*disallow invalid bit selector */ if (bf->selector & ~valid_flags_mask) return -EINVAL; /*disallow invalid bit values */ if (bf->value & ~valid_flags_mask) return -EINVAL; /*disallow valid bit values that are not selected*/ if (bf->value & ~bf->selector) return -EINVAL; return 0; } static int nla_validate_array(const struct nlattr *head, int len, int maxtype, const struct nla_policy *policy, struct netlink_ext_ack *extack, unsigned int validate, unsigned int depth) { const struct nlattr *entry; int rem; nla_for_each_attr(entry, head, len, rem) { int ret; if (nla_len(entry) == 0) continue; if (nla_len(entry) < NLA_HDRLEN) { NL_SET_ERR_MSG_ATTR_POL(extack, entry, policy, "Array element too short"); return -ERANGE; } ret = __nla_validate_parse(nla_data(entry), nla_len(entry), maxtype, policy, validate, extack, NULL, depth + 1); if (ret < 0) return ret; } return 0; } void nla_get_range_unsigned(const struct nla_policy *pt, struct netlink_range_validation *range) { WARN_ON_ONCE(pt->validation_type != NLA_VALIDATE_RANGE_PTR && (pt->min < 0 || pt->max < 0)); range->min = 0; switch (pt->type) { case NLA_U8: range->max = U8_MAX; break; case NLA_U16: case NLA_BE16: case NLA_BINARY: range->max = U16_MAX; break; case NLA_U32: case NLA_BE32: range->max = U32_MAX; break; case NLA_U64: case NLA_UINT: case NLA_MSECS: range->max = U64_MAX; break; default: WARN_ON_ONCE(1); return; } switch (pt->validation_type) { case NLA_VALIDATE_RANGE: case NLA_VALIDATE_RANGE_WARN_TOO_LONG: range->min = pt->min; range->max = pt->max; break; case NLA_VALIDATE_RANGE_PTR: *range = *pt->range; break; case NLA_VALIDATE_MIN: range->min = pt->min; break; case NLA_VALIDATE_MAX: range->max = pt->max; break; default: break; } } static int nla_validate_range_unsigned(const struct nla_policy *pt, const struct nlattr *nla, struct netlink_ext_ack *extack, unsigned int validate) { struct netlink_range_validation range; u64 value; switch (pt->type) { case NLA_U8: value = nla_get_u8(nla); break; case NLA_U16: value = nla_get_u16(nla); break; case NLA_U32: value = nla_get_u32(nla); break; case NLA_U64: value = nla_get_u64(nla); break; case NLA_UINT: value = nla_get_uint(nla); break; case NLA_MSECS: value = nla_get_u64(nla); break; case NLA_BINARY: value = nla_len(nla); break; case NLA_BE16: value = ntohs(nla_get_be16(nla)); break; case NLA_BE32: value = ntohl(nla_get_be32(nla)); break; default: return -EINVAL; } nla_get_range_unsigned(pt, &range); if (pt->validation_type == NLA_VALIDATE_RANGE_WARN_TOO_LONG && pt->type == NLA_BINARY && value > range.max) { pr_warn_ratelimited("netlink: '%s': attribute type %d has an invalid length.\n", current->comm, pt->type); if (validate & NL_VALIDATE_STRICT_ATTRS) { NL_SET_ERR_MSG_ATTR_POL(extack, nla, pt, "invalid attribute length"); return -EINVAL; } /* this assumes min <= max (don't validate against min) */ return 0; } if (value < range.min || value > range.max) { bool binary = pt->type == NLA_BINARY; if (binary) NL_SET_ERR_MSG_ATTR_POL(extack, nla, pt, "binary attribute size out of range"); else NL_SET_ERR_MSG_ATTR_POL(extack, nla, pt, "integer out of range"); return -ERANGE; 
} return 0; } void nla_get_range_signed(const struct nla_policy *pt, struct netlink_range_validation_signed *range) { switch (pt->type) { case NLA_S8: range->min = S8_MIN; range->max = S8_MAX; break; case NLA_S16: range->min = S16_MIN; range->max = S16_MAX; break; case NLA_S32: range->min = S32_MIN; range->max = S32_MAX; break; case NLA_S64: case NLA_SINT: range->min = S64_MIN; range->max = S64_MAX; break; default: WARN_ON_ONCE(1); return; } switch (pt->validation_type) { case NLA_VALIDATE_RANGE: range->min = pt->min; range->max = pt->max; break; case NLA_VALIDATE_RANGE_PTR: *range = *pt->range_signed; break; case NLA_VALIDATE_MIN: range->min = pt->min; break; case NLA_VALIDATE_MAX: range->max = pt->max; break; default: break; } } static int nla_validate_int_range_signed(const struct nla_policy *pt, const struct nlattr *nla, struct netlink_ext_ack *extack) { struct netlink_range_validation_signed range; s64 value; switch (pt->type) { case NLA_S8: value = nla_get_s8(nla); break; case NLA_S16: value = nla_get_s16(nla); break; case NLA_S32: value = nla_get_s32(nla); break; case NLA_S64: value = nla_get_s64(nla); break; case NLA_SINT: value = nla_get_sint(nla); break; default: return -EINVAL; } nla_get_range_signed(pt, &range); if (value < range.min || value > range.max) { NL_SET_ERR_MSG_ATTR_POL(extack, nla, pt, "integer out of range"); return -ERANGE; } return 0; } static int nla_validate_int_range(const struct nla_policy *pt, const struct nlattr *nla, struct netlink_ext_ack *extack, unsigned int validate) { switch (pt->type) { case NLA_U8: case NLA_U16: case NLA_U32: case NLA_U64: case NLA_UINT: case NLA_MSECS: case NLA_BINARY: case NLA_BE16: case NLA_BE32: return nla_validate_range_unsigned(pt, nla, extack, validate); case NLA_S8: case NLA_S16: case NLA_S32: case NLA_S64: case NLA_SINT: return nla_validate_int_range_signed(pt, nla, extack); default: WARN_ON(1); return -EINVAL; } } static int nla_validate_mask(const struct nla_policy *pt, const struct nlattr *nla, struct netlink_ext_ack *extack) { u64 value; switch (pt->type) { case NLA_U8: value = nla_get_u8(nla); break; case NLA_U16: value = nla_get_u16(nla); break; case NLA_U32: value = nla_get_u32(nla); break; case NLA_U64: value = nla_get_u64(nla); break; case NLA_UINT: value = nla_get_uint(nla); break; case NLA_BE16: value = ntohs(nla_get_be16(nla)); break; case NLA_BE32: value = ntohl(nla_get_be32(nla)); break; default: return -EINVAL; } if (value & ~(u64)pt->mask) { NL_SET_ERR_MSG_ATTR(extack, nla, "reserved bit set"); return -EINVAL; } return 0; } static int validate_nla(const struct nlattr *nla, int maxtype, const struct nla_policy *policy, unsigned int validate, struct netlink_ext_ack *extack, unsigned int depth) { u16 strict_start_type = policy[0].strict_start_type; const struct nla_policy *pt; int minlen = 0, attrlen = nla_len(nla), type = nla_type(nla); int err = -ERANGE; if (strict_start_type && type >= strict_start_type) validate |= NL_VALIDATE_STRICT; if (type <= 0 || type > maxtype) return 0; type = array_index_nospec(type, maxtype + 1); pt = &policy[type]; BUG_ON(pt->type > NLA_TYPE_MAX); if (nla_attr_len[pt->type] && attrlen != nla_attr_len[pt->type]) { pr_warn_ratelimited("netlink: '%s': attribute type %d has an invalid length.\n", current->comm, type); if (validate & NL_VALIDATE_STRICT_ATTRS) { NL_SET_ERR_MSG_ATTR_POL(extack, nla, pt, "invalid attribute length"); return -EINVAL; } } if (validate & NL_VALIDATE_NESTED) { if ((pt->type == NLA_NESTED || pt->type == NLA_NESTED_ARRAY) && !(nla->nla_type & NLA_F_NESTED)) { 
NL_SET_ERR_MSG_ATTR_POL(extack, nla, pt, "NLA_F_NESTED is missing"); return -EINVAL; } if (pt->type != NLA_NESTED && pt->type != NLA_NESTED_ARRAY && pt->type != NLA_UNSPEC && (nla->nla_type & NLA_F_NESTED)) { NL_SET_ERR_MSG_ATTR_POL(extack, nla, pt, "NLA_F_NESTED not expected"); return -EINVAL; } } switch (pt->type) { case NLA_REJECT: if (extack && pt->reject_message) { NL_SET_BAD_ATTR(extack, nla); extack->_msg = pt->reject_message; return -EINVAL; } err = -EINVAL; goto out_err; case NLA_FLAG: if (attrlen > 0) goto out_err; break; case NLA_SINT: case NLA_UINT: if (attrlen != sizeof(u32) && attrlen != sizeof(u64)) { NL_SET_ERR_MSG_ATTR_POL(extack, nla, pt, "invalid attribute length"); return -EINVAL; } break; case NLA_BITFIELD32: if (attrlen != sizeof(struct nla_bitfield32)) goto out_err; err = validate_nla_bitfield32(nla, pt->bitfield32_valid); if (err) goto out_err; break; case NLA_NUL_STRING: if (pt->len) minlen = min_t(int, attrlen, pt->len + 1); else minlen = attrlen; if (!minlen || memchr(nla_data(nla), '\0', minlen) == NULL) { err = -EINVAL; goto out_err; } fallthrough; case NLA_STRING: if (attrlen < 1) goto out_err; if (pt->len) { char *buf = nla_data(nla); if (buf[attrlen - 1] == '\0') attrlen--; if (attrlen > pt->len) goto out_err; } break; case NLA_BINARY: if (pt->len && attrlen > pt->len) goto out_err; break; case NLA_NESTED: /* a nested attributes is allowed to be empty; if its not, * it must have a size of at least NLA_HDRLEN. */ if (attrlen == 0) break; if (attrlen < NLA_HDRLEN) goto out_err; if (pt->nested_policy) { err = __nla_validate_parse(nla_data(nla), nla_len(nla), pt->len, pt->nested_policy, validate, extack, NULL, depth + 1); if (err < 0) { /* * return directly to preserve the inner * error message/attribute pointer */ return err; } } break; case NLA_NESTED_ARRAY: /* a nested array attribute is allowed to be empty; if its not, * it must have a size of at least NLA_HDRLEN. 
*/ if (attrlen == 0) break; if (attrlen < NLA_HDRLEN) goto out_err; if (pt->nested_policy) { int err; err = nla_validate_array(nla_data(nla), nla_len(nla), pt->len, pt->nested_policy, extack, validate, depth); if (err < 0) { /* * return directly to preserve the inner * error message/attribute pointer */ return err; } } break; case NLA_UNSPEC: if (validate & NL_VALIDATE_UNSPEC) { NL_SET_ERR_MSG_ATTR(extack, nla, "Unsupported attribute"); return -EINVAL; } if (attrlen < pt->len) goto out_err; break; default: if (pt->len) minlen = pt->len; else minlen = nla_attr_minlen[pt->type]; if (attrlen < minlen) goto out_err; } /* further validation */ switch (pt->validation_type) { case NLA_VALIDATE_NONE: /* nothing to do */ break; case NLA_VALIDATE_RANGE_PTR: case NLA_VALIDATE_RANGE: case NLA_VALIDATE_RANGE_WARN_TOO_LONG: case NLA_VALIDATE_MIN: case NLA_VALIDATE_MAX: err = nla_validate_int_range(pt, nla, extack, validate); if (err) return err; break; case NLA_VALIDATE_MASK: err = nla_validate_mask(pt, nla, extack); if (err) return err; break; case NLA_VALIDATE_FUNCTION: if (pt->validate) { err = pt->validate(nla, extack); if (err) return err; } break; } return 0; out_err: NL_SET_ERR_MSG_ATTR_POL(extack, nla, pt, "Attribute failed policy validation"); return err; } static int __nla_validate_parse(const struct nlattr *head, int len, int maxtype, const struct nla_policy *policy, unsigned int validate, struct netlink_ext_ack *extack, struct nlattr **tb, unsigned int depth) { const struct nlattr *nla; int rem; if (depth >= MAX_POLICY_RECURSION_DEPTH) { NL_SET_ERR_MSG(extack, "allowed policy recursion depth exceeded"); return -EINVAL; } if (tb) memset(tb, 0, sizeof(struct nlattr *) * (maxtype + 1)); nla_for_each_attr(nla, head, len, rem) { u16 type = nla_type(nla); if (type == 0 || type > maxtype) { if (validate & NL_VALIDATE_MAXTYPE) { NL_SET_ERR_MSG_ATTR(extack, nla, "Unknown attribute type"); return -EINVAL; } continue; } type = array_index_nospec(type, maxtype + 1); if (policy) { int err = validate_nla(nla, maxtype, policy, validate, extack, depth); if (err < 0) return err; } if (tb) tb[type] = (struct nlattr *)nla; } if (unlikely(rem > 0)) { pr_warn_ratelimited("netlink: %d bytes leftover after parsing attributes in process `%s'.\n", rem, current->comm); NL_SET_ERR_MSG(extack, "bytes leftover after parsing attributes"); if (validate & NL_VALIDATE_TRAILING) return -EINVAL; } return 0; } /** * __nla_validate - Validate a stream of attributes * @head: head of attribute stream * @len: length of attribute stream * @maxtype: maximum attribute type to be expected * @policy: validation policy * @validate: validation strictness * @extack: extended ACK report struct * * Validates all attributes in the specified attribute stream against the * specified policy. Validation depends on the validate flags passed, see * &enum netlink_validation for more details on that. * See documentation of struct nla_policy for more details. * * Returns 0 on success or a negative error code. */ int __nla_validate(const struct nlattr *head, int len, int maxtype, const struct nla_policy *policy, unsigned int validate, struct netlink_ext_ack *extack) { return __nla_validate_parse(head, len, maxtype, policy, validate, extack, NULL, 0); } EXPORT_SYMBOL(__nla_validate); /** * nla_policy_len - Determine the max. length of a policy * @p: policy to use * @n: number of policies * * Determines the max. length of the policy. It is currently used * to allocated Netlink buffers roughly the size of the actual * message. 
* * Returns 0 on success or a negative error code. */ int nla_policy_len(const struct nla_policy *p, int n) { int i, len = 0; for (i = 0; i < n; i++, p++) { if (p->len) len += nla_total_size(p->len); else if (nla_attr_len[p->type]) len += nla_total_size(nla_attr_len[p->type]); else if (nla_attr_minlen[p->type]) len += nla_total_size(nla_attr_minlen[p->type]); } return len; } EXPORT_SYMBOL(nla_policy_len); /** * __nla_parse - Parse a stream of attributes into a tb buffer * @tb: destination array with maxtype+1 elements * @maxtype: maximum attribute type to be expected * @head: head of attribute stream * @len: length of attribute stream * @policy: validation policy * @validate: validation strictness * @extack: extended ACK pointer * * Parses a stream of attributes and stores a pointer to each attribute in * the tb array accessible via the attribute type. * Validation is controlled by the @validate parameter. * * Returns 0 on success or a negative error code. */ int __nla_parse(struct nlattr **tb, int maxtype, const struct nlattr *head, int len, const struct nla_policy *policy, unsigned int validate, struct netlink_ext_ack *extack) { return __nla_validate_parse(head, len, maxtype, policy, validate, extack, tb, 0); } EXPORT_SYMBOL(__nla_parse); /** * nla_find - Find a specific attribute in a stream of attributes * @head: head of attribute stream * @len: length of attribute stream * @attrtype: type of attribute to look for * * Returns the first attribute in the stream matching the specified type. */ struct nlattr *nla_find(const struct nlattr *head, int len, int attrtype) { const struct nlattr *nla; int rem; nla_for_each_attr(nla, head, len, rem) if (nla_type(nla) == attrtype) return (struct nlattr *)nla; return NULL; } EXPORT_SYMBOL(nla_find); /** * nla_strscpy - Copy string attribute payload into a sized buffer * @dst: Where to copy the string to. * @nla: Attribute to copy the string from. * @dstsize: Size of destination buffer. * * Copies at most dstsize - 1 bytes into the destination buffer. * Unlike strscpy() the destination buffer is always padded out. * * Return: * * srclen - Returns @nla length (not including the trailing %NUL). * * -E2BIG - If @dstsize is 0 or greater than U16_MAX or @nla length greater * than @dstsize. */ ssize_t nla_strscpy(char *dst, const struct nlattr *nla, size_t dstsize) { size_t srclen = nla_len(nla); char *src = nla_data(nla); ssize_t ret; size_t len; if (dstsize == 0 || WARN_ON_ONCE(dstsize > U16_MAX)) return -E2BIG; if (srclen > 0 && src[srclen - 1] == '\0') srclen--; if (srclen >= dstsize) { len = dstsize - 1; ret = -E2BIG; } else { len = srclen; ret = len; } memcpy(dst, src, len); /* Zero pad end of dst. */ memset(dst + len, 0, dstsize - len); return ret; } EXPORT_SYMBOL(nla_strscpy); /** * nla_strdup - Copy string attribute payload into a newly allocated buffer * @nla: attribute to copy the string from * @flags: the type of memory to allocate (see kmalloc). * * Returns a pointer to the allocated buffer or NULL on error. 
*/ char *nla_strdup(const struct nlattr *nla, gfp_t flags) { size_t srclen = nla_len(nla); char *src = nla_data(nla), *dst; if (srclen > 0 && src[srclen - 1] == '\0') srclen--; dst = kmalloc(srclen + 1, flags); if (dst != NULL) { memcpy(dst, src, srclen); dst[srclen] = '\0'; } return dst; } EXPORT_SYMBOL(nla_strdup); /** * nla_memcpy - Copy a netlink attribute into another memory area * @dest: where to copy to memcpy * @src: netlink attribute to copy from * @count: size of the destination area * * Note: The number of bytes copied is limited by the length of * attribute's payload. memcpy * * Returns the number of bytes copied. */ int nla_memcpy(void *dest, const struct nlattr *src, int count) { int minlen = min_t(int, count, nla_len(src)); memcpy(dest, nla_data(src), minlen); if (count > minlen) memset(dest + minlen, 0, count - minlen); return minlen; } EXPORT_SYMBOL(nla_memcpy); /** * nla_memcmp - Compare an attribute with sized memory area * @nla: netlink attribute * @data: memory area * @size: size of memory area */ int nla_memcmp(const struct nlattr *nla, const void *data, size_t size) { int d = nla_len(nla) - size; if (d == 0) d = memcmp(nla_data(nla), data, size); return d; } EXPORT_SYMBOL(nla_memcmp); /** * nla_strcmp - Compare a string attribute against a string * @nla: netlink string attribute * @str: another string */ int nla_strcmp(const struct nlattr *nla, const char *str) { int len = strlen(str); char *buf = nla_data(nla); int attrlen = nla_len(nla); int d; while (attrlen > 0 && buf[attrlen - 1] == '\0') attrlen--; d = attrlen - len; if (d == 0) d = memcmp(nla_data(nla), str, len); return d; } EXPORT_SYMBOL(nla_strcmp); #ifdef CONFIG_NET /** * __nla_reserve - reserve room for attribute on the skb * @skb: socket buffer to reserve room on * @attrtype: attribute type * @attrlen: length of attribute payload * * Adds a netlink attribute header to a socket buffer and reserves * room for the payload but does not copy it. * * The caller is responsible to ensure that the skb provides enough * tailroom for the attribute header and payload. */ struct nlattr *__nla_reserve(struct sk_buff *skb, int attrtype, int attrlen) { struct nlattr *nla; nla = skb_put(skb, nla_total_size(attrlen)); nla->nla_type = attrtype; nla->nla_len = nla_attr_size(attrlen); memset((unsigned char *) nla + nla->nla_len, 0, nla_padlen(attrlen)); return nla; } EXPORT_SYMBOL(__nla_reserve); /** * __nla_reserve_64bit - reserve room for attribute on the skb and align it * @skb: socket buffer to reserve room on * @attrtype: attribute type * @attrlen: length of attribute payload * @padattr: attribute type for the padding * * Adds a netlink attribute header to a socket buffer and reserves * room for the payload but does not copy it. It also ensure that this * attribute will have a 64-bit aligned nla_data() area. * * The caller is responsible to ensure that the skb provides enough * tailroom for the attribute header and payload. */ struct nlattr *__nla_reserve_64bit(struct sk_buff *skb, int attrtype, int attrlen, int padattr) { nla_align_64bit(skb, padattr); return __nla_reserve(skb, attrtype, attrlen); } EXPORT_SYMBOL(__nla_reserve_64bit); /** * __nla_reserve_nohdr - reserve room for attribute without header * @skb: socket buffer to reserve room on * @attrlen: length of attribute payload * * Reserves room for attribute payload without a header. * * The caller is responsible to ensure that the skb provides enough * tailroom for the payload. 
*/ void *__nla_reserve_nohdr(struct sk_buff *skb, int attrlen) { return skb_put_zero(skb, NLA_ALIGN(attrlen)); } EXPORT_SYMBOL(__nla_reserve_nohdr); /** * nla_reserve - reserve room for attribute on the skb * @skb: socket buffer to reserve room on * @attrtype: attribute type * @attrlen: length of attribute payload * * Adds a netlink attribute header to a socket buffer and reserves * room for the payload but does not copy it. * * Returns NULL if the tailroom of the skb is insufficient to store * the attribute header and payload. */ struct nlattr *nla_reserve(struct sk_buff *skb, int attrtype, int attrlen) { if (unlikely(skb_tailroom(skb) < nla_total_size(attrlen))) return NULL; return __nla_reserve(skb, attrtype, attrlen); } EXPORT_SYMBOL(nla_reserve); /** * nla_reserve_64bit - reserve room for attribute on the skb and align it * @skb: socket buffer to reserve room on * @attrtype: attribute type * @attrlen: length of attribute payload * @padattr: attribute type for the padding * * Adds a netlink attribute header to a socket buffer and reserves * room for the payload but does not copy it. It also ensure that this * attribute will have a 64-bit aligned nla_data() area. * * Returns NULL if the tailroom of the skb is insufficient to store * the attribute header and payload. */ struct nlattr *nla_reserve_64bit(struct sk_buff *skb, int attrtype, int attrlen, int padattr) { size_t len; if (nla_need_padding_for_64bit(skb)) len = nla_total_size_64bit(attrlen); else len = nla_total_size(attrlen); if (unlikely(skb_tailroom(skb) < len)) return NULL; return __nla_reserve_64bit(skb, attrtype, attrlen, padattr); } EXPORT_SYMBOL(nla_reserve_64bit); /** * nla_reserve_nohdr - reserve room for attribute without header * @skb: socket buffer to reserve room on * @attrlen: length of attribute payload * * Reserves room for attribute payload without a header. * * Returns NULL if the tailroom of the skb is insufficient to store * the attribute payload. */ void *nla_reserve_nohdr(struct sk_buff *skb, int attrlen) { if (unlikely(skb_tailroom(skb) < NLA_ALIGN(attrlen))) return NULL; return __nla_reserve_nohdr(skb, attrlen); } EXPORT_SYMBOL(nla_reserve_nohdr); /** * __nla_put - Add a netlink attribute to a socket buffer * @skb: socket buffer to add attribute to * @attrtype: attribute type * @attrlen: length of attribute payload * @data: head of attribute payload * * The caller is responsible to ensure that the skb provides enough * tailroom for the attribute header and payload. */ void __nla_put(struct sk_buff *skb, int attrtype, int attrlen, const void *data) { struct nlattr *nla; nla = __nla_reserve(skb, attrtype, attrlen); memcpy(nla_data(nla), data, attrlen); } EXPORT_SYMBOL(__nla_put); /** * __nla_put_64bit - Add a netlink attribute to a socket buffer and align it * @skb: socket buffer to add attribute to * @attrtype: attribute type * @attrlen: length of attribute payload * @data: head of attribute payload * @padattr: attribute type for the padding * * The caller is responsible to ensure that the skb provides enough * tailroom for the attribute header and payload. 
*/ void __nla_put_64bit(struct sk_buff *skb, int attrtype, int attrlen, const void *data, int padattr) { struct nlattr *nla; nla = __nla_reserve_64bit(skb, attrtype, attrlen, padattr); memcpy(nla_data(nla), data, attrlen); } EXPORT_SYMBOL(__nla_put_64bit); /** * __nla_put_nohdr - Add a netlink attribute without header * @skb: socket buffer to add attribute to * @attrlen: length of attribute payload * @data: head of attribute payload * * The caller is responsible to ensure that the skb provides enough * tailroom for the attribute payload. */ void __nla_put_nohdr(struct sk_buff *skb, int attrlen, const void *data) { void *start; start = __nla_reserve_nohdr(skb, attrlen); memcpy(start, data, attrlen); } EXPORT_SYMBOL(__nla_put_nohdr); /** * nla_put - Add a netlink attribute to a socket buffer * @skb: socket buffer to add attribute to * @attrtype: attribute type * @attrlen: length of attribute payload * @data: head of attribute payload * * Returns -EMSGSIZE if the tailroom of the skb is insufficient to store * the attribute header and payload. */ int nla_put(struct sk_buff *skb, int attrtype, int attrlen, const void *data) { if (unlikely(skb_tailroom(skb) < nla_total_size(attrlen))) return -EMSGSIZE; __nla_put(skb, attrtype, attrlen, data); return 0; } EXPORT_SYMBOL(nla_put); /** * nla_put_64bit - Add a netlink attribute to a socket buffer and align it * @skb: socket buffer to add attribute to * @attrtype: attribute type * @attrlen: length of attribute payload * @data: head of attribute payload * @padattr: attribute type for the padding * * Returns -EMSGSIZE if the tailroom of the skb is insufficient to store * the attribute header and payload. */ int nla_put_64bit(struct sk_buff *skb, int attrtype, int attrlen, const void *data, int padattr) { size_t len; if (nla_need_padding_for_64bit(skb)) len = nla_total_size_64bit(attrlen); else len = nla_total_size(attrlen); if (unlikely(skb_tailroom(skb) < len)) return -EMSGSIZE; __nla_put_64bit(skb, attrtype, attrlen, data, padattr); return 0; } EXPORT_SYMBOL(nla_put_64bit); /** * nla_put_nohdr - Add a netlink attribute without header * @skb: socket buffer to add attribute to * @attrlen: length of attribute payload * @data: head of attribute payload * * Returns -EMSGSIZE if the tailroom of the skb is insufficient to store * the attribute payload. */ int nla_put_nohdr(struct sk_buff *skb, int attrlen, const void *data) { if (unlikely(skb_tailroom(skb) < NLA_ALIGN(attrlen))) return -EMSGSIZE; __nla_put_nohdr(skb, attrlen, data); return 0; } EXPORT_SYMBOL(nla_put_nohdr); /** * nla_append - Add a netlink attribute without header or padding * @skb: socket buffer to add attribute to * @attrlen: length of attribute payload * @data: head of attribute payload * * Returns -EMSGSIZE if the tailroom of the skb is insufficient to store * the attribute payload. */ int nla_append(struct sk_buff *skb, int attrlen, const void *data) { if (unlikely(skb_tailroom(skb) < NLA_ALIGN(attrlen))) return -EMSGSIZE; skb_put_data(skb, data, attrlen); return 0; } EXPORT_SYMBOL(nla_append); #endif
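/*
 * Usage sketch (not part of the file above): a minimal policy, parse and
 * emit round-trip built on the helpers defined here. The MYDRV_ATTR_*
 * enum, mydrv_policy, mydrv_parse() and mydrv_fill() are hypothetical
 * examples, not existing kernel symbols.
 */
#include <net/netlink.h>
#include <linux/if.h>

enum {
	MYDRV_ATTR_UNSPEC,
	MYDRV_ATTR_IFINDEX,	/* NLA_U32, must be >= 1       */
	MYDRV_ATTR_NAME,	/* NUL-terminated device name  */
	__MYDRV_ATTR_MAX,
};
#define MYDRV_ATTR_MAX (__MYDRV_ATTR_MAX - 1)

static const struct nla_policy mydrv_policy[MYDRV_ATTR_MAX + 1] = {
	[MYDRV_ATTR_IFINDEX]	= NLA_POLICY_MIN(NLA_U32, 1),
	[MYDRV_ATTR_NAME]	= { .type = NLA_NUL_STRING, .len = IFNAMSIZ - 1 },
};

/* Validate and index the attributes nested inside @nest. */
static int mydrv_parse(const struct nlattr *nest, struct netlink_ext_ack *extack)
{
	struct nlattr *tb[MYDRV_ATTR_MAX + 1];
	int err;

	err = nla_parse_nested(tb, MYDRV_ATTR_MAX, nest, mydrv_policy, extack);
	if (err)
		return err;

	if (tb[MYDRV_ATTR_IFINDEX])
		pr_debug("ifindex %u\n", nla_get_u32(tb[MYDRV_ATTR_IFINDEX]));
	return 0;
}

/* Emit the same attributes into an outgoing message. */
static int mydrv_fill(struct sk_buff *skb, u32 ifindex, const char *name)
{
	if (nla_put_u32(skb, MYDRV_ATTR_IFINDEX, ifindex) ||
	    nla_put_string(skb, MYDRV_ATTR_NAME, name))
		return -EMSGSIZE;
	return 0;
}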
/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_VIRTIO_NET_H #define _LINUX_VIRTIO_NET_H #include <linux/if_vlan.h> #include <linux/ip.h> #include <linux/ipv6.h> #include <linux/udp.h> #include <uapi/linux/tcp.h> #include <uapi/linux/virtio_net.h> static inline bool virtio_net_hdr_match_proto(__be16 protocol, __u8 gso_type) { switch (gso_type & ~VIRTIO_NET_HDR_GSO_ECN) { case VIRTIO_NET_HDR_GSO_TCPV4: return protocol == cpu_to_be16(ETH_P_IP); case VIRTIO_NET_HDR_GSO_TCPV6: return protocol == cpu_to_be16(ETH_P_IPV6); case VIRTIO_NET_HDR_GSO_UDP: case VIRTIO_NET_HDR_GSO_UDP_L4: return protocol == cpu_to_be16(ETH_P_IP) || protocol == cpu_to_be16(ETH_P_IPV6); default: return false; } } static inline int virtio_net_hdr_set_proto(struct sk_buff *skb, const struct virtio_net_hdr *hdr) { if (skb->protocol) return 0; switch (hdr->gso_type & ~VIRTIO_NET_HDR_GSO_ECN) { case VIRTIO_NET_HDR_GSO_TCPV4: case VIRTIO_NET_HDR_GSO_UDP: case VIRTIO_NET_HDR_GSO_UDP_L4: skb->protocol = cpu_to_be16(ETH_P_IP); break; case VIRTIO_NET_HDR_GSO_TCPV6: skb->protocol = cpu_to_be16(ETH_P_IPV6); break; default: return -EINVAL; } return 0; } static inline int __virtio_net_hdr_to_skb(struct sk_buff *skb, const struct virtio_net_hdr *hdr, bool little_endian, u8 hdr_gso_type) { unsigned int nh_min_len = sizeof(struct iphdr); unsigned int gso_type = 0; unsigned int thlen = 0; unsigned int p_off = 0; unsigned int ip_proto; if (hdr_gso_type != VIRTIO_NET_HDR_GSO_NONE) { switch (hdr_gso_type & ~VIRTIO_NET_HDR_GSO_ECN) { case VIRTIO_NET_HDR_GSO_TCPV4: gso_type = SKB_GSO_TCPV4; ip_proto = IPPROTO_TCP; thlen = sizeof(struct tcphdr); break; case VIRTIO_NET_HDR_GSO_TCPV6: gso_type = SKB_GSO_TCPV6; ip_proto = IPPROTO_TCP; thlen = sizeof(struct tcphdr); nh_min_len = sizeof(struct ipv6hdr); break; 
case VIRTIO_NET_HDR_GSO_UDP: gso_type = SKB_GSO_UDP; ip_proto = IPPROTO_UDP; thlen = sizeof(struct udphdr); break; case VIRTIO_NET_HDR_GSO_UDP_L4: gso_type = SKB_GSO_UDP_L4; ip_proto = IPPROTO_UDP; thlen = sizeof(struct udphdr); break; default: return -EINVAL; } if (hdr_gso_type & VIRTIO_NET_HDR_GSO_ECN) gso_type |= SKB_GSO_TCP_ECN; if (hdr->gso_size == 0) return -EINVAL; } skb_reset_mac_header(skb); if (hdr->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) { u32 start = __virtio16_to_cpu(little_endian, hdr->csum_start); u32 off = __virtio16_to_cpu(little_endian, hdr->csum_offset); u32 needed = start + max_t(u32, thlen, off + sizeof(__sum16)); if (!pskb_may_pull(skb, needed)) return -EINVAL; if (!skb_partial_csum_set(skb, start, off)) return -EINVAL; if (skb_transport_offset(skb) < nh_min_len) return -EINVAL; nh_min_len = skb_transport_offset(skb); p_off = nh_min_len + thlen; if (!pskb_may_pull(skb, p_off)) return -EINVAL; } else { /* gso packets without NEEDS_CSUM do not set transport_offset. * probe and drop if does not match one of the above types. */ if (gso_type && skb->network_header) { struct flow_keys_basic keys; if (!skb->protocol) { __be16 protocol = dev_parse_header_protocol(skb); if (!protocol) virtio_net_hdr_set_proto(skb, hdr); else if (!virtio_net_hdr_match_proto(protocol, hdr_gso_type)) return -EINVAL; else skb->protocol = protocol; } retry: if (!skb_flow_dissect_flow_keys_basic(NULL, skb, &keys, NULL, 0, 0, 0, 0)) { /* UFO does not specify ipv4 or 6: try both */ if (gso_type & SKB_GSO_UDP && skb->protocol == htons(ETH_P_IP)) { skb->protocol = htons(ETH_P_IPV6); goto retry; } return -EINVAL; } p_off = keys.control.thoff + thlen; if (!pskb_may_pull(skb, p_off) || keys.basic.ip_proto != ip_proto) return -EINVAL; skb_set_transport_header(skb, keys.control.thoff); } else if (gso_type) { p_off = nh_min_len + thlen; if (!pskb_may_pull(skb, p_off)) return -EINVAL; } } if (hdr_gso_type != VIRTIO_NET_HDR_GSO_NONE) { u16 gso_size = __virtio16_to_cpu(little_endian, hdr->gso_size); unsigned int nh_off = p_off; struct skb_shared_info *shinfo = skb_shinfo(skb); switch (gso_type & ~SKB_GSO_TCP_ECN) { case SKB_GSO_UDP: /* UFO may not include transport header in gso_size. */ nh_off -= thlen; break; case SKB_GSO_UDP_L4: if (!(hdr->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM)) return -EINVAL; if (skb->csum_offset != offsetof(struct udphdr, check)) return -EINVAL; if (skb->len - p_off > gso_size * UDP_MAX_SEGMENTS) return -EINVAL; if (gso_type != SKB_GSO_UDP_L4) return -EINVAL; break; case SKB_GSO_TCPV4: case SKB_GSO_TCPV6: if (skb->ip_summed == CHECKSUM_PARTIAL && skb->csum_offset != offsetof(struct tcphdr, check)) return -EINVAL; break; } /* Kernel has a special handling for GSO_BY_FRAGS. */ if (gso_size == GSO_BY_FRAGS) return -EINVAL; /* Too small packets are not really GSO ones. */ if (skb->len - nh_off > gso_size) { shinfo->gso_size = gso_size; shinfo->gso_type = gso_type; /* Header must be checked, and gso_segs computed. */ shinfo->gso_type |= SKB_GSO_DODGY; shinfo->gso_segs = 0; } } return 0; } static inline int virtio_net_hdr_to_skb(struct sk_buff *skb, const struct virtio_net_hdr *hdr, bool little_endian) { return __virtio_net_hdr_to_skb(skb, hdr, little_endian, hdr->gso_type); } static inline int virtio_net_hdr_from_skb(const struct sk_buff *skb, struct virtio_net_hdr *hdr, bool little_endian, bool has_data_valid, int vlan_hlen) { memset(hdr, 0, sizeof(*hdr)); /* no info leak */ if (skb_is_gso(skb)) { struct skb_shared_info *sinfo = skb_shinfo(skb); /* This is a hint as to how much should be linear. 
*/ hdr->hdr_len = __cpu_to_virtio16(little_endian, skb_headlen(skb)); hdr->gso_size = __cpu_to_virtio16(little_endian, sinfo->gso_size); if (sinfo->gso_type & SKB_GSO_TCPV4) hdr->gso_type = VIRTIO_NET_HDR_GSO_TCPV4; else if (sinfo->gso_type & SKB_GSO_TCPV6) hdr->gso_type = VIRTIO_NET_HDR_GSO_TCPV6; else if (sinfo->gso_type & SKB_GSO_UDP_L4) hdr->gso_type = VIRTIO_NET_HDR_GSO_UDP_L4; else return -EINVAL; if (sinfo->gso_type & SKB_GSO_TCP_ECN) hdr->gso_type |= VIRTIO_NET_HDR_GSO_ECN; } else hdr->gso_type = VIRTIO_NET_HDR_GSO_NONE; if (skb->ip_summed == CHECKSUM_PARTIAL) { hdr->flags = VIRTIO_NET_HDR_F_NEEDS_CSUM; hdr->csum_start = __cpu_to_virtio16(little_endian, skb_checksum_start_offset(skb) + vlan_hlen); hdr->csum_offset = __cpu_to_virtio16(little_endian, skb->csum_offset); } else if (has_data_valid && skb->ip_summed == CHECKSUM_UNNECESSARY) { hdr->flags = VIRTIO_NET_HDR_F_DATA_VALID; } /* else everything is zero */ return 0; } static inline unsigned int virtio_l3min(bool is_ipv6) { return is_ipv6 ? sizeof(struct ipv6hdr) : sizeof(struct iphdr); } static inline int virtio_net_hdr_tnl_to_skb(struct sk_buff *skb, const struct virtio_net_hdr_v1_hash_tunnel *vhdr, bool tnl_hdr_negotiated, bool tnl_csum_negotiated, bool little_endian) { const struct virtio_net_hdr *hdr = (const struct virtio_net_hdr *)vhdr; unsigned int inner_nh, outer_th, inner_th; unsigned int inner_l3min, outer_l3min; u8 gso_inner_type, gso_tunnel_type; bool outer_isv6, inner_isv6; int ret; gso_tunnel_type = hdr->gso_type & VIRTIO_NET_HDR_GSO_UDP_TUNNEL; if (!gso_tunnel_type) return virtio_net_hdr_to_skb(skb, hdr, little_endian); /* Tunnel not supported/negotiated, but the hdr asks for it. */ if (!tnl_hdr_negotiated) return -EINVAL; /* Either ipv4 or ipv6. */ if (gso_tunnel_type == VIRTIO_NET_HDR_GSO_UDP_TUNNEL) return -EINVAL; /* The UDP tunnel must carry a GSO packet, but no UFO. */ gso_inner_type = hdr->gso_type & ~(VIRTIO_NET_HDR_GSO_ECN | VIRTIO_NET_HDR_GSO_UDP_TUNNEL); if (!gso_inner_type || gso_inner_type == VIRTIO_NET_HDR_GSO_UDP) return -EINVAL; /* Rely on csum being present. */ if (!(hdr->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM)) return -EINVAL; /* Validate offsets. */ outer_isv6 = gso_tunnel_type & VIRTIO_NET_HDR_GSO_UDP_TUNNEL_IPV6; inner_isv6 = gso_inner_type == VIRTIO_NET_HDR_GSO_TCPV6; inner_l3min = virtio_l3min(inner_isv6); outer_l3min = ETH_HLEN + virtio_l3min(outer_isv6); inner_th = __virtio16_to_cpu(little_endian, hdr->csum_start); inner_nh = le16_to_cpu(vhdr->inner_nh_offset); outer_th = le16_to_cpu(vhdr->outer_th_offset); if (outer_th < outer_l3min || inner_nh < outer_th + sizeof(struct udphdr) || inner_th < inner_nh + inner_l3min) return -EINVAL; /* Let the basic parsing deal with plain GSO features. */ ret = __virtio_net_hdr_to_skb(skb, hdr, true, hdr->gso_type & ~gso_tunnel_type); if (ret) return ret; /* In case of USO, the inner protocol is still unknown and * `inner_isv6` is just a guess, additional parsing is needed. * The previous validation ensures that accessing an ipv4 inner * network header is safe. */ if (gso_inner_type == VIRTIO_NET_HDR_GSO_UDP_L4) { struct iphdr *iphdr = (struct iphdr *)(skb->data + inner_nh); inner_isv6 = iphdr->version == 6; inner_l3min = virtio_l3min(inner_isv6); if (inner_th < inner_nh + inner_l3min) return -EINVAL; } skb_set_inner_protocol(skb, inner_isv6 ? 
htons(ETH_P_IPV6) : htons(ETH_P_IP)); if (hdr->flags & VIRTIO_NET_HDR_F_UDP_TUNNEL_CSUM) { if (!tnl_csum_negotiated) return -EINVAL; skb_shinfo(skb)->gso_type |= SKB_GSO_UDP_TUNNEL_CSUM; } else { skb_shinfo(skb)->gso_type |= SKB_GSO_UDP_TUNNEL; } skb->inner_transport_header = inner_th + skb_headroom(skb); skb->inner_network_header = inner_nh + skb_headroom(skb); skb->inner_mac_header = inner_nh + skb_headroom(skb); skb->transport_header = outer_th + skb_headroom(skb); skb->encapsulation = 1; return 0; } /* Checksum-related fields validation for the driver */ static inline int virtio_net_handle_csum_offload(struct sk_buff *skb, struct virtio_net_hdr *hdr, bool tnl_csum_negotiated) { if (!(hdr->gso_type & VIRTIO_NET_HDR_GSO_UDP_TUNNEL)) { if (!(hdr->flags & VIRTIO_NET_HDR_F_DATA_VALID)) return 0; skb->ip_summed = CHECKSUM_UNNECESSARY; if (!(hdr->flags & VIRTIO_NET_HDR_F_UDP_TUNNEL_CSUM)) return 0; /* tunnel csum packets are invalid when the related * feature has not been negotiated */ if (!tnl_csum_negotiated) return -EINVAL; skb->csum_level = 1; return 0; } /* DATA_VALID is mutually exclusive with NEEDS_CSUM, and GSO * over UDP tunnel requires the latter */ if (hdr->flags & VIRTIO_NET_HDR_F_DATA_VALID) return -EINVAL; return 0; } /* * vlan_hlen always refers to the outermost MAC header. That also * means it refers to the only MAC header, if the packet does not carry * any encapsulation. */ static inline int virtio_net_hdr_tnl_from_skb(const struct sk_buff *skb, struct virtio_net_hdr_v1_hash_tunnel *vhdr, bool tnl_hdr_negotiated, bool little_endian, int vlan_hlen, bool has_data_valid) { struct virtio_net_hdr *hdr = (struct virtio_net_hdr *)vhdr; unsigned int inner_nh, outer_th; int tnl_gso_type; int ret; tnl_gso_type = skb_shinfo(skb)->gso_type & (SKB_GSO_UDP_TUNNEL | SKB_GSO_UDP_TUNNEL_CSUM); if (!tnl_gso_type) return virtio_net_hdr_from_skb(skb, hdr, little_endian, has_data_valid, vlan_hlen); /* Tunnel support not negotiated but skb ask for it. */ if (!tnl_hdr_negotiated) return -EINVAL; vhdr->hash_hdr.hash_value_lo = 0; vhdr->hash_hdr.hash_value_hi = 0; vhdr->hash_hdr.hash_report = 0; vhdr->hash_hdr.padding = 0; /* Let the basic parsing deal with plain GSO features. */ skb_shinfo(skb)->gso_type &= ~tnl_gso_type; ret = virtio_net_hdr_from_skb(skb, hdr, true, false, vlan_hlen); skb_shinfo(skb)->gso_type |= tnl_gso_type; if (ret) return ret; if (skb->protocol == htons(ETH_P_IPV6)) hdr->gso_type |= VIRTIO_NET_HDR_GSO_UDP_TUNNEL_IPV6; else hdr->gso_type |= VIRTIO_NET_HDR_GSO_UDP_TUNNEL_IPV4; if (skb_shinfo(skb)->gso_type & SKB_GSO_UDP_TUNNEL_CSUM) hdr->flags |= VIRTIO_NET_HDR_F_UDP_TUNNEL_CSUM; inner_nh = skb->inner_network_header - skb_headroom(skb); outer_th = skb->transport_header - skb_headroom(skb); vhdr->inner_nh_offset = cpu_to_le16(inner_nh); vhdr->outer_th_offset = cpu_to_le16(outer_th); return 0; } #endif /* _LINUX_VIRTIO_NET_H */
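/*
 * Illustrative sketch (an assumption, not taken from the header above):
 * a device receive path applying a virtio_net_hdr it pulled off the ring
 * to a freshly built skb, using virtio_net_hdr_to_skb() as declared above.
 * my_rx_one() is a made-up name.
 */
#include <linux/netdevice.h>
#include <linux/skbuff.h>
#include <linux/virtio_net.h>

static int my_rx_one(struct sk_buff *skb, const struct virtio_net_hdr *hdr,
		     bool little_endian)
{
	/* Validates GSO/csum metadata and primes the skb offload fields. */
	if (virtio_net_hdr_to_skb(skb, hdr, little_endian)) {
		kfree_skb(skb);
		return -EINVAL;
	}

	/* skb is now safe to hand to the stack. */
	netif_rx(skb);
	return 0;
}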
624 625 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 // SPDX-License-Identifier: GPL-2.0 /* * Device physical location support * * Author: Won Chung <wonchung@google.com> */ #include <linux/acpi.h> #include <linux/sysfs.h> #include <linux/string_choices.h> #include "physical_location.h" bool dev_add_physical_location(struct device *dev) { struct acpi_pld_info *pld; if (!has_acpi_companion(dev)) return false; if (!acpi_get_physical_device_location(ACPI_HANDLE(dev), &pld)) return false; dev->physical_location = kzalloc(sizeof(*dev->physical_location), GFP_KERNEL); if (!dev->physical_location) { ACPI_FREE(pld); return false; } dev->physical_location->panel = pld->panel; dev->physical_location->vertical_position = pld->vertical_position; dev->physical_location->horizontal_position = pld->horizontal_position; dev->physical_location->dock = pld->dock; dev->physical_location->lid = pld->lid; ACPI_FREE(pld); return true; } static ssize_t panel_show(struct device *dev, struct device_attribute *attr, char *buf) { const char *panel; switch (dev->physical_location->panel) { case DEVICE_PANEL_TOP: panel = "top"; break; case DEVICE_PANEL_BOTTOM: panel = "bottom"; break; case DEVICE_PANEL_LEFT: panel = "left"; break; case DEVICE_PANEL_RIGHT: panel = "right"; break; case DEVICE_PANEL_FRONT: panel = "front"; break; case DEVICE_PANEL_BACK: panel = "back"; break; default: panel = "unknown"; } return sysfs_emit(buf, "%s\n", panel); } static DEVICE_ATTR_RO(panel); static ssize_t vertical_position_show(struct device *dev, struct device_attribute *attr, char *buf) { const char *vertical_position; switch (dev->physical_location->vertical_position) { case DEVICE_VERT_POS_UPPER: vertical_position = "upper"; break; case DEVICE_VERT_POS_CENTER: vertical_position = "center"; break; case DEVICE_VERT_POS_LOWER: vertical_position = "lower"; break; default: vertical_position = "unknown"; } return sysfs_emit(buf, "%s\n", vertical_position); } static DEVICE_ATTR_RO(vertical_position); static ssize_t horizontal_position_show(struct device *dev, struct device_attribute *attr, char *buf) { const char *horizontal_position; switch (dev->physical_location->horizontal_position) { case DEVICE_HORI_POS_LEFT: horizontal_position = "left"; break; case DEVICE_HORI_POS_CENTER: horizontal_position = "center"; break; case DEVICE_HORI_POS_RIGHT: horizontal_position = "right"; break; default: horizontal_position = "unknown"; } return sysfs_emit(buf, "%s\n", horizontal_position); } static DEVICE_ATTR_RO(horizontal_position); static ssize_t dock_show(struct device *dev, struct device_attribute *attr, char *buf) { return sysfs_emit(buf, "%s\n", str_yes_no(dev->physical_location->dock)); } static DEVICE_ATTR_RO(dock); static ssize_t lid_show(struct device *dev, struct device_attribute *attr, char *buf) { return sysfs_emit(buf, "%s\n", str_yes_no(dev->physical_location->lid)); } static DEVICE_ATTR_RO(lid); static struct attribute *dev_attr_physical_location[] = { &dev_attr_panel.attr, &dev_attr_vertical_position.attr, &dev_attr_horizontal_position.attr, &dev_attr_dock.attr, &dev_attr_lid.attr, NULL, }; const struct 
attribute_group dev_attr_physical_location_group = { .name = "physical_location", .attrs = dev_attr_physical_location, };
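/*
 * Illustrative sketch (an assumption): in-kernel code consulting the
 * location data that dev_add_physical_location() populates above; the
 * same information is exported to userspace under
 * <device>/physical_location/.  my_dev_is_front_panel() is a made-up name.
 */
#include <linux/device.h>

static bool my_dev_is_front_panel(struct device *dev)
{
	return dev->physical_location &&
	       dev->physical_location->panel == DEVICE_PANEL_FRONT;
}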
33 34 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 // SPDX-License-Identifier: GPL-2.0-only /* * linux/net/sunrpc/sunrpc_syms.c * * Symbols exported by the sunrpc module. * * Copyright (C) 1997 Olaf Kirch <okir@monad.swb.de> */ #include <linux/module.h> #include <linux/types.h> #include <linux/uio.h> #include <linux/unistd.h> #include <linux/init.h> #include <linux/sunrpc/sched.h> #include <linux/sunrpc/clnt.h> #include <linux/sunrpc/svc.h> #include <linux/sunrpc/svcsock.h> #include <linux/sunrpc/auth.h> #include <linux/workqueue.h> #include <linux/sunrpc/rpc_pipe_fs.h> #include <linux/sunrpc/xprtsock.h> #include "sunrpc.h" #include "sysfs.h" #include "netns.h" unsigned int sunrpc_net_id; EXPORT_SYMBOL_GPL(sunrpc_net_id); static __net_init int sunrpc_init_net(struct net *net) { int err; struct sunrpc_net *sn = net_generic(net, sunrpc_net_id); err = rpc_proc_init(net); if (err) goto err_proc; err = ip_map_cache_create(net); if (err) goto err_ipmap; err = unix_gid_cache_create(net); if (err) goto err_unixgid; err = rpc_pipefs_init_net(net); if (err) goto err_pipefs; INIT_LIST_HEAD(&sn->all_clients); spin_lock_init(&sn->rpc_client_lock); spin_lock_init(&sn->rpcb_clnt_lock); return 0; err_pipefs: unix_gid_cache_destroy(net); err_unixgid: ip_map_cache_destroy(net); err_ipmap: rpc_proc_exit(net); err_proc: return err; } static __net_exit void sunrpc_exit_net(struct net *net) { struct sunrpc_net *sn = net_generic(net, sunrpc_net_id); rpc_pipefs_exit_net(net); unix_gid_cache_destroy(net); ip_map_cache_destroy(net); rpc_proc_exit(net); WARN_ON_ONCE(!list_empty(&sn->all_clients)); } static struct pernet_operations sunrpc_net_ops = { .init = sunrpc_init_net, .exit = sunrpc_exit_net, .id = &sunrpc_net_id, .size = sizeof(struct sunrpc_net), }; static int __init init_sunrpc(void) { int err = rpc_init_mempool(); if (err) goto out; err = rpcauth_init_module(); if (err) goto out2; cache_initialize(); err = register_pernet_subsys(&sunrpc_net_ops); if (err) goto out3; err = register_rpc_pipefs(); if (err) goto out4; err = rpc_sysfs_init(); if (err) goto out5; sunrpc_debugfs_init(); #if IS_ENABLED(CONFIG_SUNRPC_DEBUG) rpc_register_sysctl(); #endif svc_init_xprt_sock(); /* svc sock transport */ init_socket_xprt(); /* clnt sock transport */ return 0; out5: unregister_rpc_pipefs(); out4: unregister_pernet_subsys(&sunrpc_net_ops); out3: rpcauth_remove_module(); out2: rpc_destroy_mempool(); out: return err; } static void __exit cleanup_sunrpc(void) { rpc_sysfs_exit(); rpc_cleanup_clids(); xprt_cleanup_ids(); xprt_multipath_cleanup_ids(); rpcauth_remove_module(); cleanup_socket_xprt(); svc_cleanup_xprt_sock(); sunrpc_debugfs_exit(); unregister_rpc_pipefs(); rpc_destroy_mempool(); unregister_pernet_subsys(&sunrpc_net_ops); auth_domain_cleanup(); #if IS_ENABLED(CONFIG_SUNRPC_DEBUG) rpc_unregister_sysctl(); #endif rcu_barrier(); /* Wait for completion of call_rcu()'s */ } MODULE_DESCRIPTION("Sun RPC core"); MODULE_LICENSE("GPL"); fs_initcall(init_sunrpc); /* Ensure we're initialised before nfs */ module_exit(cleanup_sunrpc);
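/*
 * Illustrative sketch (an assumption): the per-network-namespace pattern
 * used by sunrpc_init_net()/sunrpc_exit_net() above, reduced to a minimal
 * subsystem.  All "my_*" names are hypothetical; registration would be
 * done with register_pernet_subsys(&my_net_ops) from module init, as above.
 */
#include <linux/list.h>
#include <linux/spinlock.h>
#include <net/net_namespace.h>
#include <net/netns/generic.h>

static unsigned int my_net_id;

struct my_net {
	struct list_head items;
	spinlock_t lock;
};

static __net_init int my_init_net(struct net *net)
{
	struct my_net *mn = net_generic(net, my_net_id);

	INIT_LIST_HEAD(&mn->items);
	spin_lock_init(&mn->lock);
	return 0;
}

static __net_exit void my_exit_net(struct net *net)
{
	struct my_net *mn = net_generic(net, my_net_id);

	WARN_ON_ONCE(!list_empty(&mn->items));
}

static struct pernet_operations my_net_ops = {
	.init = my_init_net,
	.exit = my_exit_net,
	.id   = &my_net_id,
	.size = sizeof(struct my_net),
};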
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __NET_TC_CSUM_H
#define __NET_TC_CSUM_H

#include <linux/types.h>
#include <net/act_api.h>
#include <linux/tc_act/tc_csum.h>

struct tcf_csum_params {
	u32 update_flags;
	int action;
	struct rcu_head rcu;
};

struct tcf_csum {
	struct tc_action common;

	struct tcf_csum_params __rcu *params;
};
#define to_tcf_csum(a) ((struct tcf_csum *)a)

static inline u32 tcf_csum_update_flags(const struct tc_action *a)
{
	u32 update_flags;

	rcu_read_lock();
	update_flags = rcu_dereference(to_tcf_csum(a)->params)->update_flags;
	rcu_read_unlock();

	return update_flags;
}

#endif /* __NET_TC_CSUM_H */
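/*
 * Illustrative sketch (an assumption): a driver translating the csum
 * action into its own offload descriptor by sampling the RCU-protected
 * flags via tcf_csum_update_flags() above.  struct my_rule and
 * my_translate_csum() are hypothetical.
 */
#include <linux/tc_act/tc_csum.h>
#include <net/tc_act/tc_csum.h>

struct my_rule {
	bool fix_ip_csum;
	bool fix_l4_csum;
};

static void my_translate_csum(const struct tc_action *act,
			      struct my_rule *rule)
{
	u32 flags = tcf_csum_update_flags(act);

	if (flags & TCA_CSUM_UPDATE_FLAG_IPV4HDR)
		rule->fix_ip_csum = true;
	if (flags & (TCA_CSUM_UPDATE_FLAG_TCP | TCA_CSUM_UPDATE_FLAG_UDP))
		rule->fix_l4_csum = true;
}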
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * SHA-256 optimized for x86_64
 *
 * Copyright 2025 Google LLC
 */
#include <asm/fpu/api.h>
#include <linux/static_call.h>

static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_sha_ni);

DEFINE_STATIC_CALL(sha256_blocks_x86, sha256_blocks_generic);

#define DEFINE_X86_SHA256_FN(c_fn, asm_fn)				   \
	asmlinkage void asm_fn(struct sha256_block_state *state,	   \
			       const u8 *data, size_t nblocks);		   \
	static void c_fn(struct sha256_block_state *state, const u8 *data, \
			 size_t nblocks)				   \
	{								   \
		if (likely(irq_fpu_usable())) {				   \
			kernel_fpu_begin();				   \
			asm_fn(state, data, nblocks);			   \
			kernel_fpu_end();				   \
		} else {						   \
			sha256_blocks_generic(state, data, nblocks);	   \
		}							   \
	}

DEFINE_X86_SHA256_FN(sha256_blocks_ssse3, sha256_transform_ssse3);
DEFINE_X86_SHA256_FN(sha256_blocks_avx, sha256_transform_avx);
DEFINE_X86_SHA256_FN(sha256_blocks_avx2, sha256_transform_rorx);
DEFINE_X86_SHA256_FN(sha256_blocks_ni, sha256_ni_transform);

static void sha256_blocks(struct sha256_block_state *state,
			  const u8 *data, size_t nblocks)
{
	static_call(sha256_blocks_x86)(state, data, nblocks);
}

static_assert(offsetof(struct __sha256_ctx, state) == 0);
static_assert(offsetof(struct __sha256_ctx, bytecount) == 32);
static_assert(offsetof(struct __sha256_ctx, buf) == 40);
asmlinkage void sha256_ni_finup2x(const struct __sha256_ctx *ctx,
				  const u8 *data1, const u8 *data2, int len,
				  u8 out1[SHA256_DIGEST_SIZE],
				  u8 out2[SHA256_DIGEST_SIZE]);

#define sha256_finup_2x_arch sha256_finup_2x_arch
static bool sha256_finup_2x_arch(const struct __sha256_ctx *ctx,
				 const u8 *data1, const u8 *data2, size_t len,
				 u8 out1[SHA256_DIGEST_SIZE],
				 u8 out2[SHA256_DIGEST_SIZE])
{
	/*
	 * The assembly requires len >= SHA256_BLOCK_SIZE && len <= INT_MAX.
	 * Further limit len to 65536 to avoid spending too long with preemption
	 * disabled. (Of course, in practice len is nearly always 4096 anyway.)
	 */
	if (static_branch_likely(&have_sha_ni) && len >= SHA256_BLOCK_SIZE &&
	    len <= 65536 && likely(irq_fpu_usable())) {
		kernel_fpu_begin();
		sha256_ni_finup2x(ctx, data1, data2, len, out1, out2);
		kernel_fpu_end();
		kmsan_unpoison_memory(out1, SHA256_DIGEST_SIZE);
		kmsan_unpoison_memory(out2, SHA256_DIGEST_SIZE);
		return true;
	}
	return false;
}

static bool sha256_finup_2x_is_optimized_arch(void)
{
	return static_key_enabled(&have_sha_ni);
}

#define sha256_mod_init_arch sha256_mod_init_arch
static void sha256_mod_init_arch(void)
{
	if (boot_cpu_has(X86_FEATURE_SHA_NI)) {
		static_call_update(sha256_blocks_x86, sha256_blocks_ni);
		static_branch_enable(&have_sha_ni);
	} else if (cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM,
				     NULL) &&
		   boot_cpu_has(X86_FEATURE_AVX)) {
		if (boot_cpu_has(X86_FEATURE_AVX2) &&
		    boot_cpu_has(X86_FEATURE_BMI2))
			static_call_update(sha256_blocks_x86,
					   sha256_blocks_avx2);
		else
			static_call_update(sha256_blocks_x86,
					   sha256_blocks_avx);
	} else if (boot_cpu_has(X86_FEATURE_SSSE3)) {
		static_call_update(sha256_blocks_x86, sha256_blocks_ssse3);
	}
}
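/*
 * Illustrative sketch (an assumption): a caller of the library interface
 * that the accelerated block function above ultimately backs.  The
 * one-shot sha256() helper and SHA256_DIGEST_SIZE come from
 * <crypto/sha2.h>; my_digest_buffer() is a made-up name.
 */
#include <crypto/sha2.h>
#include <linux/printk.h>

static void my_digest_buffer(const u8 *data, size_t len)
{
	u8 digest[SHA256_DIGEST_SIZE];

	sha256(data, len, digest);
	print_hex_dump_bytes("sha256: ", DUMP_PREFIX_NONE, digest,
			     sizeof(digest));
}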
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 /* SPDX-License-Identifier: GPL-2.0 */ /* * Events for filesystem locks * * Copyright 2013 Jeff Layton <jlayton@poochiereds.net> */ #undef TRACE_SYSTEM #define TRACE_SYSTEM filelock #if !defined(_TRACE_FILELOCK_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_FILELOCK_H #include <linux/tracepoint.h> #include <linux/fs.h> #include <linux/device.h> #include <linux/kdev_t.h> #define show_fl_flags(val) \ __print_flags(val, "|", \ { FL_POSIX, "FL_POSIX" }, \ { FL_FLOCK, "FL_FLOCK" }, \ { FL_DELEG, "FL_DELEG" }, \ { FL_ACCESS, "FL_ACCESS" }, \ { FL_EXISTS, "FL_EXISTS" }, \ { FL_LEASE, "FL_LEASE" }, \ { FL_CLOSE, "FL_CLOSE" }, \ { FL_SLEEP, "FL_SLEEP" }, \ { FL_DOWNGRADE_PENDING, "FL_DOWNGRADE_PENDING" }, \ { FL_UNLOCK_PENDING, "FL_UNLOCK_PENDING" }, \ { FL_OFDLCK, "FL_OFDLCK" }, \ { FL_RECLAIM, "FL_RECLAIM"}) #define show_fl_type(val) \ __print_symbolic(val, \ { F_RDLCK, "F_RDLCK" }, \ { F_WRLCK, "F_WRLCK" }, \ { F_UNLCK, "F_UNLCK" }) TRACE_EVENT(locks_get_lock_context, TP_PROTO(struct inode *inode, int type, struct file_lock_context *ctx), TP_ARGS(inode, type, ctx), TP_STRUCT__entry( __field(unsigned long, i_ino) __field(dev_t, s_dev) __field(unsigned char, type) __field(struct file_lock_context *, ctx) ), TP_fast_assign( __entry->s_dev = inode->i_sb->s_dev; __entry->i_ino = inode->i_ino; __entry->type = type; __entry->ctx = ctx; ), TP_printk("dev=0x%x:0x%x ino=0x%lx type=%s ctx=%p", MAJOR(__entry->s_dev), MINOR(__entry->s_dev), __entry->i_ino, show_fl_type(__entry->type), __entry->ctx) ); DECLARE_EVENT_CLASS(filelock_lock, TP_PROTO(struct inode *inode, struct file_lock *fl, int ret), TP_ARGS(inode, fl, ret), TP_STRUCT__entry( __field(struct file_lock *, fl) __field(unsigned long, i_ino) __field(dev_t, s_dev) __field(struct file_lock_core *, blocker) __field(fl_owner_t, owner) __field(unsigned int, pid) __field(unsigned int, flags) __field(unsigned char, type) __field(loff_t, fl_start) __field(loff_t, fl_end) __field(int, ret) ), TP_fast_assign( __entry->fl = fl ? fl : NULL; __entry->s_dev = inode->i_sb->s_dev; __entry->i_ino = inode->i_ino; __entry->blocker = fl ? fl->c.flc_blocker : NULL; __entry->owner = fl ? fl->c.flc_owner : NULL; __entry->pid = fl ? fl->c.flc_pid : 0; __entry->flags = fl ? fl->c.flc_flags : 0; __entry->type = fl ? fl->c.flc_type : 0; __entry->fl_start = fl ? fl->fl_start : 0; __entry->fl_end = fl ? 
fl->fl_end : 0; __entry->ret = ret; ), TP_printk("fl=%p dev=0x%x:0x%x ino=0x%lx fl_blocker=%p fl_owner=%p fl_pid=%u fl_flags=%s fl_type=%s fl_start=%lld fl_end=%lld ret=%d", __entry->fl, MAJOR(__entry->s_dev), MINOR(__entry->s_dev), __entry->i_ino, __entry->blocker, __entry->owner, __entry->pid, show_fl_flags(__entry->flags), show_fl_type(__entry->type), __entry->fl_start, __entry->fl_end, __entry->ret) ); DEFINE_EVENT(filelock_lock, posix_lock_inode, TP_PROTO(struct inode *inode, struct file_lock *fl, int ret), TP_ARGS(inode, fl, ret)); DEFINE_EVENT(filelock_lock, fcntl_setlk, TP_PROTO(struct inode *inode, struct file_lock *fl, int ret), TP_ARGS(inode, fl, ret)); DEFINE_EVENT(filelock_lock, locks_remove_posix, TP_PROTO(struct inode *inode, struct file_lock *fl, int ret), TP_ARGS(inode, fl, ret)); DEFINE_EVENT(filelock_lock, flock_lock_inode, TP_PROTO(struct inode *inode, struct file_lock *fl, int ret), TP_ARGS(inode, fl, ret)); DECLARE_EVENT_CLASS(filelock_lease, TP_PROTO(struct inode *inode, struct file_lease *fl), TP_ARGS(inode, fl), TP_STRUCT__entry( __field(struct file_lease *, fl) __field(unsigned long, i_ino) __field(dev_t, s_dev) __field(struct file_lock_core *, blocker) __field(fl_owner_t, owner) __field(unsigned int, flags) __field(unsigned char, type) __field(unsigned long, break_time) __field(unsigned long, downgrade_time) ), TP_fast_assign( __entry->fl = fl ? fl : NULL; __entry->s_dev = inode->i_sb->s_dev; __entry->i_ino = inode->i_ino; __entry->blocker = fl ? fl->c.flc_blocker : NULL; __entry->owner = fl ? fl->c.flc_owner : NULL; __entry->flags = fl ? fl->c.flc_flags : 0; __entry->type = fl ? fl->c.flc_type : 0; __entry->break_time = fl ? fl->fl_break_time : 0; __entry->downgrade_time = fl ? fl->fl_downgrade_time : 0; ), TP_printk("fl=%p dev=0x%x:0x%x ino=0x%lx fl_blocker=%p fl_owner=%p fl_flags=%s fl_type=%s fl_break_time=%lu fl_downgrade_time=%lu", __entry->fl, MAJOR(__entry->s_dev), MINOR(__entry->s_dev), __entry->i_ino, __entry->blocker, __entry->owner, show_fl_flags(__entry->flags), show_fl_type(__entry->type), __entry->break_time, __entry->downgrade_time) ); DEFINE_EVENT(filelock_lease, break_lease_noblock, TP_PROTO(struct inode *inode, struct file_lease *fl), TP_ARGS(inode, fl)); DEFINE_EVENT(filelock_lease, break_lease_block, TP_PROTO(struct inode *inode, struct file_lease *fl), TP_ARGS(inode, fl)); DEFINE_EVENT(filelock_lease, break_lease_unblock, TP_PROTO(struct inode *inode, struct file_lease *fl), TP_ARGS(inode, fl)); DEFINE_EVENT(filelock_lease, generic_delete_lease, TP_PROTO(struct inode *inode, struct file_lease *fl), TP_ARGS(inode, fl)); DEFINE_EVENT(filelock_lease, time_out_leases, TP_PROTO(struct inode *inode, struct file_lease *fl), TP_ARGS(inode, fl)); TRACE_EVENT(generic_add_lease, TP_PROTO(struct inode *inode, struct file_lease *fl), TP_ARGS(inode, fl), TP_STRUCT__entry( __field(unsigned long, i_ino) __field(int, wcount) __field(int, rcount) __field(int, icount) __field(dev_t, s_dev) __field(fl_owner_t, owner) __field(unsigned int, flags) __field(unsigned char, type) ), TP_fast_assign( __entry->s_dev = inode->i_sb->s_dev; __entry->i_ino = inode->i_ino; __entry->wcount = atomic_read(&inode->i_writecount); __entry->rcount = atomic_read(&inode->i_readcount); __entry->icount = icount_read(inode); __entry->owner = fl->c.flc_owner; __entry->flags = fl->c.flc_flags; __entry->type = fl->c.flc_type; ), TP_printk("dev=0x%x:0x%x ino=0x%lx wcount=%d rcount=%d icount=%d fl_owner=%p fl_flags=%s fl_type=%s", MAJOR(__entry->s_dev), MINOR(__entry->s_dev), __entry->i_ino, 
__entry->wcount, __entry->rcount, __entry->icount, __entry->owner, show_fl_flags(__entry->flags), show_fl_type(__entry->type)) ); TRACE_EVENT(leases_conflict, TP_PROTO(bool conflict, struct file_lease *lease, struct file_lease *breaker), TP_ARGS(conflict, lease, breaker), TP_STRUCT__entry( __field(void *, lease) __field(void *, breaker) __field(unsigned int, l_fl_flags) __field(unsigned int, b_fl_flags) __field(unsigned char, l_fl_type) __field(unsigned char, b_fl_type) __field(bool, conflict) ), TP_fast_assign( __entry->lease = lease; __entry->l_fl_flags = lease->c.flc_flags; __entry->l_fl_type = lease->c.flc_type; __entry->breaker = breaker; __entry->b_fl_flags = breaker->c.flc_flags; __entry->b_fl_type = breaker->c.flc_type; __entry->conflict = conflict; ), TP_printk("conflict %d: lease=%p fl_flags=%s fl_type=%s; breaker=%p fl_flags=%s fl_type=%s", __entry->conflict, __entry->lease, show_fl_flags(__entry->l_fl_flags), show_fl_type(__entry->l_fl_type), __entry->breaker, show_fl_flags(__entry->b_fl_flags), show_fl_type(__entry->b_fl_type)) ); #endif /* _TRACE_FILELOCK_H */ /* This part must be outside protection */ #include <trace/define_trace.h>
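/*
 * Illustrative sketch (an assumption): how fs/locks.c-style code fires one
 * of the events declared above.  DEFINE_EVENT(filelock_lock,
 * posix_lock_inode, ...) generates a trace_posix_lock_inode() helper;
 * exactly one compilation unit defines CREATE_TRACE_POINTS before
 * including the header.  my_do_posix_lock() is a made-up name.
 */
#define CREATE_TRACE_POINTS
#include <trace/events/filelock.h>

static int my_do_posix_lock(struct inode *inode, struct file_lock *fl)
{
	int error = 0;

	/* ... take or queue the lock, setting error accordingly ... */
	trace_posix_lock_inode(inode, fl, error);
	return error;
}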
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __ASM_GENERIC_CHECKSUM_H
#define __ASM_GENERIC_CHECKSUM_H

#include <linux/bitops.h>

/*
 * computes the checksum of a memory block at buff, length len,
 * and adds in "sum" (32-bit)
 *
 * returns a 32-bit number suitable for feeding into itself
 * or csum_tcpudp_magic
 *
 * this function must be called with even lengths, except
 * for the last fragment, which may be odd
 *
 * it's best to have buff aligned on a 32-bit boundary
 */
extern __wsum csum_partial(const void *buff, int len, __wsum sum);

#ifndef ip_fast_csum
/*
 * This is a version of ip_compute_csum() optimized for IP headers,
 * which always checksum on 4 octet boundaries.
 */
extern __sum16 ip_fast_csum(const void *iph, unsigned int ihl);
#endif

#ifndef csum_fold
/*
 * Fold a partial checksum
 */
static inline __sum16 csum_fold(__wsum csum)
{
	u32 sum = (__force u32)csum;

	return (__force __sum16)((~sum - ror32(sum, 16)) >> 16);
}
#endif

#ifndef csum_tcpudp_nofold
/*
 * computes the checksum of the TCP/UDP pseudo-header
 * returns a 16-bit checksum, already complemented
 */
extern __wsum
csum_tcpudp_nofold(__be32 saddr, __be32 daddr, __u32 len,
		   __u8 proto, __wsum sum);
#endif

#ifndef csum_tcpudp_magic
static inline __sum16
csum_tcpudp_magic(__be32 saddr, __be32 daddr, __u32 len,
		  __u8 proto, __wsum sum)
{
	return csum_fold(csum_tcpudp_nofold(saddr, daddr, len, proto, sum));
}
#endif

/*
 * this routine is used for miscellaneous IP-like checksums, mainly
 * in icmp.c
 */
extern __sum16 ip_compute_csum(const void *buff, int len);

#endif /* __ASM_GENERIC_CHECKSUM_H */
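/*
 * Illustrative sketch (an assumption): computing a TCP checksum for an
 * IPv4 packet with the helpers declared above; the pseudo-header is
 * folded in by csum_tcpudp_magic().  my_tcp_csum() is a made-up name.
 */
#include <linux/in.h>
#include <net/checksum.h>

static __sum16 my_tcp_csum(__be32 saddr, __be32 daddr,
			   const void *tcp_hdr_and_payload, int len)
{
	/* Sum the TCP header + payload, then fold in the pseudo-header. */
	__wsum csum = csum_partial(tcp_hdr_and_payload, len, 0);

	return csum_tcpudp_magic(saddr, daddr, len, IPPROTO_TCP, csum);
}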
/*
 * Copyright (c) 2004, 2005 Mellanox Technologies Ltd. All rights reserved.
 * Copyright (c) 2004, 2005 Infinicon Corporation. All rights reserved.
 * Copyright (c) 2004, 2005 Intel Corporation. All rights reserved.
 * Copyright (c) 2004, 2005 Topspin Corporation. All rights reserved.
 * Copyright (c) 2004-2007 Voltaire Corporation. All rights reserved.
 * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses. You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
* */ #include <linux/slab.h> #include <linux/string.h> #include "agent.h" #include "smi.h" #include "mad_priv.h" #define SPFX "ib_agent: " struct ib_agent_port_private { struct list_head port_list; struct ib_mad_agent *agent[2]; }; static DEFINE_SPINLOCK(ib_agent_port_list_lock); static LIST_HEAD(ib_agent_port_list); static struct ib_agent_port_private * __ib_get_agent_port(const struct ib_device *device, int port_num) { struct ib_agent_port_private *entry; list_for_each_entry(entry, &ib_agent_port_list, port_list) { /* Need to check both agent[0] and agent[1], as an agent port * may only have one of them */ if (entry->agent[0] && entry->agent[0]->device == device && entry->agent[0]->port_num == port_num) return entry; if (entry->agent[1] && entry->agent[1]->device == device && entry->agent[1]->port_num == port_num) return entry; } return NULL; } static struct ib_agent_port_private * ib_get_agent_port(const struct ib_device *device, int port_num) { struct ib_agent_port_private *entry; unsigned long flags; spin_lock_irqsave(&ib_agent_port_list_lock, flags); entry = __ib_get_agent_port(device, port_num); spin_unlock_irqrestore(&ib_agent_port_list_lock, flags); return entry; } void agent_send_response(const struct ib_mad_hdr *mad_hdr, const struct ib_grh *grh, const struct ib_wc *wc, const struct ib_device *device, int port_num, int qpn, size_t resp_mad_len, bool opa) { struct ib_agent_port_private *port_priv; struct ib_mad_agent *agent; struct ib_mad_send_buf *send_buf; struct ib_ah *ah; struct ib_mad_send_wr_private *mad_send_wr; if (rdma_cap_ib_switch(device)) port_priv = ib_get_agent_port(device, 0); else port_priv = ib_get_agent_port(device, port_num); if (!port_priv) { dev_err(&device->dev, "Unable to find port agent\n"); return; } agent = port_priv->agent[qpn]; ah = ib_create_ah_from_wc(agent->qp->pd, wc, grh, port_num); if (IS_ERR(ah)) { dev_err(&device->dev, "ib_create_ah_from_wc error %pe\n", ah); return; } if (opa && mad_hdr->base_version != OPA_MGMT_BASE_VERSION) resp_mad_len = IB_MGMT_MAD_SIZE; send_buf = ib_create_send_mad(agent, wc->src_qp, wc->pkey_index, 0, IB_MGMT_MAD_HDR, resp_mad_len - IB_MGMT_MAD_HDR, GFP_KERNEL, mad_hdr->base_version); if (IS_ERR(send_buf)) { dev_err(&device->dev, "ib_create_send_mad error\n"); goto err1; } memcpy(send_buf->mad, mad_hdr, resp_mad_len); send_buf->ah = ah; if (rdma_cap_ib_switch(device)) { mad_send_wr = container_of(send_buf, struct ib_mad_send_wr_private, send_buf); mad_send_wr->send_wr.port_num = port_num; } if (ib_post_send_mad(send_buf, NULL)) { dev_err(&device->dev, "ib_post_send_mad error\n"); goto err2; } return; err2: ib_free_send_mad(send_buf); err1: rdma_destroy_ah(ah, RDMA_DESTROY_AH_SLEEPABLE); } static void agent_send_handler(struct ib_mad_agent *mad_agent, struct ib_mad_send_wc *mad_send_wc) { rdma_destroy_ah(mad_send_wc->send_buf->ah, RDMA_DESTROY_AH_SLEEPABLE); ib_free_send_mad(mad_send_wc->send_buf); } int ib_agent_port_open(struct ib_device *device, int port_num) { struct ib_agent_port_private *port_priv; unsigned long flags; int ret; /* Create new device info */ port_priv = kzalloc(sizeof *port_priv, GFP_KERNEL); if (!port_priv) { ret = -ENOMEM; goto error1; } if (rdma_cap_ib_smi(device, port_num)) { /* Obtain send only MAD agent for SMI QP */ port_priv->agent[0] = ib_register_mad_agent(device, port_num, IB_QPT_SMI, NULL, 0, &agent_send_handler, NULL, NULL, 0); if (IS_ERR(port_priv->agent[0])) { ret = PTR_ERR(port_priv->agent[0]); goto error2; } } if (rdma_cap_ib_cm(device, port_num)) { /* Obtain send only MAD agent for 
GSI QP */ port_priv->agent[1] = ib_register_mad_agent(device, port_num, IB_QPT_GSI, NULL, 0, &agent_send_handler, NULL, NULL, 0); if (IS_ERR(port_priv->agent[1])) { ret = PTR_ERR(port_priv->agent[1]); goto error3; } } spin_lock_irqsave(&ib_agent_port_list_lock, flags); list_add_tail(&port_priv->port_list, &ib_agent_port_list); spin_unlock_irqrestore(&ib_agent_port_list_lock, flags); return 0; error3: if (port_priv->agent[0]) ib_unregister_mad_agent(port_priv->agent[0]); error2: kfree(port_priv); error1: return ret; } int ib_agent_port_close(struct ib_device *device, int port_num) { struct ib_agent_port_private *port_priv; unsigned long flags; spin_lock_irqsave(&ib_agent_port_list_lock, flags); port_priv = __ib_get_agent_port(device, port_num); if (port_priv == NULL) { spin_unlock_irqrestore(&ib_agent_port_list_lock, flags); dev_err(&device->dev, "Port %d not found\n", port_num); return -ENODEV; } list_del(&port_priv->port_list); spin_unlock_irqrestore(&ib_agent_port_list_lock, flags); if (port_priv->agent[1]) ib_unregister_mad_agent(port_priv->agent[1]); if (port_priv->agent[0]) ib_unregister_mad_agent(port_priv->agent[0]); kfree(port_priv); return 0; }
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_PAGE_64_H
#define _ASM_X86_PAGE_64_H

#include <asm/page_64_types.h>

#ifndef __ASSEMBLER__
#include <asm/cpufeatures.h>
#include <asm/alternative.h>

#include <linux/kmsan-checks.h>
#include <linux/mmdebug.h>

/* duplicated to the one in bootmem.h */
extern unsigned long max_pfn;
extern unsigned long phys_base;

extern unsigned long page_offset_base;
extern unsigned long vmalloc_base;
extern unsigned long vmemmap_base;
extern unsigned long direct_map_physmem_end;

static __always_inline unsigned long __phys_addr_nodebug(unsigned long x)
{
	unsigned long y = x - __START_KERNEL_map;

	/* use the carry flag to determine if x was < __START_KERNEL_map */
	x = y + ((x > y) ? phys_base : (__START_KERNEL_map - PAGE_OFFSET));

	return x;
}

#ifdef CONFIG_DEBUG_VIRTUAL
extern unsigned long __phys_addr(unsigned long);
#else
#define __phys_addr(x)		__phys_addr_nodebug(x)
#endif

static inline unsigned long __phys_addr_symbol(unsigned long x)
{
	unsigned long y = x - __START_KERNEL_map;

	/* only check upper bounds since lower bounds will trigger carry */
	VIRTUAL_BUG_ON(y >= KERNEL_IMAGE_SIZE);

	return y + phys_base;
}

#define __phys_reloc_hide(x)	(x)

void clear_page_orig(void *page);
void clear_page_rep(void *page);
void clear_page_erms(void *page);

KCFI_REFERENCE(clear_page_orig);
KCFI_REFERENCE(clear_page_rep);
KCFI_REFERENCE(clear_page_erms);

static inline void clear_page(void *page)
{
	/*
	 * Clean up KMSAN metadata for the page being cleared. The assembly call
	 * below clobbers @page, so we perform unpoisoning before it.
	 */
	kmsan_unpoison_memory(page, PAGE_SIZE);
	alternative_call_2(clear_page_orig,
			   clear_page_rep, X86_FEATURE_REP_GOOD,
			   clear_page_erms, X86_FEATURE_ERMS,
			   "=D" (page),
			   "D" (page),
			   "cc", "memory", "rax", "rcx");
}

void copy_page(void *to, void *from);
KCFI_REFERENCE(copy_page);

/*
 * User space process size. This is the first address outside the user range.
 * There are a few constraints that determine this:
 *
 * On Intel CPUs, if a SYSCALL instruction is at the highest canonical
 * address, then that syscall will enter the kernel with a
 * non-canonical return address, and SYSRET will explode dangerously.
 * We avoid this particular problem by preventing anything
 * from being mapped at the maximum canonical address.
 *
 * On AMD CPUs in the Ryzen family, there's a nasty bug in which the
 * CPUs malfunction if they execute code from the highest canonical page.
 * They'll speculate right off the end of the canonical space, and
 * bad things happen. This is worked around in the same way as the
 * Intel problem.
 *
 * With page table isolation enabled, we map the LDT in ... [stay tuned]
 */
static __always_inline unsigned long task_size_max(void)
{
	unsigned long ret;

	alternative_io("movq %[small],%0","movq %[large],%0",
		       X86_FEATURE_LA57,
		       "=r" (ret),
		       [small] "i" ((1ul << 47)-PAGE_SIZE),
		       [large] "i" ((1ul << 56)-PAGE_SIZE));

	return ret;
}

#endif	/* !__ASSEMBLER__ */

#ifdef CONFIG_X86_VSYSCALL_EMULATION
# define __HAVE_ARCH_GATE_AREA 1
#endif

#endif /* _ASM_X86_PAGE_64_H */
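/*
 * Illustrative sketch (an assumption): the common consumers of the
 * translations above go through virt_to_phys()/__pa() on direct-map
 * addresses, which bottom out in __phys_addr(); kernel-image symbols use
 * __pa_symbol() instead.  my_show_phys() is a made-up name.
 */
#include <linux/io.h>
#include <linux/slab.h>

static int my_show_phys(void)
{
	void *buf = kmalloc(64, GFP_KERNEL);
	phys_addr_t pa;

	if (!buf)
		return -ENOMEM;

	pa = virt_to_phys(buf);		/* resolves via __phys_addr() above */
	pr_info("virt %px -> phys %pa\n", buf, &pa);

	kfree(buf);
	return 0;
}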
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_FORTIFY_STRING_H_
#define _LINUX_FORTIFY_STRING_H_

#include <linux/bitfield.h>
#include <linux/bug.h>
#include <linux/const.h>
#include <linux/limits.h>

#define __FORTIFY_INLINE extern __always_inline __gnu_inline __overloadable
#define __RENAME(x) __asm__(#x)

#define
FORTIFY_REASON_DIR(r) FIELD_GET(BIT(0), r) #define FORTIFY_REASON_FUNC(r) FIELD_GET(GENMASK(7, 1), r) #define FORTIFY_REASON(func, write) (FIELD_PREP(BIT(0), write) | \ FIELD_PREP(GENMASK(7, 1), func)) /* Overridden by KUnit tests. */ #ifndef fortify_panic # define fortify_panic(func, write, avail, size, retfail) \ __fortify_panic(FORTIFY_REASON(func, write), avail, size) #endif #ifndef fortify_warn_once # define fortify_warn_once(x...) WARN_ONCE(x) #endif #define FORTIFY_READ 0 #define FORTIFY_WRITE 1 #define EACH_FORTIFY_FUNC(macro) \ macro(strncpy), \ macro(strnlen), \ macro(strlen), \ macro(strscpy), \ macro(strlcat), \ macro(strcat), \ macro(strncat), \ macro(memset), \ macro(memcpy), \ macro(memmove), \ macro(memscan), \ macro(memcmp), \ macro(memchr), \ macro(memchr_inv), \ macro(kmemdup), \ macro(strcpy), \ macro(UNKNOWN), #define MAKE_FORTIFY_FUNC(func) FORTIFY_FUNC_##func enum fortify_func { EACH_FORTIFY_FUNC(MAKE_FORTIFY_FUNC) }; void __fortify_report(const u8 reason, const size_t avail, const size_t size); void __fortify_panic(const u8 reason, const size_t avail, const size_t size) __cold __noreturn; void __read_overflow(void) __compiletime_error("detected read beyond size of object (1st parameter)"); void __read_overflow2(void) __compiletime_error("detected read beyond size of object (2nd parameter)"); void __read_overflow2_field(size_t avail, size_t wanted) __compiletime_warning("detected read beyond size of field (2nd parameter); maybe use struct_group()?"); void __write_overflow(void) __compiletime_error("detected write beyond size of object (1st parameter)"); void __write_overflow_field(size_t avail, size_t wanted) __compiletime_warning("detected write beyond size of field (1st parameter); maybe use struct_group()?"); #define __compiletime_strlen(p) \ ({ \ char *__p = (char *)(p); \ size_t __ret = SIZE_MAX; \ const size_t __p_size = __member_size(p); \ if (__p_size != SIZE_MAX && \ __builtin_constant_p(*__p)) { \ size_t __p_len = __p_size - 1; \ if (__builtin_constant_p(__p[__p_len]) && \ __p[__p_len] == '\0') \ __ret = __builtin_strlen(__p); \ } \ __ret; \ }) #if defined(__SANITIZE_ADDRESS__) #if !defined(CONFIG_CC_HAS_KASAN_MEMINTRINSIC_PREFIX) && !defined(CONFIG_GENERIC_ENTRY) extern void *__underlying_memset(void *p, int c, __kernel_size_t size) __RENAME(memset); extern void *__underlying_memmove(void *p, const void *q, __kernel_size_t size) __RENAME(memmove); extern void *__underlying_memcpy(void *p, const void *q, __kernel_size_t size) __RENAME(memcpy); #elif defined(CONFIG_KASAN_GENERIC) extern void *__underlying_memset(void *p, int c, __kernel_size_t size) __RENAME(__asan_memset); extern void *__underlying_memmove(void *p, const void *q, __kernel_size_t size) __RENAME(__asan_memmove); extern void *__underlying_memcpy(void *p, const void *q, __kernel_size_t size) __RENAME(__asan_memcpy); #else /* CONFIG_KASAN_SW_TAGS */ extern void *__underlying_memset(void *p, int c, __kernel_size_t size) __RENAME(__hwasan_memset); extern void *__underlying_memmove(void *p, const void *q, __kernel_size_t size) __RENAME(__hwasan_memmove); extern void *__underlying_memcpy(void *p, const void *q, __kernel_size_t size) __RENAME(__hwasan_memcpy); #endif extern void *__underlying_memchr(const void *p, int c, __kernel_size_t size) __RENAME(memchr); extern int __underlying_memcmp(const void *p, const void *q, __kernel_size_t size) __RENAME(memcmp); extern char *__underlying_strcat(char *p, const char *q) __RENAME(strcat); extern char *__underlying_strcpy(char *p, const char *q) 
__RENAME(strcpy); extern __kernel_size_t __underlying_strlen(const char *p) __RENAME(strlen); extern char *__underlying_strncat(char *p, const char *q, __kernel_size_t count) __RENAME(strncat); extern char *__underlying_strncpy(char *p, const char *q, __kernel_size_t size) __RENAME(strncpy); #else #if defined(__SANITIZE_MEMORY__) /* * For KMSAN builds all memcpy/memset/memmove calls should be replaced by the * corresponding __msan_XXX functions. */ #include <linux/kmsan_string.h> #define __underlying_memcpy __msan_memcpy #define __underlying_memmove __msan_memmove #define __underlying_memset __msan_memset #else #define __underlying_memcpy __builtin_memcpy #define __underlying_memmove __builtin_memmove #define __underlying_memset __builtin_memset #endif #define __underlying_memchr __builtin_memchr #define __underlying_memcmp __builtin_memcmp #define __underlying_strcat __builtin_strcat #define __underlying_strcpy __builtin_strcpy #define __underlying_strlen __builtin_strlen #define __underlying_strncat __builtin_strncat #define __underlying_strncpy __builtin_strncpy #endif /** * unsafe_memcpy - memcpy implementation with no FORTIFY bounds checking * * @dst: Destination memory address to write to * @src: Source memory address to read from * @bytes: How many bytes to write to @dst from @src * @justification: Free-form text or comment describing why the use is needed * * This should be used for corner cases where the compiler cannot do the * right thing, or during transitions between APIs, etc. It should be used * very rarely, and includes a place for justification detailing where bounds * checking has happened, and why existing solutions cannot be employed. */ #define unsafe_memcpy(dst, src, bytes, justification) \ __underlying_memcpy(dst, src, bytes) /* * Clang's use of __builtin_*object_size() within inlines needs hinting via * __pass_*object_size(). The preference is to only ever use type 1 (member * size, rather than struct size), but there remain some stragglers using * type 0 that will be converted in the future. */ #if __has_builtin(__builtin_dynamic_object_size) #define POS __pass_dynamic_object_size(1) #define POS0 __pass_dynamic_object_size(0) #else #define POS __pass_object_size(1) #define POS0 __pass_object_size(0) #endif #define __compiletime_lessthan(bounds, length) ( \ __builtin_constant_p((bounds) < (length)) && \ (bounds) < (length) \ ) /** * strncpy - Copy a string to memory with non-guaranteed NUL padding * * @p: pointer to destination of copy * @q: pointer to NUL-terminated source string to copy * @size: bytes to write at @p * * If strlen(@q) >= @size, the copy of @q will stop after @size bytes, * and @p will NOT be NUL-terminated * * If strlen(@q) < @size, following the copy of @q, trailing NUL bytes * will be written to @p until @size total bytes have been written. * * Do not use this function. While FORTIFY_SOURCE tries to avoid * over-reads of @q, it cannot defend against writing unterminated * results to @p. Using strncpy() remains ambiguous and fragile. 
* Instead, please choose an alternative, so that the expectation * of @p's contents is unambiguous: * * +--------------------+--------------------+------------+ * | **p** needs to be: | padded to **size** | not padded | * +====================+====================+============+ * | NUL-terminated | strscpy_pad() | strscpy() | * +--------------------+--------------------+------------+ * | not NUL-terminated | strtomem_pad() | strtomem() | * +--------------------+--------------------+------------+ * * Note strscpy*()'s differing return values for detecting truncation, * and strtomem*()'s expectation that the destination is marked with * __nonstring when it is a character array. * */ __FORTIFY_INLINE __diagnose_as(__builtin_strncpy, 1, 2, 3) char *strncpy(char * const POS p, const char *q, __kernel_size_t size) { const size_t p_size = __member_size(p); if (__compiletime_lessthan(p_size, size)) __write_overflow(); if (p_size < size) fortify_panic(FORTIFY_FUNC_strncpy, FORTIFY_WRITE, p_size, size, p); return __underlying_strncpy(p, q, size); } extern __kernel_size_t __real_strnlen(const char *, __kernel_size_t) __RENAME(strnlen); /** * strnlen - Return bounded count of characters in a NUL-terminated string * * @p: pointer to NUL-terminated string to count. * @maxlen: maximum number of characters to count. * * Returns number of characters in @p (NOT including the final NUL), or * @maxlen, if no NUL has been found up to there. * */ __FORTIFY_INLINE __kernel_size_t strnlen(const char * const POS p, __kernel_size_t maxlen) { const size_t p_size = __member_size(p); const size_t p_len = __compiletime_strlen(p); size_t ret; /* We can take compile-time actions when maxlen is const. */ if (__builtin_constant_p(maxlen) && p_len != SIZE_MAX) { /* If p is const, we can use its compile-time-known len. */ if (maxlen >= p_size) return p_len; } /* Do not check characters beyond the end of p. */ ret = __real_strnlen(p, maxlen < p_size ? maxlen : p_size); if (p_size <= ret && maxlen != ret) fortify_panic(FORTIFY_FUNC_strnlen, FORTIFY_READ, p_size, ret + 1, ret); return ret; } /* * Defined after fortified strnlen to reuse it. However, it must still be * possible for strlen() to be used on compile-time strings for use in * static initializers (i.e. as a constant expression). */ /** * strlen - Return count of characters in a NUL-terminated string * * @p: pointer to NUL-terminated string to count. * * Do not use this function unless the string length is known at * compile-time. When @p is unterminated, this function may crash * or return unexpected counts that could lead to memory content * exposures. Prefer strnlen(). * * Returns number of characters in @p (NOT including the final NUL). * */ #define strlen(p) \ __builtin_choose_expr(__is_constexpr(__builtin_strlen(p)), \ __builtin_strlen(p), __fortify_strlen(p)) __FORTIFY_INLINE __diagnose_as(__builtin_strlen, 1) __kernel_size_t __fortify_strlen(const char * const POS p) { const size_t p_size = __member_size(p); __kernel_size_t ret; /* Give up if we don't know how large p is. */ if (p_size == SIZE_MAX) return __underlying_strlen(p); ret = strnlen(p, p_size); if (p_size <= ret) fortify_panic(FORTIFY_FUNC_strlen, FORTIFY_READ, p_size, ret + 1, ret); return ret; } /* Defined after fortified strnlen() to reuse it. */ extern ssize_t __real_strscpy(char *, const char *, size_t) __RENAME(sized_strscpy); __FORTIFY_INLINE ssize_t sized_strscpy(char * const POS p, const char * const POS q, size_t size) { /* Use string size rather than possible enclosing struct size. 
*/ const size_t p_size = __member_size(p); const size_t q_size = __member_size(q); size_t len; /* If we cannot get size of p and q default to call strscpy. */ if (p_size == SIZE_MAX && q_size == SIZE_MAX) return __real_strscpy(p, q, size); /* * If size can be known at compile time and is greater than * p_size, generate a compile time write overflow error. */ if (__compiletime_lessthan(p_size, size)) __write_overflow(); /* Short-circuit for compile-time known-safe lengths. */ if (__compiletime_lessthan(p_size, SIZE_MAX)) { len = __compiletime_strlen(q); if (len < SIZE_MAX && __compiletime_lessthan(len, size)) { __underlying_memcpy(p, q, len + 1); return len; } } /* * This call protects from read overflow, because len will default to q * length if it smaller than size. */ len = strnlen(q, size); /* * If len equals size, we will copy only size bytes which leads to * -E2BIG being returned. * Otherwise we will copy len + 1 because of the final '\O'. */ len = len == size ? size : len + 1; /* * Generate a runtime write overflow error if len is greater than * p_size. */ if (p_size < len) fortify_panic(FORTIFY_FUNC_strscpy, FORTIFY_WRITE, p_size, len, -E2BIG); /* * We can now safely call vanilla strscpy because we are protected from: * 1. Read overflow thanks to call to strnlen(). * 2. Write overflow thanks to above ifs. */ return __real_strscpy(p, q, len); } /* Defined after fortified strlen() to reuse it. */ extern size_t __real_strlcat(char *p, const char *q, size_t avail) __RENAME(strlcat); /** * strlcat - Append a string to an existing string * * @p: pointer to %NUL-terminated string to append to * @q: pointer to %NUL-terminated string to append from * @avail: Maximum bytes available in @p * * Appends %NUL-terminated string @q after the %NUL-terminated * string at @p, but will not write beyond @avail bytes total, * potentially truncating the copy from @q. @p will stay * %NUL-terminated only if a %NUL already existed within * the @avail bytes of @p. If so, the resulting number of * bytes copied from @q will be at most "@avail - strlen(@p) - 1". * * Do not use this function. While FORTIFY_SOURCE tries to avoid * read and write overflows, this is only possible when the sizes * of @p and @q are known to the compiler. Prefer building the * string with formatting, via scnprintf(), seq_buf, or similar. * * Returns total bytes that _would_ have been contained by @p * regardless of truncation, similar to snprintf(). If return * value is >= @avail, the string has been truncated. * */ __FORTIFY_INLINE size_t strlcat(char * const POS p, const char * const POS q, size_t avail) { const size_t p_size = __member_size(p); const size_t q_size = __member_size(q); size_t p_len, copy_len; size_t actual, wanted; /* Give up immediately if both buffer sizes are unknown. */ if (p_size == SIZE_MAX && q_size == SIZE_MAX) return __real_strlcat(p, q, avail); p_len = strnlen(p, avail); copy_len = strlen(q); wanted = actual = p_len + copy_len; /* Cannot append any more: report truncation. */ if (avail <= p_len) return wanted; /* Give up if string is already overflowed. */ if (p_size <= p_len) fortify_panic(FORTIFY_FUNC_strlcat, FORTIFY_READ, p_size, p_len + 1, wanted); if (actual >= avail) { copy_len = avail - p_len - 1; actual = p_len + copy_len; } /* Give up if copy will overflow. */ if (p_size <= actual) fortify_panic(FORTIFY_FUNC_strlcat, FORTIFY_WRITE, p_size, actual + 1, wanted); __underlying_memcpy(p + p_len, q, copy_len); p[actual] = '\0'; return wanted; } /* Defined after fortified strlcat() to reuse it. 
*/ /** * strcat - Append a string to an existing string * * @p: pointer to NUL-terminated string to append to * @q: pointer to NUL-terminated source string to append from * * Do not use this function. While FORTIFY_SOURCE tries to avoid * read and write overflows, this is only possible when the * destination buffer size is known to the compiler. Prefer * building the string with formatting, via scnprintf() or similar. * At the very least, use strncat(). * * Returns @p. * */ __FORTIFY_INLINE __diagnose_as(__builtin_strcat, 1, 2) char *strcat(char * const POS p, const char *q) { const size_t p_size = __member_size(p); const size_t wanted = strlcat(p, q, p_size); if (p_size <= wanted) fortify_panic(FORTIFY_FUNC_strcat, FORTIFY_WRITE, p_size, wanted + 1, p); return p; } /** * strncat - Append a string to an existing string * * @p: pointer to NUL-terminated string to append to * @q: pointer to source string to append from * @count: Maximum bytes to read from @q * * Appends at most @count bytes from @q (stopping at the first * NUL byte) after the NUL-terminated string at @p. @p will be * NUL-terminated. * * Do not use this function. While FORTIFY_SOURCE tries to avoid * read and write overflows, this is only possible when the sizes * of @p and @q are known to the compiler. Prefer building the * string with formatting, via scnprintf() or similar. * * Returns @p. * */ /* Defined after fortified strlen() and strnlen() to reuse them. */ __FORTIFY_INLINE __diagnose_as(__builtin_strncat, 1, 2, 3) char *strncat(char * const POS p, const char * const POS q, __kernel_size_t count) { const size_t p_size = __member_size(p); const size_t q_size = __member_size(q); size_t p_len, copy_len, total; if (p_size == SIZE_MAX && q_size == SIZE_MAX) return __underlying_strncat(p, q, count); p_len = strlen(p); copy_len = strnlen(q, count); total = p_len + copy_len + 1; if (p_size < total) fortify_panic(FORTIFY_FUNC_strncat, FORTIFY_WRITE, p_size, total, p); __underlying_memcpy(p + p_len, q, copy_len); p[p_len + copy_len] = '\0'; return p; } __FORTIFY_INLINE bool fortify_memset_chk(__kernel_size_t size, const size_t p_size, const size_t p_size_field) { if (__builtin_constant_p(size)) { /* * Length argument is a constant expression, so we * can perform compile-time bounds checking where * buffer sizes are also known at compile time. */ /* Error when size is larger than enclosing struct. */ if (__compiletime_lessthan(p_size_field, p_size) && __compiletime_lessthan(p_size, size)) __write_overflow(); /* Warn when write size is larger than dest field. */ if (__compiletime_lessthan(p_size_field, size)) __write_overflow_field(p_size_field, size); } /* * At this point, length argument may not be a constant expression, * so run-time bounds checking can be done where buffer sizes are * known. (This is not an "else" because the above checks may only * be compile-time warnings, and we want to still warn for run-time * overflows.) */ /* * Always stop accesses beyond the struct that contains the * field, when the buffer's remaining size is known. * (The SIZE_MAX test is to optimize away checks where the buffer * lengths are unknown.) 
*/ if (p_size != SIZE_MAX && p_size < size) fortify_panic(FORTIFY_FUNC_memset, FORTIFY_WRITE, p_size, size, true); return false; } #define __fortify_memset_chk(p, c, size, p_size, p_size_field) ({ \ size_t __fortify_size = (size_t)(size); \ fortify_memset_chk(__fortify_size, p_size, p_size_field), \ __underlying_memset(p, c, __fortify_size); \ }) /* * __struct_size() vs __member_size() must be captured here to avoid * evaluating argument side-effects further into the macro layers. */ #ifndef CONFIG_KMSAN #define memset(p, c, s) __fortify_memset_chk(p, c, s, \ __struct_size(p), __member_size(p)) #endif /* * To make sure the compiler can enforce protection against buffer overflows, * memcpy(), memmove(), and memset() must not be used beyond individual * struct members. If you need to copy across multiple members, please use * struct_group() to create a named mirror of an anonymous struct union. * (e.g. see struct sk_buff.) Read overflow checking is currently only * done when a write overflow is also present, or when building with W=1. * * Mitigation coverage matrix * Bounds checking at: * +-------+-------+-------+-------+ * | Compile time | Run time | * memcpy() argument sizes: | write | read | write | read | * dest source length +-------+-------+-------+-------+ * memcpy(known, known, constant) | y | y | n/a | n/a | * memcpy(known, unknown, constant) | y | n | n/a | V | * memcpy(known, known, dynamic) | n | n | B | B | * memcpy(known, unknown, dynamic) | n | n | B | V | * memcpy(unknown, known, constant) | n | y | V | n/a | * memcpy(unknown, unknown, constant) | n | n | V | V | * memcpy(unknown, known, dynamic) | n | n | V | B | * memcpy(unknown, unknown, dynamic) | n | n | V | V | * +-------+-------+-------+-------+ * * y = perform deterministic compile-time bounds checking * n = cannot perform deterministic compile-time bounds checking * n/a = no run-time bounds checking needed since compile-time deterministic * B = can perform run-time bounds checking (currently unimplemented) * V = vulnerable to run-time overflow (will need refactoring to solve) * */ __FORTIFY_INLINE bool fortify_memcpy_chk(__kernel_size_t size, const size_t p_size, const size_t q_size, const size_t p_size_field, const size_t q_size_field, const u8 func) { if (__builtin_constant_p(size)) { /* * Length argument is a constant expression, so we * can perform compile-time bounds checking where * buffer sizes are also known at compile time. */ /* Error when size is larger than enclosing struct. */ if (__compiletime_lessthan(p_size_field, p_size) && __compiletime_lessthan(p_size, size)) __write_overflow(); if (__compiletime_lessthan(q_size_field, q_size) && __compiletime_lessthan(q_size, size)) __read_overflow2(); /* Warn when write size argument larger than dest field. */ if (__compiletime_lessthan(p_size_field, size)) __write_overflow_field(p_size_field, size); /* * Warn for source field over-read when building with W=1 * or when an over-write happened, so both can be fixed at * the same time. */ if ((IS_ENABLED(KBUILD_EXTRA_WARN1) || __compiletime_lessthan(p_size_field, size)) && __compiletime_lessthan(q_size_field, size)) __read_overflow2_field(q_size_field, size); } /* * At this point, length argument may not be a constant expression, * so run-time bounds checking can be done where buffer sizes are * known. (This is not an "else" because the above checks may only * be compile-time warnings, and we want to still warn for run-time * overflows.) 
*/ /* * Always stop accesses beyond the struct that contains the * field, when the buffer's remaining size is known. * (The SIZE_MAX test is to optimize away checks where the buffer * lengths are unknown.) */ if (p_size != SIZE_MAX && p_size < size) fortify_panic(func, FORTIFY_WRITE, p_size, size, true); else if (q_size != SIZE_MAX && q_size < size) fortify_panic(func, FORTIFY_READ, q_size, size, true); /* * Warn when writing beyond destination field size. * * Note the implementation of __builtin_*object_size() behaves * like sizeof() when not directly referencing a flexible * array member, which means there will be many bounds checks * that will appear at run-time, without a way for them to be * detected at compile-time (as can be done when the destination * is specifically the flexible array member). * https://gcc.gnu.org/bugzilla/show_bug.cgi?id=101832 */ if (p_size_field != SIZE_MAX && p_size != p_size_field && p_size_field < size) return true; return false; } /* * To work around what seems to be an optimizer bug, the macro arguments * need to have const copies or the values end up changed by the time they * reach fortify_warn_once(). See commit 6f7630b1b5bc ("fortify: Capture * __bos() results in const temp vars") for more details. */ #define __fortify_memcpy_chk(p, q, size, p_size, q_size, \ p_size_field, q_size_field, op) ({ \ const size_t __fortify_size = (size_t)(size); \ const size_t __p_size = (p_size); \ const size_t __q_size = (q_size); \ const size_t __p_size_field = (p_size_field); \ const size_t __q_size_field = (q_size_field); \ /* Keep a mutable version of the size for the final copy. */ \ size_t __copy_size = __fortify_size; \ fortify_warn_once(fortify_memcpy_chk(__fortify_size, __p_size, \ __q_size, __p_size_field, \ __q_size_field, FORTIFY_FUNC_ ##op), \ #op ": detected field-spanning write (size %zu) of single %s (size %zu)\n", \ __fortify_size, \ "field \"" #p "\" at " FILE_LINE, \ __p_size_field); \ /* Hide only the run-time size from value range tracking to */ \ /* silence compile-time false positive bounds warnings. */ \ if (!__builtin_constant_p(__copy_size)) \ OPTIMIZER_HIDE_VAR(__copy_size); \ __underlying_##op(p, q, __copy_size); \ }) /* * Notes about compile-time buffer size detection: * * With these types... * * struct middle { * u16 a; * u8 middle_buf[16]; * int b; * }; * struct end { * u16 a; * u8 end_buf[16]; * }; * struct flex { * int a; * u8 flex_buf[]; * }; * * void func(TYPE *ptr) { ... } * * Cases where destination size cannot be currently detected: * - the size of ptr's object (seemingly by design, gcc & clang fail): * __builtin_object_size(ptr, 1) == SIZE_MAX * - the size of flexible arrays in ptr's obj (by design, dynamic size): * __builtin_object_size(ptr->flex_buf, 1) == SIZE_MAX * - the size of ANY array at the end of ptr's obj (gcc and clang bug): * __builtin_object_size(ptr->end_buf, 1) == SIZE_MAX * https://gcc.gnu.org/bugzilla/show_bug.cgi?id=101836 * * Cases where destination size is currently detected: * - the size of non-array members within ptr's object: * __builtin_object_size(ptr->a, 1) == 2 * - the size of non-flexible-array in the middle of ptr's obj: * __builtin_object_size(ptr->middle_buf, 1) == 16 * */ /* * __struct_size() vs __member_size() must be captured here to avoid * evaluating argument side-effects further into the macro layers. 
*/ #define memcpy(p, q, s) __fortify_memcpy_chk(p, q, s, \ __struct_size(p), __struct_size(q), \ __member_size(p), __member_size(q), \ memcpy) #define memmove(p, q, s) __fortify_memcpy_chk(p, q, s, \ __struct_size(p), __struct_size(q), \ __member_size(p), __member_size(q), \ memmove) extern void *__real_memscan(void *, int, __kernel_size_t) __RENAME(memscan); __FORTIFY_INLINE void *memscan(void * const POS0 p, int c, __kernel_size_t size) { const size_t p_size = __struct_size(p); if (__compiletime_lessthan(p_size, size)) __read_overflow(); if (p_size < size) fortify_panic(FORTIFY_FUNC_memscan, FORTIFY_READ, p_size, size, NULL); return __real_memscan(p, c, size); } __FORTIFY_INLINE __diagnose_as(__builtin_memcmp, 1, 2, 3) int memcmp(const void * const POS0 p, const void * const POS0 q, __kernel_size_t size) { const size_t p_size = __struct_size(p); const size_t q_size = __struct_size(q); if (__builtin_constant_p(size)) { if (__compiletime_lessthan(p_size, size)) __read_overflow(); if (__compiletime_lessthan(q_size, size)) __read_overflow2(); } if (p_size < size) fortify_panic(FORTIFY_FUNC_memcmp, FORTIFY_READ, p_size, size, INT_MIN); else if (q_size < size) fortify_panic(FORTIFY_FUNC_memcmp, FORTIFY_READ, q_size, size, INT_MIN); return __underlying_memcmp(p, q, size); } __FORTIFY_INLINE __diagnose_as(__builtin_memchr, 1, 2, 3) void *memchr(const void * const POS0 p, int c, __kernel_size_t size) { const size_t p_size = __struct_size(p); if (__compiletime_lessthan(p_size, size)) __read_overflow(); if (p_size < size) fortify_panic(FORTIFY_FUNC_memchr, FORTIFY_READ, p_size, size, NULL); return __underlying_memchr(p, c, size); } void *__real_memchr_inv(const void *s, int c, size_t n) __RENAME(memchr_inv); __FORTIFY_INLINE void *memchr_inv(const void * const POS0 p, int c, size_t size) { const size_t p_size = __struct_size(p); if (__compiletime_lessthan(p_size, size)) __read_overflow(); if (p_size < size) fortify_panic(FORTIFY_FUNC_memchr_inv, FORTIFY_READ, p_size, size, NULL); return __real_memchr_inv(p, c, size); } extern void *__real_kmemdup(const void *src, size_t len, gfp_t gfp) __RENAME(kmemdup_noprof) __realloc_size(2); __FORTIFY_INLINE void *kmemdup_noprof(const void * const POS0 p, size_t size, gfp_t gfp) { const size_t p_size = __struct_size(p); if (__compiletime_lessthan(p_size, size)) __read_overflow(); if (p_size < size) fortify_panic(FORTIFY_FUNC_kmemdup, FORTIFY_READ, p_size, size, __real_kmemdup(p, 0, gfp)); return __real_kmemdup(p, size, gfp); } #define kmemdup(...) alloc_hooks(kmemdup_noprof(__VA_ARGS__)) /** * strcpy - Copy a string into another string buffer * * @p: pointer to destination of copy * @q: pointer to NUL-terminated source string to copy * * Do not use this function. While FORTIFY_SOURCE tries to avoid * overflows, this is only possible when the sizes of @q and @p are * known to the compiler. Prefer strscpy(), though note its different * return values for detecting truncation. * * Returns @p. * */ /* Defined after fortified strlen to reuse it. */ __FORTIFY_INLINE __diagnose_as(__builtin_strcpy, 1, 2) char *strcpy(char * const POS p, const char * const POS q) { const size_t p_size = __member_size(p); const size_t q_size = __member_size(q); size_t size; /* If neither buffer size is known, immediately give up. */ if (__builtin_constant_p(p_size) && __builtin_constant_p(q_size) && p_size == SIZE_MAX && q_size == SIZE_MAX) return __underlying_strcpy(p, q); size = strlen(q) + 1; /* Compile-time check for const size overflow. 
*/ if (__compiletime_lessthan(p_size, size)) __write_overflow(); /* Run-time check for dynamic size overflow. */ if (p_size < size) fortify_panic(FORTIFY_FUNC_strcpy, FORTIFY_WRITE, p_size, size, p); __underlying_memcpy(p, q, size); return p; } /* Don't use these outside the FORTIFY_SOURCE implementation */ #undef __underlying_memchr #undef __underlying_memcmp #undef __underlying_strcat #undef __underlying_strcpy #undef __underlying_strlen #undef __underlying_strncat #undef __underlying_strncpy #undef POS #undef POS0 #endif /* _LINUX_FORTIFY_STRING_H_ */
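The selection table at the top of this header boils down to two questions per destination: must it remain NUL-terminated, and must the unused trailing bytes be padded? A minimal sketch of applying that decision is shown below; the struct, its fields, and the probe_id_fill() helper are hypothetical illustrations, not part of this header.

#include <linux/string.h>

/* Hypothetical example structure: one NUL-terminated field, one fixed-width
 * byte field that is deliberately not NUL-terminated (hence __nonstring). */
struct probe_id {
	char name[16];			/* NUL-terminated string */
	char tag[8] __nonstring;	/* padded, not NUL-terminated */
};

static void probe_id_fill(struct probe_id *id, const char *name, const char *tag)
{
	/* NUL-terminated destination, padding not required: strscpy(). */
	strscpy(id->name, name, sizeof(id->name));

	/* Not NUL-terminated, padded out to the field size: strtomem_pad(). */
	strtomem_pad(id->tag, tag, 0);
}

As the comment above notes, strscpy() signals truncation through its return value (-E2BIG) rather than by returning the source length, so callers that care about truncation should check for a negative result.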
// SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) 2005,2006,2007,2008 IBM Corporation * * Authors: * Serge Hallyn <serue@us.ibm.com> * Reiner Sailer <sailer@watson.ibm.com> * Mimi Zohar <zohar@us.ibm.com> * * File: ima_queue.c * Implements queues that store template measurements and * maintains aggregate over the stored measurements * in the pre-configured TPM PCR (if available). * The measurement list is append-only. No entry is * ever removed or changed during the boot-cycle. */ #include <linux/rculist.h> #include <linux/reboot.h> #include <linux/slab.h> #include "ima.h" #define AUDIT_CAUSE_LEN_MAX 32 /* pre-allocated array of tpm_digest structures to extend a PCR */ static struct tpm_digest *digests; LIST_HEAD(ima_measurements); /* list of all measurements */ #ifdef CONFIG_IMA_KEXEC static unsigned long binary_runtime_size; #else static unsigned long binary_runtime_size = ULONG_MAX; #endif /* key: inode (before secure-hashing a file) */ struct ima_h_table ima_htable = { .len = ATOMIC_LONG_INIT(0), .violations = ATOMIC_LONG_INIT(0), .queue[0 ... IMA_MEASURE_HTABLE_SIZE - 1] = HLIST_HEAD_INIT }; /* mutex protects atomicity of extending measurement list * and extending the TPM PCR aggregate. Since tpm_extend can take * long (and the tpm driver uses a mutex), we can't use the spinlock. */ static DEFINE_MUTEX(ima_extend_list_mutex); /* * Used internally by the kernel to suspend measurements. * Protected by ima_extend_list_mutex. */ static bool ima_measurements_suspended; /* look up the digest value in the hash table, and return the entry */ static struct ima_queue_entry *ima_lookup_digest_entry(u8 *digest_value, int pcr) { struct ima_queue_entry *qe, *ret = NULL; unsigned int key; int rc; key = ima_hash_key(digest_value); rcu_read_lock(); hlist_for_each_entry_rcu(qe, &ima_htable.queue[key], hnext) { rc = memcmp(qe->entry->digests[ima_hash_algo_idx].digest, digest_value, hash_digest_size[ima_hash_algo]); if ((rc == 0) && (qe->entry->pcr == pcr)) { ret = qe; break; } } rcu_read_unlock(); return ret; } /* * Calculate the memory required for serializing a single * binary_runtime_measurement list entry, which contains a * couple of variable length fields (e.g. template name and data).
*/ static int get_binary_runtime_size(struct ima_template_entry *entry) { int size = 0; size += sizeof(u32); /* pcr */ size += TPM_DIGEST_SIZE; size += sizeof(int); /* template name size field */ size += strlen(entry->template_desc->name); size += sizeof(entry->template_data_len); size += entry->template_data_len; return size; } /* ima_add_template_entry helper function: * - Add template entry to the measurement list and hash table, for * all entries except those carried across kexec. * * (Called with ima_extend_list_mutex held.) */ static int ima_add_digest_entry(struct ima_template_entry *entry, bool update_htable) { struct ima_queue_entry *qe; unsigned int key; qe = kmalloc(sizeof(*qe), GFP_KERNEL); if (qe == NULL) { pr_err("OUT OF MEMORY ERROR creating queue entry\n"); return -ENOMEM; } qe->entry = entry; INIT_LIST_HEAD(&qe->later); list_add_tail_rcu(&qe->later, &ima_measurements); atomic_long_inc(&ima_htable.len); if (update_htable) { key = ima_hash_key(entry->digests[ima_hash_algo_idx].digest); hlist_add_head_rcu(&qe->hnext, &ima_htable.queue[key]); } if (binary_runtime_size != ULONG_MAX) { int size; size = get_binary_runtime_size(entry); binary_runtime_size = (binary_runtime_size < ULONG_MAX - size) ? binary_runtime_size + size : ULONG_MAX; } return 0; } /* * Return the amount of memory required for serializing the * entire binary_runtime_measurement list, including the ima_kexec_hdr * structure. */ unsigned long ima_get_binary_runtime_size(void) { if (binary_runtime_size >= (ULONG_MAX - sizeof(struct ima_kexec_hdr))) return ULONG_MAX; else return binary_runtime_size + sizeof(struct ima_kexec_hdr); } static int ima_pcr_extend(struct tpm_digest *digests_arg, int pcr) { int result = 0; if (!ima_tpm_chip) return result; result = tpm_pcr_extend(ima_tpm_chip, pcr, digests_arg); if (result != 0) pr_err("Error Communicating to TPM chip, result: %d\n", result); return result; } /* * Add template entry to the measurement list and hash table, and * extend the pcr. * * On systems which support carrying the IMA measurement list across * kexec, maintain the total memory size required for serializing the * binary_runtime_measurements. */ int ima_add_template_entry(struct ima_template_entry *entry, int violation, const char *op, struct inode *inode, const unsigned char *filename) { u8 *digest = entry->digests[ima_hash_algo_idx].digest; struct tpm_digest *digests_arg = entry->digests; const char *audit_cause = "hash_added"; char tpm_audit_cause[AUDIT_CAUSE_LEN_MAX]; int audit_info = 1; int result = 0, tpmresult = 0; mutex_lock(&ima_extend_list_mutex); /* * Avoid appending to the measurement log when the TPM subsystem has * been shut down while preparing for system reboot. 
*/ if (ima_measurements_suspended) { audit_cause = "measurements_suspended"; audit_info = 0; result = -ENODEV; goto out; } if (!violation && !IS_ENABLED(CONFIG_IMA_DISABLE_HTABLE)) { if (ima_lookup_digest_entry(digest, entry->pcr)) { audit_cause = "hash_exists"; result = -EEXIST; goto out; } } result = ima_add_digest_entry(entry, !IS_ENABLED(CONFIG_IMA_DISABLE_HTABLE)); if (result < 0) { audit_cause = "ENOMEM"; audit_info = 0; goto out; } if (violation) /* invalidate pcr */ digests_arg = digests; tpmresult = ima_pcr_extend(digests_arg, entry->pcr); if (tpmresult != 0) { snprintf(tpm_audit_cause, AUDIT_CAUSE_LEN_MAX, "TPM_error(%d)", tpmresult); audit_cause = tpm_audit_cause; audit_info = 0; } out: mutex_unlock(&ima_extend_list_mutex); integrity_audit_msg(AUDIT_INTEGRITY_PCR, inode, filename, op, audit_cause, result, audit_info); return result; } int ima_restore_measurement_entry(struct ima_template_entry *entry) { int result = 0; mutex_lock(&ima_extend_list_mutex); result = ima_add_digest_entry(entry, 0); mutex_unlock(&ima_extend_list_mutex); return result; } static void ima_measurements_suspend(void) { mutex_lock(&ima_extend_list_mutex); ima_measurements_suspended = true; mutex_unlock(&ima_extend_list_mutex); } static int ima_reboot_notifier(struct notifier_block *nb, unsigned long action, void *data) { #ifdef CONFIG_IMA_KEXEC if (action == SYS_RESTART && data && !strcmp(data, "kexec reboot")) ima_measure_kexec_event("kexec_execute"); #endif ima_measurements_suspend(); return NOTIFY_DONE; } static struct notifier_block ima_reboot_nb = { .notifier_call = ima_reboot_notifier, }; void __init ima_init_reboot_notifier(void) { register_reboot_notifier(&ima_reboot_nb); } int __init ima_init_digests(void) { u16 digest_size; u16 crypto_id; int i; if (!ima_tpm_chip) return 0; digests = kcalloc(ima_tpm_chip->nr_allocated_banks, sizeof(*digests), GFP_NOFS); if (!digests) return -ENOMEM; for (i = 0; i < ima_tpm_chip->nr_allocated_banks; i++) { digests[i].alg_id = ima_tpm_chip->allocated_banks[i].alg_id; digest_size = ima_tpm_chip->allocated_banks[i].digest_size; crypto_id = ima_tpm_chip->allocated_banks[i].crypto_id; /* for unmapped TPM algorithms digest is still a padded SHA1 */ if (crypto_id == HASH_ALGO__LAST) digest_size = SHA1_DIGEST_SIZE; memset(digests[i].digest, 0xff, digest_size); } return 0; }
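ima_add_digest_entry() keeps binary_runtime_size as a saturating counter: once an addition would overflow, the total is pinned at ULONG_MAX, and ima_get_binary_runtime_size() then reports ULONG_MAX rather than a wrapped value. A small standalone sketch of that saturating arithmetic follows; it uses plain C with made-up entry sizes, not the kernel structures.

#include <limits.h>
#include <stdio.h>

/* Saturating add: once the running total cannot grow without wrapping,
 * it sticks at ULONG_MAX, the same policy used for binary_runtime_size. */
static unsigned long add_entry_size(unsigned long total, unsigned long entry)
{
	if (total == ULONG_MAX)
		return ULONG_MAX;
	return (total < ULONG_MAX - entry) ? total + entry : ULONG_MAX;
}

int main(void)
{
	unsigned long total = 0;

	total = add_entry_size(total, 150);		/* a typical entry */
	total = add_entry_size(total, ULONG_MAX - 10);	/* would wrap, so saturate */
	printf("total = %lu (ULONG_MAX = %lu)\n", total, ULONG_MAX);
	return 0;
}

Pinning the counter at ULONG_MAX instead of wrapping means a later kexec serialization pass can cheaply recognize "size unknown/too large" without re-walking the measurement list.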
1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 // SPDX-License-Identifier: GPL-2.0-only /* (C) 1999-2001 Paul `Rusty' Russell * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org> * (C) 2002-2013 Jozsef Kadlecsik <kadlec@netfilter.org> * (C) 2006-2012 Patrick McHardy <kaber@trash.net> */ #include <linux/types.h> #include <linux/timer.h> #include <linux/module.h> #include <linux/in.h> #include <linux/tcp.h> #include <linux/spinlock.h> #include <linux/skbuff.h> #include <linux/ipv6.h> #include <net/ip6_checksum.h> #include <linux/unaligned.h> #include <net/tcp.h> #include <linux/netfilter.h> #include <linux/netfilter_ipv4.h> #include <linux/netfilter_ipv6.h> #include <net/netfilter/nf_conntrack.h> #include <net/netfilter/nf_conntrack_l4proto.h> #include <net/netfilter/nf_conntrack_ecache.h> #include <net/netfilter/nf_conntrack_seqadj.h> #include <net/netfilter/nf_conntrack_synproxy.h> #include <net/netfilter/nf_conntrack_timeout.h> #include <net/netfilter/nf_log.h> #include <net/netfilter/ipv4/nf_conntrack_ipv4.h> #include <net/netfilter/ipv6/nf_conntrack_ipv6.h> /* FIXME: Examine ipfilter's timeouts and conntrack transitions more closely. They're more complex. --RR */ static const char *const tcp_conntrack_names[] = { "NONE", "SYN_SENT", "SYN_RECV", "ESTABLISHED", "FIN_WAIT", "CLOSE_WAIT", "LAST_ACK", "TIME_WAIT", "CLOSE", "SYN_SENT2", }; enum nf_ct_tcp_action { NFCT_TCP_IGNORE, NFCT_TCP_INVALID, NFCT_TCP_ACCEPT, }; #define SECS * HZ #define MINS * 60 SECS #define HOURS * 60 MINS #define DAYS * 24 HOURS static const unsigned int tcp_timeouts[TCP_CONNTRACK_TIMEOUT_MAX] = { [TCP_CONNTRACK_SYN_SENT] = 2 MINS, [TCP_CONNTRACK_SYN_RECV] = 60 SECS, [TCP_CONNTRACK_ESTABLISHED] = 5 DAYS, [TCP_CONNTRACK_FIN_WAIT] = 2 MINS, [TCP_CONNTRACK_CLOSE_WAIT] = 60 SECS, [TCP_CONNTRACK_LAST_ACK] = 30 SECS, [TCP_CONNTRACK_TIME_WAIT] = 2 MINS, [TCP_CONNTRACK_CLOSE] = 10 SECS, [TCP_CONNTRACK_SYN_SENT2] = 2 MINS, /* RFC1122 says the R2 limit should be at least 100 seconds. Linux uses 15 packets as limit, which corresponds to ~13-30min depending on RTO. */ [TCP_CONNTRACK_RETRANS] = 5 MINS, [TCP_CONNTRACK_UNACK] = 5 MINS, }; #define sNO TCP_CONNTRACK_NONE #define sSS TCP_CONNTRACK_SYN_SENT #define sSR TCP_CONNTRACK_SYN_RECV #define sES TCP_CONNTRACK_ESTABLISHED #define sFW TCP_CONNTRACK_FIN_WAIT #define sCW TCP_CONNTRACK_CLOSE_WAIT #define sLA TCP_CONNTRACK_LAST_ACK #define sTW TCP_CONNTRACK_TIME_WAIT #define sCL TCP_CONNTRACK_CLOSE #define sS2 TCP_CONNTRACK_SYN_SENT2 #define sIV TCP_CONNTRACK_MAX #define sIG TCP_CONNTRACK_IGNORE /* What TCP flags are set from RST/SYN/FIN/ACK. */ enum tcp_bit_set { TCP_SYN_SET, TCP_SYNACK_SET, TCP_FIN_SET, TCP_ACK_SET, TCP_RST_SET, TCP_NONE_SET, }; /* * The TCP state transition table needs a few words... * * We are the man in the middle. All the packets go through us * but might get lost in transit to the destination. * It is assumed that the destinations can't receive segments * we haven't seen. * * The checked segment is in window, but our windows are *not* * equivalent with the ones of the sender/receiver. We always * try to guess the state of the current sender. 
* * The meaning of the states are: * * NONE: initial state * SYN_SENT: SYN-only packet seen * SYN_SENT2: SYN-only packet seen from reply dir, simultaneous open * SYN_RECV: SYN-ACK packet seen * ESTABLISHED: ACK packet seen * FIN_WAIT: FIN packet seen * CLOSE_WAIT: ACK seen (after FIN) * LAST_ACK: FIN seen (after FIN) * TIME_WAIT: last ACK seen * CLOSE: closed connection (RST) * * Packets marked as IGNORED (sIG): * if they may be either invalid or valid * and the receiver may send back a connection * closing RST or a SYN/ACK. * * Packets marked as INVALID (sIV): * if we regard them as truly invalid packets */ static const u8 tcp_conntracks[2][6][TCP_CONNTRACK_MAX] = { { /* ORIGINAL */ /* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2 */ /*syn*/ { sSS, sSS, sIG, sIG, sIG, sIG, sIG, sSS, sSS, sS2 }, /* * sNO -> sSS Initialize a new connection * sSS -> sSS Retransmitted SYN * sS2 -> sS2 Late retransmitted SYN * sSR -> sIG * sES -> sIG Error: SYNs in window outside the SYN_SENT state * are errors. Receiver will reply with RST * and close the connection. * Or we are not in sync and hold a dead connection. * sFW -> sIG * sCW -> sIG * sLA -> sIG * sTW -> sSS Reopened connection (RFC 1122). * sCL -> sSS */ /* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2 */ /*synack*/ { sIV, sIV, sSR, sIV, sIV, sIV, sIV, sIV, sIV, sSR }, /* * sNO -> sIV Too late and no reason to do anything * sSS -> sIV Client can't send SYN and then SYN/ACK * sS2 -> sSR SYN/ACK sent to SYN2 in simultaneous open * sSR -> sSR Late retransmitted SYN/ACK in simultaneous open * sES -> sIV Invalid SYN/ACK packets sent by the client * sFW -> sIV * sCW -> sIV * sLA -> sIV * sTW -> sIV * sCL -> sIV */ /* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2 */ /*fin*/ { sIV, sIV, sFW, sFW, sLA, sLA, sLA, sTW, sCL, sIV }, /* * sNO -> sIV Too late and no reason to do anything... * sSS -> sIV Client migth not send FIN in this state: * we enforce waiting for a SYN/ACK reply first. * sS2 -> sIV * sSR -> sFW Close started. * sES -> sFW * sFW -> sLA FIN seen in both directions, waiting for * the last ACK. * Migth be a retransmitted FIN as well... * sCW -> sLA * sLA -> sLA Retransmitted FIN. Remain in the same state. * sTW -> sTW * sCL -> sCL */ /* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2 */ /*ack*/ { sES, sIV, sES, sES, sCW, sCW, sTW, sTW, sCL, sIV }, /* * sNO -> sES Assumed. * sSS -> sIV ACK is invalid: we haven't seen a SYN/ACK yet. * sS2 -> sIV * sSR -> sES Established state is reached. * sES -> sES :-) * sFW -> sCW Normal close request answered by ACK. * sCW -> sCW * sLA -> sTW Last ACK detected (RFC5961 challenged) * sTW -> sTW Retransmitted last ACK. Remain in the same state. * sCL -> sCL */ /* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2 */ /*rst*/ { sIV, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL }, /*none*/ { sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV } }, { /* REPLY */ /* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2 */ /*syn*/ { sIV, sS2, sIV, sIV, sIV, sIV, sIV, sSS, sIV, sS2 }, /* * sNO -> sIV Never reached. * sSS -> sS2 Simultaneous open * sS2 -> sS2 Retransmitted simultaneous SYN * sSR -> sIV Invalid SYN packets sent by the server * sES -> sIV * sFW -> sIV * sCW -> sIV * sLA -> sIV * sTW -> sSS Reopened connection, but server may have switched role * sCL -> sIV */ /* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2 */ /*synack*/ { sIV, sSR, sIG, sIG, sIG, sIG, sIG, sIG, sIG, sSR }, /* * sSS -> sSR Standard open. * sS2 -> sSR Simultaneous open * sSR -> sIG Retransmitted SYN/ACK, ignore it. 
* sES -> sIG Late retransmitted SYN/ACK? * sFW -> sIG Might be SYN/ACK answering ignored SYN * sCW -> sIG * sLA -> sIG * sTW -> sIG * sCL -> sIG */ /* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2 */ /*fin*/ { sIV, sIV, sFW, sFW, sLA, sLA, sLA, sTW, sCL, sIV }, /* * sSS -> sIV Server might not send FIN in this state. * sS2 -> sIV * sSR -> sFW Close started. * sES -> sFW * sFW -> sLA FIN seen in both directions. * sCW -> sLA * sLA -> sLA Retransmitted FIN. * sTW -> sTW * sCL -> sCL */ /* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2 */ /*ack*/ { sIV, sIG, sSR, sES, sCW, sCW, sTW, sTW, sCL, sIG }, /* * sSS -> sIG Might be a half-open connection. * sS2 -> sIG * sSR -> sSR Might answer late resent SYN. * sES -> sES :-) * sFW -> sCW Normal close request answered by ACK. * sCW -> sCW * sLA -> sTW Last ACK detected (RFC5961 challenged) * sTW -> sTW Retransmitted last ACK. * sCL -> sCL */ /* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2 */ /*rst*/ { sIV, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL }, /*none*/ { sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV } } }; #ifdef CONFIG_NF_CONNTRACK_PROCFS /* Print out the private part of the conntrack. */ static void tcp_print_conntrack(struct seq_file *s, struct nf_conn *ct) { if (test_bit(IPS_OFFLOAD_BIT, &ct->status)) return; seq_printf(s, "%s ", tcp_conntrack_names[ct->proto.tcp.state]); } #endif static unsigned int get_conntrack_index(const struct tcphdr *tcph) { if (tcph->rst) return TCP_RST_SET; else if (tcph->syn) return (tcph->ack ? TCP_SYNACK_SET : TCP_SYN_SET); else if (tcph->fin) return TCP_FIN_SET; else if (tcph->ack) return TCP_ACK_SET; else return TCP_NONE_SET; } /* TCP connection tracking based on 'Real Stateful TCP Packet Filtering in IP Filter' by Guido van Rooij. http://www.sane.nl/events/sane2000/papers.html http://www.darkart.com/mirrors/www.obfuscation.org/ipf/ The boundaries and the conditions are changed according to RFC793: the packet must intersect the window (i.e. segments may be after the right or before the left edge) and thus receivers may ACK segments after the right edge of the window. td_maxend = max(sack + max(win,1)) seen in reply packets td_maxwin = max(max(win, 1)) + (sack - ack) seen in sent packets td_maxwin += seq + len - sender.td_maxend if seq + len > sender.td_maxend td_end = max(seq + len) seen in sent packets I. Upper bound for valid data: seq <= sender.td_maxend II. Lower bound for valid data: seq + len >= sender.td_end - receiver.td_maxwin III. Upper bound for valid (s)ack: sack <= receiver.td_end IV. Lower bound for valid (s)ack: sack >= receiver.td_end - MAXACKWINDOW where sack is the highest right edge of sack block found in the packet or ack in the case of packet without SACK option. The upper bound limit for a valid (s)ack is not ignored - we doesn't have to deal with fragments. */ static inline __u32 segment_seq_plus_len(__u32 seq, size_t len, unsigned int dataoff, const struct tcphdr *tcph) { /* XXX Should I use payload length field in IP/IPv6 header ? * - YK */ return (seq + len - dataoff - tcph->doff*4 + (tcph->syn ? 1 : 0) + (tcph->fin ? 1 : 0)); } /* Fixme: what about big packets? */ #define MAXACKWINCONST 66000 #define MAXACKWINDOW(sender) \ ((sender)->td_maxwin > MAXACKWINCONST ? 
(sender)->td_maxwin \ : MAXACKWINCONST) /* * Simplified tcp_parse_options routine from tcp_input.c */ static void tcp_options(const struct sk_buff *skb, unsigned int dataoff, const struct tcphdr *tcph, struct ip_ct_tcp_state *state) { unsigned char buff[(15 * 4) - sizeof(struct tcphdr)]; const unsigned char *ptr; int length = (tcph->doff*4) - sizeof(struct tcphdr); if (!length) return; ptr = skb_header_pointer(skb, dataoff + sizeof(struct tcphdr), length, buff); if (!ptr) return; state->td_scale = 0; state->flags &= IP_CT_TCP_FLAG_BE_LIBERAL; while (length > 0) { int opcode=*ptr++; int opsize; switch (opcode) { case TCPOPT_EOL: return; case TCPOPT_NOP: /* Ref: RFC 793 section 3.1 */ length--; continue; default: if (length < 2) return; opsize=*ptr++; if (opsize < 2) /* "silly options" */ return; if (opsize > length) return; /* don't parse partial options */ if (opcode == TCPOPT_SACK_PERM && opsize == TCPOLEN_SACK_PERM) state->flags |= IP_CT_TCP_FLAG_SACK_PERM; else if (opcode == TCPOPT_WINDOW && opsize == TCPOLEN_WINDOW) { state->td_scale = *(u_int8_t *)ptr; if (state->td_scale > TCP_MAX_WSCALE) state->td_scale = TCP_MAX_WSCALE; state->flags |= IP_CT_TCP_FLAG_WINDOW_SCALE; } ptr += opsize - 2; length -= opsize; } } } static void tcp_sack(const struct sk_buff *skb, unsigned int dataoff, const struct tcphdr *tcph, __u32 *sack) { unsigned char buff[(15 * 4) - sizeof(struct tcphdr)]; const unsigned char *ptr; int length = (tcph->doff*4) - sizeof(struct tcphdr); __u32 tmp; if (!length) return; ptr = skb_header_pointer(skb, dataoff + sizeof(struct tcphdr), length, buff); if (!ptr) return; /* Fast path for timestamp-only option */ if (length == TCPOLEN_TSTAMP_ALIGNED && *(__be32 *)ptr == htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) | (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP)) return; while (length > 0) { int opcode = *ptr++; int opsize, i; switch (opcode) { case TCPOPT_EOL: return; case TCPOPT_NOP: /* Ref: RFC 793 section 3.1 */ length--; continue; default: if (length < 2) return; opsize = *ptr++; if (opsize < 2) /* "silly options" */ return; if (opsize > length) return; /* don't parse partial options */ if (opcode == TCPOPT_SACK && opsize >= (TCPOLEN_SACK_BASE + TCPOLEN_SACK_PERBLOCK) && !((opsize - TCPOLEN_SACK_BASE) % TCPOLEN_SACK_PERBLOCK)) { for (i = 0; i < (opsize - TCPOLEN_SACK_BASE); i += TCPOLEN_SACK_PERBLOCK) { tmp = get_unaligned_be32((__be32 *)(ptr+i)+1); if (after(tmp, *sack)) *sack = tmp; } return; } ptr += opsize - 2; length -= opsize; } } } static void tcp_init_sender(struct ip_ct_tcp_state *sender, struct ip_ct_tcp_state *receiver, const struct sk_buff *skb, unsigned int dataoff, const struct tcphdr *tcph, u32 end, u32 win, enum ip_conntrack_dir dir) { /* SYN-ACK in reply to a SYN * or SYN from reply direction in simultaneous open. */ sender->td_end = sender->td_maxend = end; sender->td_maxwin = (win == 0 ? 1 : win); tcp_options(skb, dataoff, tcph, sender); /* RFC 1323: * Both sides must send the Window Scale option * to enable window scaling in either direction. */ if (dir == IP_CT_DIR_REPLY && !(sender->flags & IP_CT_TCP_FLAG_WINDOW_SCALE && receiver->flags & IP_CT_TCP_FLAG_WINDOW_SCALE)) { sender->td_scale = 0; receiver->td_scale = 0; } } __printf(6, 7) static enum nf_ct_tcp_action nf_tcp_log_invalid(const struct sk_buff *skb, const struct nf_conn *ct, const struct nf_hook_state *state, const struct ip_ct_tcp_state *sender, enum nf_ct_tcp_action ret, const char *fmt, ...) 
{ const struct nf_tcp_net *tn = nf_tcp_pernet(nf_ct_net(ct)); struct va_format vaf; va_list args; bool be_liberal; be_liberal = sender->flags & IP_CT_TCP_FLAG_BE_LIBERAL || tn->tcp_be_liberal; if (be_liberal) return NFCT_TCP_ACCEPT; va_start(args, fmt); vaf.fmt = fmt; vaf.va = &args; nf_ct_l4proto_log_invalid(skb, ct, state, "%pV", &vaf); va_end(args); return ret; } static enum nf_ct_tcp_action tcp_in_window(struct nf_conn *ct, enum ip_conntrack_dir dir, unsigned int index, const struct sk_buff *skb, unsigned int dataoff, const struct tcphdr *tcph, const struct nf_hook_state *hook_state) { struct ip_ct_tcp *state = &ct->proto.tcp; struct ip_ct_tcp_state *sender = &state->seen[dir]; struct ip_ct_tcp_state *receiver = &state->seen[!dir]; __u32 seq, ack, sack, end, win, swin; bool in_recv_win, seq_ok; s32 receiver_offset; u16 win_raw; /* * Get the required data from the packet. */ seq = ntohl(tcph->seq); ack = sack = ntohl(tcph->ack_seq); win_raw = ntohs(tcph->window); win = win_raw; end = segment_seq_plus_len(seq, skb->len, dataoff, tcph); if (receiver->flags & IP_CT_TCP_FLAG_SACK_PERM) tcp_sack(skb, dataoff, tcph, &sack); /* Take into account NAT sequence number mangling */ receiver_offset = nf_ct_seq_offset(ct, !dir, ack - 1); ack -= receiver_offset; sack -= receiver_offset; if (sender->td_maxwin == 0) { /* * Initialize sender data. */ if (tcph->syn) { tcp_init_sender(sender, receiver, skb, dataoff, tcph, end, win, dir); if (!tcph->ack) /* Simultaneous open */ return NFCT_TCP_ACCEPT; } else { /* * We are in the middle of a connection, * its history is lost for us. * Let's try to use the data from the packet. */ sender->td_end = end; swin = win << sender->td_scale; sender->td_maxwin = (swin == 0 ? 1 : swin); sender->td_maxend = end + sender->td_maxwin; if (receiver->td_maxwin == 0) { /* We haven't seen traffic in the other * direction yet but we have to tweak window * tracking to pass III and IV until that * happens. */ receiver->td_end = receiver->td_maxend = sack; } else if (sack == receiver->td_end + 1) { /* Likely a reply to a keepalive. * Needed for III. */ receiver->td_end++; } } } else if (tcph->syn && after(end, sender->td_end) && (state->state == TCP_CONNTRACK_SYN_SENT || state->state == TCP_CONNTRACK_SYN_RECV)) { /* * RFC 793: "if a TCP is reinitialized ... then it need * not wait at all; it must only be sure to use sequence * numbers larger than those recently used." * * Re-init state for this direction, just like for the first * syn(-ack) reply, it might differ in seq, ack or tcp options. */ tcp_init_sender(sender, receiver, skb, dataoff, tcph, end, win, dir); if (dir == IP_CT_DIR_REPLY && !tcph->ack) return NFCT_TCP_ACCEPT; } if (!(tcph->ack)) { /* * If there is no ACK, just pretend it was set and OK. */ ack = sack = receiver->td_end; } else if (((tcp_flag_word(tcph) & (TCP_FLAG_ACK|TCP_FLAG_RST)) == (TCP_FLAG_ACK|TCP_FLAG_RST)) && (ack == 0)) { /* * Broken TCP stacks, that set ACK in RST packets as well * with zero ack value. */ ack = sack = receiver->td_end; } if (tcph->rst && seq == 0 && state->state == TCP_CONNTRACK_SYN_SENT) /* * RST sent answering SYN. 
*/ seq = end = sender->td_end; seq_ok = before(seq, sender->td_maxend + 1); if (!seq_ok) { u32 overshot = end - sender->td_maxend + 1; bool ack_ok; ack_ok = after(sack, receiver->td_end - MAXACKWINDOW(sender) - 1); in_recv_win = receiver->td_maxwin && after(end, sender->td_end - receiver->td_maxwin - 1); if (in_recv_win && ack_ok && overshot <= receiver->td_maxwin && before(sack, receiver->td_end + 1)) { /* Work around TCPs that send more bytes than allowed by * the receive window. * * If the (marked as invalid) packet is allowed to pass by * the ruleset and the peer acks this data, then its possible * all future packets will trigger 'ACK is over upper bound' check. * * Thus if only the sequence check fails then do update td_end so * possible ACK for this data can update internal state. */ sender->td_end = end; sender->flags |= IP_CT_TCP_FLAG_DATA_UNACKNOWLEDGED; return nf_tcp_log_invalid(skb, ct, hook_state, sender, NFCT_TCP_IGNORE, "%u bytes more than expected", overshot); } return nf_tcp_log_invalid(skb, ct, hook_state, sender, NFCT_TCP_INVALID, "SEQ is over upper bound %u (over the window of the receiver)", sender->td_maxend + 1); } if (!before(sack, receiver->td_end + 1)) return nf_tcp_log_invalid(skb, ct, hook_state, sender, NFCT_TCP_INVALID, "ACK is over upper bound %u (ACKed data not seen yet)", receiver->td_end + 1); /* Is the ending sequence in the receive window (if available)? */ in_recv_win = !receiver->td_maxwin || after(end, sender->td_end - receiver->td_maxwin - 1); if (!in_recv_win) return nf_tcp_log_invalid(skb, ct, hook_state, sender, NFCT_TCP_IGNORE, "SEQ is under lower bound %u (already ACKed data retransmitted)", sender->td_end - receiver->td_maxwin - 1); if (!after(sack, receiver->td_end - MAXACKWINDOW(sender) - 1)) return nf_tcp_log_invalid(skb, ct, hook_state, sender, NFCT_TCP_IGNORE, "ignored ACK under lower bound %u (possible overly delayed)", receiver->td_end - MAXACKWINDOW(sender) - 1); /* Take into account window scaling (RFC 1323). */ if (!tcph->syn) win <<= sender->td_scale; /* Update sender data. */ swin = win + (sack - ack); if (sender->td_maxwin < swin) sender->td_maxwin = swin; if (after(end, sender->td_end)) { sender->td_end = end; sender->flags |= IP_CT_TCP_FLAG_DATA_UNACKNOWLEDGED; } if (tcph->ack) { if (!(sender->flags & IP_CT_TCP_FLAG_MAXACK_SET)) { sender->td_maxack = ack; sender->flags |= IP_CT_TCP_FLAG_MAXACK_SET; } else if (after(ack, sender->td_maxack)) { sender->td_maxack = ack; } } /* Update receiver data. */ if (receiver->td_maxwin != 0 && after(end, sender->td_maxend)) receiver->td_maxwin += end - sender->td_maxend; if (after(sack + win, receiver->td_maxend - 1)) { receiver->td_maxend = sack + win; if (win == 0) receiver->td_maxend++; } if (ack == receiver->td_end) receiver->flags &= ~IP_CT_TCP_FLAG_DATA_UNACKNOWLEDGED; /* Check retransmissions. 
*/ if (index == TCP_ACK_SET) { if (state->last_dir == dir && state->last_seq == seq && state->last_ack == ack && state->last_end == end && state->last_win == win_raw) { state->retrans++; } else { state->last_dir = dir; state->last_seq = seq; state->last_ack = ack; state->last_end = end; state->last_win = win_raw; state->retrans = 0; } } return NFCT_TCP_ACCEPT; } static void __cold nf_tcp_handle_invalid(struct nf_conn *ct, enum ip_conntrack_dir dir, int index, const struct sk_buff *skb, const struct nf_hook_state *hook_state) { const unsigned int *timeouts; const struct nf_tcp_net *tn; unsigned int timeout; u32 expires; if (!test_bit(IPS_ASSURED_BIT, &ct->status) || test_bit(IPS_FIXED_TIMEOUT_BIT, &ct->status)) return; /* We don't want to have connections hanging around in ESTABLISHED * state for long time 'just because' conntrack deemed a FIN/RST * out-of-window. * * Shrink the timeout just like when there is unacked data. * This speeds up eviction of 'dead' connections where the * connection and conntracks internal state are out of sync. */ switch (index) { case TCP_RST_SET: case TCP_FIN_SET: break; default: return; } if (ct->proto.tcp.last_dir != dir && (ct->proto.tcp.last_index == TCP_FIN_SET || ct->proto.tcp.last_index == TCP_RST_SET)) { expires = nf_ct_expires(ct); if (expires < 120 * HZ) return; tn = nf_tcp_pernet(nf_ct_net(ct)); timeouts = nf_ct_timeout_lookup(ct); if (!timeouts) timeouts = tn->timeouts; timeout = READ_ONCE(timeouts[TCP_CONNTRACK_UNACK]); if (expires > timeout) { nf_ct_l4proto_log_invalid(skb, ct, hook_state, "packet (index %d, dir %d) response for index %d lower timeout to %u", index, dir, ct->proto.tcp.last_index, timeout); WRITE_ONCE(ct->timeout, timeout + nfct_time_stamp); } } else { ct->proto.tcp.last_index = index; ct->proto.tcp.last_dir = dir; } } /* table of valid flag combinations - PUSH, ECE and CWR are always valid */ static const u8 tcp_valid_flags[(TCPHDR_FIN|TCPHDR_SYN|TCPHDR_RST|TCPHDR_ACK| TCPHDR_URG) + 1] = { [TCPHDR_SYN] = 1, [TCPHDR_SYN|TCPHDR_URG] = 1, [TCPHDR_SYN|TCPHDR_ACK] = 1, [TCPHDR_RST] = 1, [TCPHDR_RST|TCPHDR_ACK] = 1, [TCPHDR_FIN|TCPHDR_ACK] = 1, [TCPHDR_FIN|TCPHDR_ACK|TCPHDR_URG] = 1, [TCPHDR_ACK] = 1, [TCPHDR_ACK|TCPHDR_URG] = 1, }; static void tcp_error_log(const struct sk_buff *skb, const struct nf_hook_state *state, const char *msg) { nf_l4proto_log_invalid(skb, state, IPPROTO_TCP, "%s", msg); } /* Protect conntrack agaist broken packets. Code taken from ipt_unclean.c. */ static bool tcp_error(const struct tcphdr *th, struct sk_buff *skb, unsigned int dataoff, const struct nf_hook_state *state) { unsigned int tcplen = skb->len - dataoff; u8 tcpflags; /* Not whole TCP header or malformed packet */ if (th->doff*4 < sizeof(struct tcphdr) || tcplen < th->doff*4) { tcp_error_log(skb, state, "truncated packet"); return true; } /* Checksum invalid? Ignore. * We skip checking packets on the outgoing path * because the checksum is assumed to be correct. */ /* FIXME: Source route IP option packets --RR */ if (state->net->ct.sysctl_checksum && state->hook == NF_INET_PRE_ROUTING && nf_checksum(skb, state->hook, dataoff, IPPROTO_TCP, state->pf)) { tcp_error_log(skb, state, "bad checksum"); return true; } /* Check TCP flags. 
*/ tcpflags = (tcp_flag_byte(th) & ~(TCPHDR_ECE|TCPHDR_CWR|TCPHDR_PSH)); if (!tcp_valid_flags[tcpflags]) { tcp_error_log(skb, state, "invalid tcp flag combination"); return true; } return false; } static noinline bool tcp_new(struct nf_conn *ct, const struct sk_buff *skb, unsigned int dataoff, const struct tcphdr *th, const struct nf_hook_state *state) { enum tcp_conntrack new_state; struct net *net = nf_ct_net(ct); const struct nf_tcp_net *tn = nf_tcp_pernet(net); /* Don't need lock here: this conntrack not in circulation yet */ new_state = tcp_conntracks[0][get_conntrack_index(th)][TCP_CONNTRACK_NONE]; /* Invalid: delete conntrack */ if (new_state >= TCP_CONNTRACK_MAX) { tcp_error_log(skb, state, "invalid new"); return false; } if (new_state == TCP_CONNTRACK_SYN_SENT) { memset(&ct->proto.tcp, 0, sizeof(ct->proto.tcp)); /* SYN packet */ ct->proto.tcp.seen[0].td_end = segment_seq_plus_len(ntohl(th->seq), skb->len, dataoff, th); ct->proto.tcp.seen[0].td_maxwin = ntohs(th->window); if (ct->proto.tcp.seen[0].td_maxwin == 0) ct->proto.tcp.seen[0].td_maxwin = 1; ct->proto.tcp.seen[0].td_maxend = ct->proto.tcp.seen[0].td_end; tcp_options(skb, dataoff, th, &ct->proto.tcp.seen[0]); } else if (tn->tcp_loose == 0) { /* Don't try to pick up connections. */ return false; } else { memset(&ct->proto.tcp, 0, sizeof(ct->proto.tcp)); /* * We are in the middle of a connection, * its history is lost for us. * Let's try to use the data from the packet. */ ct->proto.tcp.seen[0].td_end = segment_seq_plus_len(ntohl(th->seq), skb->len, dataoff, th); ct->proto.tcp.seen[0].td_maxwin = ntohs(th->window); if (ct->proto.tcp.seen[0].td_maxwin == 0) ct->proto.tcp.seen[0].td_maxwin = 1; ct->proto.tcp.seen[0].td_maxend = ct->proto.tcp.seen[0].td_end + ct->proto.tcp.seen[0].td_maxwin; /* We assume SACK and liberal window checking to handle * window scaling */ ct->proto.tcp.seen[0].flags = ct->proto.tcp.seen[1].flags = IP_CT_TCP_FLAG_SACK_PERM | IP_CT_TCP_FLAG_BE_LIBERAL; } /* tcp_packet will set them */ ct->proto.tcp.last_index = TCP_NONE_SET; return true; } static bool tcp_can_early_drop(const struct nf_conn *ct) { switch (ct->proto.tcp.state) { case TCP_CONNTRACK_FIN_WAIT: case TCP_CONNTRACK_LAST_ACK: case TCP_CONNTRACK_TIME_WAIT: case TCP_CONNTRACK_CLOSE: case TCP_CONNTRACK_CLOSE_WAIT: return true; default: break; } return false; } void nf_conntrack_tcp_set_closing(struct nf_conn *ct) { enum tcp_conntrack old_state; const unsigned int *timeouts; u32 timeout; if (!nf_ct_is_confirmed(ct)) return; spin_lock_bh(&ct->lock); old_state = ct->proto.tcp.state; ct->proto.tcp.state = TCP_CONNTRACK_CLOSE; if (old_state == TCP_CONNTRACK_CLOSE || test_bit(IPS_FIXED_TIMEOUT_BIT, &ct->status)) { spin_unlock_bh(&ct->lock); return; } timeouts = nf_ct_timeout_lookup(ct); if (!timeouts) { const struct nf_tcp_net *tn; tn = nf_tcp_pernet(nf_ct_net(ct)); timeouts = tn->timeouts; } timeout = timeouts[TCP_CONNTRACK_CLOSE]; WRITE_ONCE(ct->timeout, timeout + nfct_time_stamp); spin_unlock_bh(&ct->lock); nf_conntrack_event_cache(IPCT_PROTOINFO, ct); } static void nf_ct_tcp_state_reset(struct ip_ct_tcp_state *state) { state->td_end = 0; state->td_maxend = 0; state->td_maxwin = 0; state->td_maxack = 0; state->td_scale = 0; state->flags &= IP_CT_TCP_FLAG_BE_LIBERAL; } /* Returns verdict for packet, or -1 for invalid. 
*/ int nf_conntrack_tcp_packet(struct nf_conn *ct, struct sk_buff *skb, unsigned int dataoff, enum ip_conntrack_info ctinfo, const struct nf_hook_state *state) { struct net *net = nf_ct_net(ct); struct nf_tcp_net *tn = nf_tcp_pernet(net); enum tcp_conntrack new_state, old_state; unsigned int index, *timeouts; enum nf_ct_tcp_action res; enum ip_conntrack_dir dir; const struct tcphdr *th; struct tcphdr _tcph; unsigned long timeout; th = skb_header_pointer(skb, dataoff, sizeof(_tcph), &_tcph); if (th == NULL) return -NF_ACCEPT; if (tcp_error(th, skb, dataoff, state)) return -NF_ACCEPT; if (!nf_ct_is_confirmed(ct) && !tcp_new(ct, skb, dataoff, th, state)) return -NF_ACCEPT; spin_lock_bh(&ct->lock); old_state = ct->proto.tcp.state; dir = CTINFO2DIR(ctinfo); index = get_conntrack_index(th); new_state = tcp_conntracks[dir][index][old_state]; switch (new_state) { case TCP_CONNTRACK_SYN_SENT: if (old_state < TCP_CONNTRACK_TIME_WAIT) break; /* RFC 1122: "When a connection is closed actively, * it MUST linger in TIME-WAIT state for a time 2xMSL * (Maximum Segment Lifetime). However, it MAY accept * a new SYN from the remote TCP to reopen the connection * directly from TIME-WAIT state, if..." * We ignore the conditions because we are in the * TIME-WAIT state anyway. * * Handle aborted connections: we and the server * think there is an existing connection but the client * aborts it and starts a new one. */ if (((ct->proto.tcp.seen[dir].flags | ct->proto.tcp.seen[!dir].flags) & IP_CT_TCP_FLAG_CLOSE_INIT) || (ct->proto.tcp.last_dir == dir && ct->proto.tcp.last_index == TCP_RST_SET)) { /* Attempt to reopen a closed/aborted connection. * Delete this connection and look up again. */ spin_unlock_bh(&ct->lock); /* Only repeat if we can actually remove the timer. * Destruction may already be in progress in process * context and we must give it a chance to terminate. */ if (nf_ct_kill(ct)) return -NF_REPEAT; return NF_DROP; } fallthrough; case TCP_CONNTRACK_IGNORE: /* Ignored packets: * * Our connection entry may be out of sync, so ignore * packets which may signal the real connection between * the client and the server. * * a) SYN in ORIGINAL * b) SYN/ACK in REPLY * c) ACK in reply direction after initial SYN in original. * * If the ignored packet is invalid, the receiver will send * a RST we'll catch below. */ if (index == TCP_SYNACK_SET && ct->proto.tcp.last_index == TCP_SYN_SET && ct->proto.tcp.last_dir != dir && ntohl(th->ack_seq) == ct->proto.tcp.last_end) { /* b) This SYN/ACK acknowledges a SYN that we earlier * ignored as invalid. This means that the client and * the server are both in sync, while the firewall is * not. We get in sync from the previously annotated * values. */ old_state = TCP_CONNTRACK_SYN_SENT; new_state = TCP_CONNTRACK_SYN_RECV; ct->proto.tcp.seen[ct->proto.tcp.last_dir].td_end = ct->proto.tcp.last_end; ct->proto.tcp.seen[ct->proto.tcp.last_dir].td_maxend = ct->proto.tcp.last_end; ct->proto.tcp.seen[ct->proto.tcp.last_dir].td_maxwin = ct->proto.tcp.last_win == 0 ? 
1 : ct->proto.tcp.last_win; ct->proto.tcp.seen[ct->proto.tcp.last_dir].td_scale = ct->proto.tcp.last_wscale; ct->proto.tcp.last_flags &= ~IP_CT_EXP_CHALLENGE_ACK; ct->proto.tcp.seen[ct->proto.tcp.last_dir].flags = ct->proto.tcp.last_flags; nf_ct_tcp_state_reset(&ct->proto.tcp.seen[dir]); break; } ct->proto.tcp.last_index = index; ct->proto.tcp.last_dir = dir; ct->proto.tcp.last_seq = ntohl(th->seq); ct->proto.tcp.last_end = segment_seq_plus_len(ntohl(th->seq), skb->len, dataoff, th); ct->proto.tcp.last_win = ntohs(th->window); /* a) This is a SYN in ORIGINAL. The client and the server * may be in sync but we are not. In that case, we annotate * the TCP options and let the packet go through. If it is a * valid SYN packet, the server will reply with a SYN/ACK, and * then we'll get in sync. Otherwise, the server potentially * responds with a challenge ACK if implementing RFC5961. */ if (index == TCP_SYN_SET && dir == IP_CT_DIR_ORIGINAL) { struct ip_ct_tcp_state seen = {}; ct->proto.tcp.last_flags = ct->proto.tcp.last_wscale = 0; tcp_options(skb, dataoff, th, &seen); if (seen.flags & IP_CT_TCP_FLAG_WINDOW_SCALE) { ct->proto.tcp.last_flags |= IP_CT_TCP_FLAG_WINDOW_SCALE; ct->proto.tcp.last_wscale = seen.td_scale; } if (seen.flags & IP_CT_TCP_FLAG_SACK_PERM) { ct->proto.tcp.last_flags |= IP_CT_TCP_FLAG_SACK_PERM; } /* Mark the potential for RFC5961 challenge ACK, * this pose a special problem for LAST_ACK state * as ACK is intrepretated as ACKing last FIN. */ if (old_state == TCP_CONNTRACK_LAST_ACK) ct->proto.tcp.last_flags |= IP_CT_EXP_CHALLENGE_ACK; } /* possible challenge ack reply to syn */ if (old_state == TCP_CONNTRACK_SYN_SENT && index == TCP_ACK_SET && dir == IP_CT_DIR_REPLY) ct->proto.tcp.last_ack = ntohl(th->ack_seq); spin_unlock_bh(&ct->lock); nf_ct_l4proto_log_invalid(skb, ct, state, "packet (index %d) in dir %d ignored, state %s", index, dir, tcp_conntrack_names[old_state]); return NF_ACCEPT; case TCP_CONNTRACK_MAX: /* Special case for SYN proxy: when the SYN to the server or * the SYN/ACK from the server is lost, the client may transmit * a keep-alive packet while in SYN_SENT state. This needs to * be associated with the original conntrack entry in order to * generate a new SYN with the correct sequence number. */ if (nfct_synproxy(ct) && old_state == TCP_CONNTRACK_SYN_SENT && index == TCP_ACK_SET && dir == IP_CT_DIR_ORIGINAL && ct->proto.tcp.last_dir == IP_CT_DIR_ORIGINAL && ct->proto.tcp.seen[dir].td_end - 1 == ntohl(th->seq)) { pr_debug("nf_ct_tcp: SYN proxy client keep alive\n"); spin_unlock_bh(&ct->lock); return NF_ACCEPT; } /* Invalid packet */ spin_unlock_bh(&ct->lock); nf_ct_l4proto_log_invalid(skb, ct, state, "packet (index %d) in dir %d invalid, state %s", index, dir, tcp_conntrack_names[old_state]); return -NF_ACCEPT; case TCP_CONNTRACK_TIME_WAIT: /* RFC5961 compliance cause stack to send "challenge-ACK" * e.g. in response to spurious SYNs. Conntrack MUST * not believe this ACK is acking last FIN. */ if (old_state == TCP_CONNTRACK_LAST_ACK && index == TCP_ACK_SET && ct->proto.tcp.last_dir != dir && ct->proto.tcp.last_index == TCP_SYN_SET && (ct->proto.tcp.last_flags & IP_CT_EXP_CHALLENGE_ACK)) { /* Detected RFC5961 challenge ACK */ ct->proto.tcp.last_flags &= ~IP_CT_EXP_CHALLENGE_ACK; spin_unlock_bh(&ct->lock); nf_ct_l4proto_log_invalid(skb, ct, state, "challenge-ack ignored"); return NF_ACCEPT; /* Don't change state */ } break; case TCP_CONNTRACK_SYN_SENT2: /* tcp_conntracks table is not smart enough to handle * simultaneous open. 
*/ ct->proto.tcp.last_flags |= IP_CT_TCP_SIMULTANEOUS_OPEN; break; case TCP_CONNTRACK_SYN_RECV: if (dir == IP_CT_DIR_REPLY && index == TCP_ACK_SET && ct->proto.tcp.last_flags & IP_CT_TCP_SIMULTANEOUS_OPEN) new_state = TCP_CONNTRACK_ESTABLISHED; break; case TCP_CONNTRACK_CLOSE: if (index != TCP_RST_SET) break; /* If we are closing, tuple might have been re-used already. * last_index, last_ack, and all other ct fields used for * sequence/window validation are outdated in that case. * * As the conntrack can already be expired by GC under pressure, * just skip validation checks. */ if (tcp_can_early_drop(ct)) goto in_window; /* td_maxack might be outdated if we let a SYN through earlier */ if ((ct->proto.tcp.seen[!dir].flags & IP_CT_TCP_FLAG_MAXACK_SET) && ct->proto.tcp.last_index != TCP_SYN_SET) { u32 seq = ntohl(th->seq); /* If we are not in established state and SEQ=0 this is most * likely an answer to a SYN we let go through above (last_index * can be updated due to out-of-order ACKs). */ if (seq == 0 && !nf_conntrack_tcp_established(ct)) break; if (before(seq, ct->proto.tcp.seen[!dir].td_maxack) && !tn->tcp_ignore_invalid_rst) { /* Invalid RST */ spin_unlock_bh(&ct->lock); nf_ct_l4proto_log_invalid(skb, ct, state, "invalid rst"); return -NF_ACCEPT; } if (!nf_conntrack_tcp_established(ct) || seq == ct->proto.tcp.seen[!dir].td_maxack) break; /* Check if rst is part of train, such as * foo:80 > bar:4379: P, 235946583:235946602(19) ack 42 * foo:80 > bar:4379: R, 235946602:235946602(0) ack 42 */ if (ct->proto.tcp.last_index == TCP_ACK_SET && ct->proto.tcp.last_dir == dir && seq == ct->proto.tcp.last_end) break; /* ... RST sequence number doesn't match exactly, keep * established state to allow a possible challenge ACK. */ new_state = old_state; } if (((test_bit(IPS_SEEN_REPLY_BIT, &ct->status) && ct->proto.tcp.last_index == TCP_SYN_SET) || (!test_bit(IPS_ASSURED_BIT, &ct->status) && ct->proto.tcp.last_index == TCP_ACK_SET)) && ntohl(th->ack_seq) == ct->proto.tcp.last_end) { /* RST sent to invalid SYN or ACK we had let through * at a) and c) above: * * a) SYN was in window then * c) we hold a half-open connection. * * Delete our connection entry. * We skip window checking, because packet might ACK * segments we ignored. */ goto in_window; } /* Reset in response to a challenge-ack we let through earlier */ if (old_state == TCP_CONNTRACK_SYN_SENT && ct->proto.tcp.last_index == TCP_ACK_SET && ct->proto.tcp.last_dir == IP_CT_DIR_REPLY && ntohl(th->seq) == ct->proto.tcp.last_ack) goto in_window; break; default: /* Keep compilers happy. 
*/ break; } res = tcp_in_window(ct, dir, index, skb, dataoff, th, state); switch (res) { case NFCT_TCP_IGNORE: spin_unlock_bh(&ct->lock); return NF_ACCEPT; case NFCT_TCP_INVALID: nf_tcp_handle_invalid(ct, dir, index, skb, state); spin_unlock_bh(&ct->lock); return -NF_ACCEPT; case NFCT_TCP_ACCEPT: break; } in_window: /* From now on we have got in-window packets */ ct->proto.tcp.last_index = index; ct->proto.tcp.last_dir = dir; ct->proto.tcp.state = new_state; if (old_state != new_state && new_state == TCP_CONNTRACK_FIN_WAIT) ct->proto.tcp.seen[dir].flags |= IP_CT_TCP_FLAG_CLOSE_INIT; timeouts = nf_ct_timeout_lookup(ct); if (!timeouts) timeouts = tn->timeouts; if (ct->proto.tcp.retrans >= tn->tcp_max_retrans && timeouts[new_state] > timeouts[TCP_CONNTRACK_RETRANS]) timeout = timeouts[TCP_CONNTRACK_RETRANS]; else if (unlikely(index == TCP_RST_SET)) timeout = timeouts[TCP_CONNTRACK_CLOSE]; else if ((ct->proto.tcp.seen[0].flags | ct->proto.tcp.seen[1].flags) & IP_CT_TCP_FLAG_DATA_UNACKNOWLEDGED && timeouts[new_state] > timeouts[TCP_CONNTRACK_UNACK]) timeout = timeouts[TCP_CONNTRACK_UNACK]; else if (ct->proto.tcp.last_win == 0 && timeouts[new_state] > timeouts[TCP_CONNTRACK_RETRANS]) timeout = timeouts[TCP_CONNTRACK_RETRANS]; else timeout = timeouts[new_state]; spin_unlock_bh(&ct->lock); if (new_state != old_state) nf_conntrack_event_cache(IPCT_PROTOINFO, ct); if (!test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) { /* If only reply is a RST, we can consider ourselves not to have an established connection: this is a fairly common problem case, so we can delete the conntrack immediately. --RR */ if (th->rst) { nf_ct_kill_acct(ct, ctinfo, skb); return NF_ACCEPT; } if (index == TCP_SYN_SET && old_state == TCP_CONNTRACK_SYN_SENT) { /* do not renew timeout on SYN retransmit. * * Else port reuse by client or NAT middlebox can keep * entry alive indefinitely (including nat info). */ return NF_ACCEPT; } /* ESTABLISHED without SEEN_REPLY, i.e. mid-connection * pickup with loose=1. Avoid large ESTABLISHED timeout. */ if (new_state == TCP_CONNTRACK_ESTABLISHED && timeout > timeouts[TCP_CONNTRACK_UNACK]) timeout = timeouts[TCP_CONNTRACK_UNACK]; } else if (!test_bit(IPS_ASSURED_BIT, &ct->status) && (old_state == TCP_CONNTRACK_SYN_RECV || old_state == TCP_CONNTRACK_ESTABLISHED) && new_state == TCP_CONNTRACK_ESTABLISHED) { /* Set ASSURED if we see valid ack in ESTABLISHED after SYN_RECV or a valid answer for a picked up connection. 
*/ set_bit(IPS_ASSURED_BIT, &ct->status); nf_conntrack_event_cache(IPCT_ASSURED, ct); } nf_ct_refresh_acct(ct, ctinfo, skb, timeout); return NF_ACCEPT; } #if IS_ENABLED(CONFIG_NF_CT_NETLINK) #include <linux/netfilter/nfnetlink.h> #include <linux/netfilter/nfnetlink_conntrack.h> static int tcp_to_nlattr(struct sk_buff *skb, struct nlattr *nla, struct nf_conn *ct, bool destroy) { struct nlattr *nest_parms; struct nf_ct_tcp_flags tmp = {}; spin_lock_bh(&ct->lock); nest_parms = nla_nest_start(skb, CTA_PROTOINFO_TCP); if (!nest_parms) goto nla_put_failure; if (nla_put_u8(skb, CTA_PROTOINFO_TCP_STATE, ct->proto.tcp.state)) goto nla_put_failure; if (destroy) goto skip_state; if (nla_put_u8(skb, CTA_PROTOINFO_TCP_WSCALE_ORIGINAL, ct->proto.tcp.seen[0].td_scale) || nla_put_u8(skb, CTA_PROTOINFO_TCP_WSCALE_REPLY, ct->proto.tcp.seen[1].td_scale)) goto nla_put_failure; tmp.flags = ct->proto.tcp.seen[0].flags; if (nla_put(skb, CTA_PROTOINFO_TCP_FLAGS_ORIGINAL, sizeof(struct nf_ct_tcp_flags), &tmp)) goto nla_put_failure; tmp.flags = ct->proto.tcp.seen[1].flags; if (nla_put(skb, CTA_PROTOINFO_TCP_FLAGS_REPLY, sizeof(struct nf_ct_tcp_flags), &tmp)) goto nla_put_failure; skip_state: spin_unlock_bh(&ct->lock); nla_nest_end(skb, nest_parms); return 0; nla_put_failure: spin_unlock_bh(&ct->lock); return -1; } static const struct nla_policy tcp_nla_policy[CTA_PROTOINFO_TCP_MAX+1] = { [CTA_PROTOINFO_TCP_STATE] = { .type = NLA_U8 }, [CTA_PROTOINFO_TCP_WSCALE_ORIGINAL] = { .type = NLA_U8 }, [CTA_PROTOINFO_TCP_WSCALE_REPLY] = { .type = NLA_U8 }, [CTA_PROTOINFO_TCP_FLAGS_ORIGINAL] = { .len = sizeof(struct nf_ct_tcp_flags) }, [CTA_PROTOINFO_TCP_FLAGS_REPLY] = { .len = sizeof(struct nf_ct_tcp_flags) }, }; #define TCP_NLATTR_SIZE ( \ NLA_ALIGN(NLA_HDRLEN + 1) + \ NLA_ALIGN(NLA_HDRLEN + 1) + \ NLA_ALIGN(NLA_HDRLEN + sizeof(struct nf_ct_tcp_flags)) + \ NLA_ALIGN(NLA_HDRLEN + sizeof(struct nf_ct_tcp_flags))) static int nlattr_to_tcp(struct nlattr *cda[], struct nf_conn *ct) { struct nlattr *pattr = cda[CTA_PROTOINFO_TCP]; struct nlattr *tb[CTA_PROTOINFO_TCP_MAX+1]; int err; /* updates could not contain anything about the private * protocol info, in that case skip the parsing */ if (!pattr) return 0; err = nla_parse_nested_deprecated(tb, CTA_PROTOINFO_TCP_MAX, pattr, tcp_nla_policy, NULL); if (err < 0) return err; if (tb[CTA_PROTOINFO_TCP_STATE] && nla_get_u8(tb[CTA_PROTOINFO_TCP_STATE]) >= TCP_CONNTRACK_MAX) return -EINVAL; spin_lock_bh(&ct->lock); if (tb[CTA_PROTOINFO_TCP_STATE]) ct->proto.tcp.state = nla_get_u8(tb[CTA_PROTOINFO_TCP_STATE]); if (tb[CTA_PROTOINFO_TCP_FLAGS_ORIGINAL]) { struct nf_ct_tcp_flags *attr = nla_data(tb[CTA_PROTOINFO_TCP_FLAGS_ORIGINAL]); ct->proto.tcp.seen[0].flags &= ~attr->mask; ct->proto.tcp.seen[0].flags |= attr->flags & attr->mask; } if (tb[CTA_PROTOINFO_TCP_FLAGS_REPLY]) { struct nf_ct_tcp_flags *attr = nla_data(tb[CTA_PROTOINFO_TCP_FLAGS_REPLY]); ct->proto.tcp.seen[1].flags &= ~attr->mask; ct->proto.tcp.seen[1].flags |= attr->flags & attr->mask; } if (tb[CTA_PROTOINFO_TCP_WSCALE_ORIGINAL] && tb[CTA_PROTOINFO_TCP_WSCALE_REPLY] && ct->proto.tcp.seen[0].flags & IP_CT_TCP_FLAG_WINDOW_SCALE && ct->proto.tcp.seen[1].flags & IP_CT_TCP_FLAG_WINDOW_SCALE) { ct->proto.tcp.seen[0].td_scale = nla_get_u8(tb[CTA_PROTOINFO_TCP_WSCALE_ORIGINAL]); ct->proto.tcp.seen[1].td_scale = nla_get_u8(tb[CTA_PROTOINFO_TCP_WSCALE_REPLY]); } spin_unlock_bh(&ct->lock); return 0; } static unsigned int tcp_nlattr_tuple_size(void) { static unsigned int size __read_mostly; if (!size) size = 
nla_policy_len(nf_ct_port_nla_policy, CTA_PROTO_MAX + 1); return size; } #endif #ifdef CONFIG_NF_CONNTRACK_TIMEOUT #include <linux/netfilter/nfnetlink.h> #include <linux/netfilter/nfnetlink_cttimeout.h> static int tcp_timeout_nlattr_to_obj(struct nlattr *tb[], struct net *net, void *data) { struct nf_tcp_net *tn = nf_tcp_pernet(net); unsigned int *timeouts = data; int i; if (!timeouts) timeouts = tn->timeouts; /* set default TCP timeouts. */ for (i=0; i<TCP_CONNTRACK_TIMEOUT_MAX; i++) timeouts[i] = tn->timeouts[i]; if (tb[CTA_TIMEOUT_TCP_SYN_SENT]) { timeouts[TCP_CONNTRACK_SYN_SENT] = ntohl(nla_get_be32(tb[CTA_TIMEOUT_TCP_SYN_SENT]))*HZ; } if (tb[CTA_TIMEOUT_TCP_SYN_RECV]) { timeouts[TCP_CONNTRACK_SYN_RECV] = ntohl(nla_get_be32(tb[CTA_TIMEOUT_TCP_SYN_RECV]))*HZ; } if (tb[CTA_TIMEOUT_TCP_ESTABLISHED]) { timeouts[TCP_CONNTRACK_ESTABLISHED] = ntohl(nla_get_be32(tb[CTA_TIMEOUT_TCP_ESTABLISHED]))*HZ; } if (tb[CTA_TIMEOUT_TCP_FIN_WAIT]) { timeouts[TCP_CONNTRACK_FIN_WAIT] = ntohl(nla_get_be32(tb[CTA_TIMEOUT_TCP_FIN_WAIT]))*HZ; } if (tb[CTA_TIMEOUT_TCP_CLOSE_WAIT]) { timeouts[TCP_CONNTRACK_CLOSE_WAIT] = ntohl(nla_get_be32(tb[CTA_TIMEOUT_TCP_CLOSE_WAIT]))*HZ; } if (tb[CTA_TIMEOUT_TCP_LAST_ACK]) { timeouts[TCP_CONNTRACK_LAST_ACK] = ntohl(nla_get_be32(tb[CTA_TIMEOUT_TCP_LAST_ACK]))*HZ; } if (tb[CTA_TIMEOUT_TCP_TIME_WAIT]) { timeouts[TCP_CONNTRACK_TIME_WAIT] = ntohl(nla_get_be32(tb[CTA_TIMEOUT_TCP_TIME_WAIT]))*HZ; } if (tb[CTA_TIMEOUT_TCP_CLOSE]) { timeouts[TCP_CONNTRACK_CLOSE] = ntohl(nla_get_be32(tb[CTA_TIMEOUT_TCP_CLOSE]))*HZ; } if (tb[CTA_TIMEOUT_TCP_SYN_SENT2]) { timeouts[TCP_CONNTRACK_SYN_SENT2] = ntohl(nla_get_be32(tb[CTA_TIMEOUT_TCP_SYN_SENT2]))*HZ; } if (tb[CTA_TIMEOUT_TCP_RETRANS]) { timeouts[TCP_CONNTRACK_RETRANS] = ntohl(nla_get_be32(tb[CTA_TIMEOUT_TCP_RETRANS]))*HZ; } if (tb[CTA_TIMEOUT_TCP_UNACK]) { timeouts[TCP_CONNTRACK_UNACK] = ntohl(nla_get_be32(tb[CTA_TIMEOUT_TCP_UNACK]))*HZ; } timeouts[CTA_TIMEOUT_TCP_UNSPEC] = timeouts[CTA_TIMEOUT_TCP_SYN_SENT]; return 0; } static int tcp_timeout_obj_to_nlattr(struct sk_buff *skb, const void *data) { const unsigned int *timeouts = data; if (nla_put_be32(skb, CTA_TIMEOUT_TCP_SYN_SENT, htonl(timeouts[TCP_CONNTRACK_SYN_SENT] / HZ)) || nla_put_be32(skb, CTA_TIMEOUT_TCP_SYN_RECV, htonl(timeouts[TCP_CONNTRACK_SYN_RECV] / HZ)) || nla_put_be32(skb, CTA_TIMEOUT_TCP_ESTABLISHED, htonl(timeouts[TCP_CONNTRACK_ESTABLISHED] / HZ)) || nla_put_be32(skb, CTA_TIMEOUT_TCP_FIN_WAIT, htonl(timeouts[TCP_CONNTRACK_FIN_WAIT] / HZ)) || nla_put_be32(skb, CTA_TIMEOUT_TCP_CLOSE_WAIT, htonl(timeouts[TCP_CONNTRACK_CLOSE_WAIT] / HZ)) || nla_put_be32(skb, CTA_TIMEOUT_TCP_LAST_ACK, htonl(timeouts[TCP_CONNTRACK_LAST_ACK] / HZ)) || nla_put_be32(skb, CTA_TIMEOUT_TCP_TIME_WAIT, htonl(timeouts[TCP_CONNTRACK_TIME_WAIT] / HZ)) || nla_put_be32(skb, CTA_TIMEOUT_TCP_CLOSE, htonl(timeouts[TCP_CONNTRACK_CLOSE] / HZ)) || nla_put_be32(skb, CTA_TIMEOUT_TCP_SYN_SENT2, htonl(timeouts[TCP_CONNTRACK_SYN_SENT2] / HZ)) || nla_put_be32(skb, CTA_TIMEOUT_TCP_RETRANS, htonl(timeouts[TCP_CONNTRACK_RETRANS] / HZ)) || nla_put_be32(skb, CTA_TIMEOUT_TCP_UNACK, htonl(timeouts[TCP_CONNTRACK_UNACK] / HZ))) goto nla_put_failure; return 0; nla_put_failure: return -ENOSPC; } static const struct nla_policy tcp_timeout_nla_policy[CTA_TIMEOUT_TCP_MAX+1] = { [CTA_TIMEOUT_TCP_SYN_SENT] = { .type = NLA_U32 }, [CTA_TIMEOUT_TCP_SYN_RECV] = { .type = NLA_U32 }, [CTA_TIMEOUT_TCP_ESTABLISHED] = { .type = NLA_U32 }, [CTA_TIMEOUT_TCP_FIN_WAIT] = { .type = NLA_U32 }, [CTA_TIMEOUT_TCP_CLOSE_WAIT] = { .type = NLA_U32 }, 
[CTA_TIMEOUT_TCP_LAST_ACK] = { .type = NLA_U32 }, [CTA_TIMEOUT_TCP_TIME_WAIT] = { .type = NLA_U32 }, [CTA_TIMEOUT_TCP_CLOSE] = { .type = NLA_U32 }, [CTA_TIMEOUT_TCP_SYN_SENT2] = { .type = NLA_U32 }, [CTA_TIMEOUT_TCP_RETRANS] = { .type = NLA_U32 }, [CTA_TIMEOUT_TCP_UNACK] = { .type = NLA_U32 }, }; #endif /* CONFIG_NF_CONNTRACK_TIMEOUT */ void nf_conntrack_tcp_init_net(struct net *net) { struct nf_tcp_net *tn = nf_tcp_pernet(net); int i; for (i = 0; i < TCP_CONNTRACK_TIMEOUT_MAX; i++) tn->timeouts[i] = tcp_timeouts[i]; /* timeouts[0] is unused, make it same as SYN_SENT so * ->timeouts[0] contains 'new' timeout, like udp or icmp. */ tn->timeouts[0] = tcp_timeouts[TCP_CONNTRACK_SYN_SENT]; /* If it is set to zero, we disable picking up already established * connections. */ tn->tcp_loose = 1; /* "Be conservative in what you do, * be liberal in what you accept from others." * If it's non-zero, we mark only out of window RST segments as INVALID. */ tn->tcp_be_liberal = 0; /* If it's non-zero, we turn off RST sequence number check */ tn->tcp_ignore_invalid_rst = 0; /* Max number of the retransmitted packets without receiving an (acceptable) * ACK from the destination. If this number is reached, a shorter timer * will be started. */ tn->tcp_max_retrans = 3; #if IS_ENABLED(CONFIG_NF_FLOW_TABLE) tn->offload_timeout = 30 * HZ; #endif } const struct nf_conntrack_l4proto nf_conntrack_l4proto_tcp = { .l4proto = IPPROTO_TCP, #ifdef CONFIG_NF_CONNTRACK_PROCFS .print_conntrack = tcp_print_conntrack, #endif .can_early_drop = tcp_can_early_drop, #if IS_ENABLED(CONFIG_NF_CT_NETLINK) .to_nlattr = tcp_to_nlattr, .from_nlattr = nlattr_to_tcp, .tuple_to_nlattr = nf_ct_port_tuple_to_nlattr, .nlattr_to_tuple = nf_ct_port_nlattr_to_tuple, .nlattr_tuple_size = tcp_nlattr_tuple_size, .nlattr_size = TCP_NLATTR_SIZE, .nla_policy = nf_ct_port_nla_policy, #endif #ifdef CONFIG_NF_CONNTRACK_TIMEOUT .ctnl_timeout = { .nlattr_to_obj = tcp_timeout_nlattr_to_obj, .obj_to_nlattr = tcp_timeout_obj_to_nlattr, .nlattr_max = CTA_TIMEOUT_TCP_MAX, .obj_size = sizeof(unsigned int) * TCP_CONNTRACK_TIMEOUT_MAX, .nla_policy = tcp_timeout_nla_policy, }, #endif /* CONFIG_NF_CONNTRACK_TIMEOUT */ };
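/*
 * Illustrative userspace sketch (not from the kernel source above): it
 * mirrors the timeout-selection chain used in nf_conntrack_tcp_packet(),
 * where excessive retransmissions, an RST, unacknowledged data or a zero
 * window each clamp the per-state timeout.  All names here
 * (tcp_ct_pick_timeout, struct ct_tcp_view) and the timeout values are
 * hypothetical simplifications of the real per-netns tables.
 */
#include <stdbool.h>
#include <stdio.h>

enum { S_SYN_SENT, S_ESTABLISHED, S_CLOSE, S_RETRANS, S_UNACK, S_MAX };

struct ct_tcp_view {
	unsigned int retrans;		/* consecutive retransmissions seen */
	bool saw_rst;			/* current packet carries RST */
	bool data_unacknowledged;	/* either direction has unacked data */
	bool zero_window;		/* last window advertisement was 0 */
};

/* Simplified per-state timeouts in seconds (hypothetical values). */
static const unsigned int timeouts[S_MAX] = {
	[S_SYN_SENT]	= 120,
	[S_ESTABLISHED]	= 432000,
	[S_CLOSE]	= 10,
	[S_RETRANS]	= 300,
	[S_UNACK]	= 300,
};

static unsigned int tcp_ct_pick_timeout(const struct ct_tcp_view *v,
					int new_state,
					unsigned int max_retrans)
{
	if (v->retrans >= max_retrans &&
	    timeouts[new_state] > timeouts[S_RETRANS])
		return timeouts[S_RETRANS];
	if (v->saw_rst)
		return timeouts[S_CLOSE];
	if (v->data_unacknowledged &&
	    timeouts[new_state] > timeouts[S_UNACK])
		return timeouts[S_UNACK];
	if (v->zero_window && timeouts[new_state] > timeouts[S_RETRANS])
		return timeouts[S_RETRANS];
	return timeouts[new_state];
}

int main(void)
{
	struct ct_tcp_view v = { .retrans = 0, .data_unacknowledged = true };

	printf("established with unacked data -> %u s\n",
	       tcp_ct_pick_timeout(&v, S_ESTABLISHED, 3));
	return 0;
}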
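/*
 * Illustrative sketch (not the real table): the state transitions above are
 * driven by a dense [direction][packet-index][old-state] lookup,
 * tcp_conntracks[dir][get_conntrack_index(th)][old_state].  The toy table
 * below keeps only three states and two packet types to show the shape of
 * that lookup; it is not the kernel's state machine.
 */
#include <stdio.h>

enum toy_dir   { DIR_ORIG, DIR_REPLY, DIR_MAX };
enum toy_pkt   { PKT_SYN, PKT_SYNACK, PKT_MAX };
enum toy_state { ST_NONE, ST_SYN_SENT, ST_SYN_RECV, ST_MAX };

static const unsigned char toy_transitions[DIR_MAX][PKT_MAX][ST_MAX] = {
	[DIR_ORIG] = {
		/* an original-direction SYN starts (or restarts) the handshake */
		[PKT_SYN]    = { [ST_NONE] = ST_SYN_SENT,
				 [ST_SYN_SENT] = ST_SYN_SENT,
				 [ST_SYN_RECV] = ST_SYN_RECV },
		[PKT_SYNACK] = { [ST_NONE] = ST_NONE,
				 [ST_SYN_SENT] = ST_SYN_SENT,
				 [ST_SYN_RECV] = ST_SYN_RECV },
	},
	[DIR_REPLY] = {
		[PKT_SYN]    = { [ST_NONE] = ST_NONE,
				 [ST_SYN_SENT] = ST_SYN_SENT,
				 [ST_SYN_RECV] = ST_SYN_RECV },
		/* a SYN/ACK answering our SYN moves the entry to SYN_RECV */
		[PKT_SYNACK] = { [ST_NONE] = ST_NONE,
				 [ST_SYN_SENT] = ST_SYN_RECV,
				 [ST_SYN_RECV] = ST_SYN_RECV },
	},
};

int main(void)
{
	int next = toy_transitions[DIR_REPLY][PKT_SYNACK][ST_SYN_SENT];

	printf("SYN/ACK in reply while SYN_SENT -> state %d (SYN_RECV)\n", next);
	return 0;
}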
856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 /* * Copyright (c) 2015, Mellanox Technologies inc. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ #include "core_priv.h" #include <linux/in.h> #include <linux/in6.h> /* For in6_dev_get/in6_dev_put */ #include <net/addrconf.h> #include <net/bonding.h> #include <rdma/ib_cache.h> #include <rdma/ib_addr.h> static struct workqueue_struct *gid_cache_wq; enum gid_op_type { GID_DEL = 0, GID_ADD }; struct update_gid_event_work { struct work_struct work; union ib_gid gid; struct ib_gid_attr gid_attr; enum gid_op_type gid_op; }; #define ROCE_NETDEV_CALLBACK_SZ 3 struct netdev_event_work_cmd { roce_netdev_callback cb; roce_netdev_filter filter; struct net_device *ndev; struct net_device *filter_ndev; }; struct netdev_event_work { struct work_struct work; struct netdev_event_work_cmd cmds[ROCE_NETDEV_CALLBACK_SZ]; }; static const struct { bool (*is_supported)(const struct ib_device *device, u32 port_num); enum ib_gid_type gid_type; } PORT_CAP_TO_GID_TYPE[] = { {rdma_protocol_roce_eth_encap, IB_GID_TYPE_ROCE}, {rdma_protocol_roce_udp_encap, IB_GID_TYPE_ROCE_UDP_ENCAP}, }; #define CAP_TO_GID_TABLE_SIZE ARRAY_SIZE(PORT_CAP_TO_GID_TYPE) unsigned long roce_gid_type_mask_support(struct ib_device *ib_dev, u32 port) { int i; unsigned int ret_flags = 0; if (!rdma_protocol_roce(ib_dev, port)) return 1UL << IB_GID_TYPE_IB; for (i = 0; i < CAP_TO_GID_TABLE_SIZE; i++) if (PORT_CAP_TO_GID_TYPE[i].is_supported(ib_dev, port)) ret_flags |= 1UL << PORT_CAP_TO_GID_TYPE[i].gid_type; return ret_flags; } EXPORT_SYMBOL(roce_gid_type_mask_support); static void update_gid(enum gid_op_type gid_op, struct ib_device *ib_dev, u32 port, union ib_gid *gid, struct ib_gid_attr *gid_attr) { int i; unsigned long gid_type_mask = roce_gid_type_mask_support(ib_dev, port); for (i = 0; i < IB_GID_TYPE_SIZE; i++) { if ((1UL << i) & gid_type_mask) { gid_attr->gid_type = i; switch (gid_op) { case GID_ADD: ib_cache_gid_add(ib_dev, port, gid, 
gid_attr); break; case GID_DEL: ib_cache_gid_del(ib_dev, port, gid, gid_attr); break; } } } } enum bonding_slave_state { BONDING_SLAVE_STATE_ACTIVE = 1UL << 0, BONDING_SLAVE_STATE_INACTIVE = 1UL << 1, /* No primary slave or the device isn't a slave in bonding */ BONDING_SLAVE_STATE_NA = 1UL << 2, }; static enum bonding_slave_state is_eth_active_slave_of_bonding_rcu(struct net_device *dev, struct net_device *upper) { if (upper && netif_is_bond_master(upper)) { struct net_device *pdev = bond_option_active_slave_get_rcu(netdev_priv(upper)); if (pdev) return dev == pdev ? BONDING_SLAVE_STATE_ACTIVE : BONDING_SLAVE_STATE_INACTIVE; } return BONDING_SLAVE_STATE_NA; } #define REQUIRED_BOND_STATES (BONDING_SLAVE_STATE_ACTIVE | \ BONDING_SLAVE_STATE_NA) static bool is_eth_port_of_netdev_filter(struct ib_device *ib_dev, u32 port, struct net_device *rdma_ndev, void *cookie) { struct net_device *real_dev; bool res; if (!rdma_ndev) return false; rcu_read_lock(); real_dev = rdma_vlan_dev_real_dev(cookie); if (!real_dev) real_dev = cookie; res = ((rdma_is_upper_dev_rcu(rdma_ndev, cookie) && (is_eth_active_slave_of_bonding_rcu(rdma_ndev, real_dev) & REQUIRED_BOND_STATES)) || real_dev == rdma_ndev); rcu_read_unlock(); return res; } static bool is_eth_port_inactive_slave_filter(struct ib_device *ib_dev, u32 port, struct net_device *rdma_ndev, void *cookie) { struct net_device *master_dev; bool res; if (!rdma_ndev) return false; rcu_read_lock(); master_dev = netdev_master_upper_dev_get_rcu(rdma_ndev); res = is_eth_active_slave_of_bonding_rcu(rdma_ndev, master_dev) == BONDING_SLAVE_STATE_INACTIVE; rcu_read_unlock(); return res; } /** * is_ndev_for_default_gid_filter - Check if a given netdevice * can be considered for default GIDs or not. * @ib_dev: IB device to check * @port: Port to consider for adding default GID * @rdma_ndev: rdma netdevice pointer * @cookie: Netdevice to consider to form a default GID * * is_ndev_for_default_gid_filter() returns true if a given netdevice can be * considered for deriving default RoCE GID, returns false otherwise. */ static bool is_ndev_for_default_gid_filter(struct ib_device *ib_dev, u32 port, struct net_device *rdma_ndev, void *cookie) { struct net_device *cookie_ndev = cookie; bool res; if (!rdma_ndev) return false; rcu_read_lock(); /* * When rdma netdevice is used in bonding, bonding master netdevice * should be considered for default GIDs. Therefore, ignore slave rdma * netdevices when bonding is considered. * Additionally when event(cookie) netdevice is bond master device, * make sure that it the upper netdevice of rdma netdevice. */ res = ((cookie_ndev == rdma_ndev && !netif_is_bond_slave(rdma_ndev)) || (netif_is_bond_master(cookie_ndev) && rdma_is_upper_dev_rcu(rdma_ndev, cookie_ndev))); rcu_read_unlock(); return res; } static bool pass_all_filter(struct ib_device *ib_dev, u32 port, struct net_device *rdma_ndev, void *cookie) { return true; } static bool upper_device_filter(struct ib_device *ib_dev, u32 port, struct net_device *rdma_ndev, void *cookie) { bool res; if (!rdma_ndev) return false; if (rdma_ndev == cookie) return true; rcu_read_lock(); res = rdma_is_upper_dev_rcu(rdma_ndev, cookie); rcu_read_unlock(); return res; } /** * is_upper_ndev_bond_master_filter - Check if a given netdevice * is bond master device of netdevice of the RDMA device of port. 
* @ib_dev: IB device to check * @port: Port to consider for adding default GID * @rdma_ndev: Pointer to rdma netdevice * @cookie: Netdevice to consider to form a default GID * * is_upper_ndev_bond_master_filter() returns true if a cookie_netdev * is bond master device and rdma_ndev is its lower netdevice. It might * not have been established as slave device yet. */ static bool is_upper_ndev_bond_master_filter(struct ib_device *ib_dev, u32 port, struct net_device *rdma_ndev, void *cookie) { struct net_device *cookie_ndev = cookie; bool match = false; if (!rdma_ndev) return false; rcu_read_lock(); if (netif_is_bond_master(cookie_ndev) && rdma_is_upper_dev_rcu(rdma_ndev, cookie_ndev)) match = true; rcu_read_unlock(); return match; } static void update_gid_ip(enum gid_op_type gid_op, struct ib_device *ib_dev, u32 port, struct net_device *ndev, struct sockaddr *addr) { union ib_gid gid; struct ib_gid_attr gid_attr; rdma_ip2gid(addr, &gid); memset(&gid_attr, 0, sizeof(gid_attr)); gid_attr.ndev = ndev; update_gid(gid_op, ib_dev, port, &gid, &gid_attr); } static void bond_delete_netdev_default_gids(struct ib_device *ib_dev, u32 port, struct net_device *rdma_ndev, struct net_device *event_ndev) { struct net_device *real_dev = rdma_vlan_dev_real_dev(event_ndev); unsigned long gid_type_mask; if (!rdma_ndev) return; if (!real_dev) real_dev = event_ndev; rcu_read_lock(); if (((rdma_ndev != event_ndev && !rdma_is_upper_dev_rcu(rdma_ndev, event_ndev)) || is_eth_active_slave_of_bonding_rcu(rdma_ndev, real_dev) == BONDING_SLAVE_STATE_INACTIVE)) { rcu_read_unlock(); return; } rcu_read_unlock(); gid_type_mask = roce_gid_type_mask_support(ib_dev, port); ib_cache_gid_set_default_gid(ib_dev, port, rdma_ndev, gid_type_mask, IB_CACHE_GID_DEFAULT_MODE_DELETE); } static void enum_netdev_ipv4_ips(struct ib_device *ib_dev, u32 port, struct net_device *ndev) { const struct in_ifaddr *ifa; struct in_device *in_dev; struct sin_list { struct list_head list; struct sockaddr_in ip; }; struct sin_list *sin_iter; struct sin_list *sin_temp; LIST_HEAD(sin_list); if (ndev->reg_state >= NETREG_UNREGISTERING) return; rcu_read_lock(); in_dev = __in_dev_get_rcu(ndev); if (!in_dev) { rcu_read_unlock(); return; } in_dev_for_each_ifa_rcu(ifa, in_dev) { struct sin_list *entry = kzalloc(sizeof(*entry), GFP_ATOMIC); if (!entry) continue; entry->ip.sin_family = AF_INET; entry->ip.sin_addr.s_addr = ifa->ifa_address; list_add_tail(&entry->list, &sin_list); } rcu_read_unlock(); list_for_each_entry_safe(sin_iter, sin_temp, &sin_list, list) { update_gid_ip(GID_ADD, ib_dev, port, ndev, (struct sockaddr *)&sin_iter->ip); list_del(&sin_iter->list); kfree(sin_iter); } } static void enum_netdev_ipv6_ips(struct ib_device *ib_dev, u32 port, struct net_device *ndev) { struct inet6_ifaddr *ifp; struct inet6_dev *in6_dev; struct sin6_list { struct list_head list; struct sockaddr_in6 sin6; }; struct sin6_list *sin6_iter; struct sin6_list *sin6_temp; struct ib_gid_attr gid_attr = {.ndev = ndev}; LIST_HEAD(sin6_list); if (ndev->reg_state >= NETREG_UNREGISTERING) return; in6_dev = in6_dev_get(ndev); if (!in6_dev) return; read_lock_bh(&in6_dev->lock); list_for_each_entry(ifp, &in6_dev->addr_list, if_list) { struct sin6_list *entry = kzalloc(sizeof(*entry), GFP_ATOMIC); if (!entry) continue; entry->sin6.sin6_family = AF_INET6; entry->sin6.sin6_addr = ifp->addr; list_add_tail(&entry->list, &sin6_list); } read_unlock_bh(&in6_dev->lock); in6_dev_put(in6_dev); list_for_each_entry_safe(sin6_iter, sin6_temp, &sin6_list, list) { union ib_gid gid; rdma_ip2gid((struct 
sockaddr *)&sin6_iter->sin6, &gid); update_gid(GID_ADD, ib_dev, port, &gid, &gid_attr); list_del(&sin6_iter->list); kfree(sin6_iter); } } static void _add_netdev_ips(struct ib_device *ib_dev, u32 port, struct net_device *ndev) { enum_netdev_ipv4_ips(ib_dev, port, ndev); if (IS_ENABLED(CONFIG_IPV6)) enum_netdev_ipv6_ips(ib_dev, port, ndev); } static void add_netdev_ips(struct ib_device *ib_dev, u32 port, struct net_device *rdma_ndev, void *cookie) { _add_netdev_ips(ib_dev, port, cookie); } static void del_netdev_ips(struct ib_device *ib_dev, u32 port, struct net_device *rdma_ndev, void *cookie) { ib_cache_gid_del_all_netdev_gids(ib_dev, port, cookie); } /** * del_default_gids - Delete default GIDs of the event/cookie netdevice * @ib_dev: RDMA device pointer * @port: Port of the RDMA device whose GID table to consider * @rdma_ndev: Unused rdma netdevice * @cookie: Pointer to event netdevice * * del_default_gids() deletes the default GIDs of the event/cookie netdevice. */ static void del_default_gids(struct ib_device *ib_dev, u32 port, struct net_device *rdma_ndev, void *cookie) { struct net_device *cookie_ndev = cookie; unsigned long gid_type_mask; gid_type_mask = roce_gid_type_mask_support(ib_dev, port); ib_cache_gid_set_default_gid(ib_dev, port, cookie_ndev, gid_type_mask, IB_CACHE_GID_DEFAULT_MODE_DELETE); } static void add_default_gids(struct ib_device *ib_dev, u32 port, struct net_device *rdma_ndev, void *cookie) { struct net_device *event_ndev = cookie; unsigned long gid_type_mask; gid_type_mask = roce_gid_type_mask_support(ib_dev, port); ib_cache_gid_set_default_gid(ib_dev, port, event_ndev, gid_type_mask, IB_CACHE_GID_DEFAULT_MODE_SET); } static void enum_all_gids_of_dev_cb(struct ib_device *ib_dev, u32 port, struct net_device *rdma_ndev, void *cookie) { struct net *net; struct net_device *ndev; /* Lock the rtnl to make sure the netdevs does not move under * our feet */ rtnl_lock(); down_read(&net_rwsem); for_each_net(net) for_each_netdev(net, ndev) { /* * Filter and add default GIDs of the primary netdevice * when not in bonding mode, or add default GIDs * of bond master device, when in bonding mode. */ if (is_ndev_for_default_gid_filter(ib_dev, port, rdma_ndev, ndev)) add_default_gids(ib_dev, port, rdma_ndev, ndev); if (is_eth_port_of_netdev_filter(ib_dev, port, rdma_ndev, ndev)) _add_netdev_ips(ib_dev, port, ndev); } up_read(&net_rwsem); rtnl_unlock(); } /** * rdma_roce_rescan_device - Rescan all of the network devices in the system * and add their gids, as needed, to the relevant RoCE devices. * * @ib_dev: the rdma device */ void rdma_roce_rescan_device(struct ib_device *ib_dev) { ib_enum_roce_netdev(ib_dev, pass_all_filter, NULL, enum_all_gids_of_dev_cb, NULL); } EXPORT_SYMBOL(rdma_roce_rescan_device); /** * rdma_roce_rescan_port - Rescan all of the network devices in the system * and add their gids if relevant to the port of the RoCE device. 
* * @ib_dev: IB device * @port: Port number */ void rdma_roce_rescan_port(struct ib_device *ib_dev, u32 port) { struct net_device *ndev = NULL; if (rdma_protocol_roce(ib_dev, port)) { ndev = ib_device_get_netdev(ib_dev, port); if (!ndev) return; enum_all_gids_of_dev_cb(ib_dev, port, ndev, ndev); dev_put(ndev); } } EXPORT_SYMBOL(rdma_roce_rescan_port); static void callback_for_addr_gid_device_scan(struct ib_device *device, u32 port, struct net_device *rdma_ndev, void *cookie) { struct update_gid_event_work *parsed = cookie; return update_gid(parsed->gid_op, device, port, &parsed->gid, &parsed->gid_attr); } struct upper_list { struct list_head list; struct net_device *upper; }; static int netdev_upper_walk(struct net_device *upper, struct netdev_nested_priv *priv) { struct upper_list *entry = kmalloc(sizeof(*entry), GFP_ATOMIC); struct list_head *upper_list = (struct list_head *)priv->data; if (!entry) return 0; list_add_tail(&entry->list, upper_list); dev_hold(upper); entry->upper = upper; return 0; } static void handle_netdev_upper(struct ib_device *ib_dev, u32 port, void *cookie, void (*handle_netdev)(struct ib_device *ib_dev, u32 port, struct net_device *ndev)) { struct net_device *ndev = cookie; struct netdev_nested_priv priv; struct upper_list *upper_iter; struct upper_list *upper_temp; LIST_HEAD(upper_list); priv.data = &upper_list; rcu_read_lock(); netdev_walk_all_upper_dev_rcu(ndev, netdev_upper_walk, &priv); rcu_read_unlock(); handle_netdev(ib_dev, port, ndev); list_for_each_entry_safe(upper_iter, upper_temp, &upper_list, list) { handle_netdev(ib_dev, port, upper_iter->upper); dev_put(upper_iter->upper); list_del(&upper_iter->list); kfree(upper_iter); } } void roce_del_all_netdev_gids(struct ib_device *ib_dev, u32 port, struct net_device *ndev) { ib_cache_gid_del_all_netdev_gids(ib_dev, port, ndev); } EXPORT_SYMBOL(roce_del_all_netdev_gids); static void del_netdev_upper_ips(struct ib_device *ib_dev, u32 port, struct net_device *rdma_ndev, void *cookie) { handle_netdev_upper(ib_dev, port, cookie, roce_del_all_netdev_gids); } static void add_netdev_upper_ips(struct ib_device *ib_dev, u32 port, struct net_device *rdma_ndev, void *cookie) { handle_netdev_upper(ib_dev, port, cookie, _add_netdev_ips); } static void del_netdev_default_ips_join(struct ib_device *ib_dev, u32 port, struct net_device *rdma_ndev, void *cookie) { struct net_device *master_ndev; rcu_read_lock(); master_ndev = netdev_master_upper_dev_get_rcu(rdma_ndev); dev_hold(master_ndev); rcu_read_unlock(); if (master_ndev) { bond_delete_netdev_default_gids(ib_dev, port, rdma_ndev, master_ndev); dev_put(master_ndev); } } /* The following functions operate on all IB devices. netdevice_event and * addr_event execute ib_enum_all_roce_netdevs through a work. * ib_enum_all_roce_netdevs iterates through all IB devices. 
*/ static void netdevice_event_work_handler(struct work_struct *_work) { struct netdev_event_work *work = container_of(_work, struct netdev_event_work, work); unsigned int i; for (i = 0; i < ARRAY_SIZE(work->cmds) && work->cmds[i].cb; i++) { ib_enum_all_roce_netdevs(work->cmds[i].filter, work->cmds[i].filter_ndev, work->cmds[i].cb, work->cmds[i].ndev); dev_put(work->cmds[i].ndev); dev_put(work->cmds[i].filter_ndev); } kfree(work); } static int netdevice_queue_work(struct netdev_event_work_cmd *cmds, struct net_device *ndev) { unsigned int i; struct netdev_event_work *ndev_work = kmalloc(sizeof(*ndev_work), GFP_KERNEL); if (!ndev_work) return NOTIFY_DONE; memcpy(ndev_work->cmds, cmds, sizeof(ndev_work->cmds)); for (i = 0; i < ARRAY_SIZE(ndev_work->cmds) && ndev_work->cmds[i].cb; i++) { if (!ndev_work->cmds[i].ndev) ndev_work->cmds[i].ndev = ndev; if (!ndev_work->cmds[i].filter_ndev) ndev_work->cmds[i].filter_ndev = ndev; dev_hold(ndev_work->cmds[i].ndev); dev_hold(ndev_work->cmds[i].filter_ndev); } INIT_WORK(&ndev_work->work, netdevice_event_work_handler); queue_work(gid_cache_wq, &ndev_work->work); return NOTIFY_DONE; } static const struct netdev_event_work_cmd add_cmd = { .cb = add_netdev_ips, .filter = is_eth_port_of_netdev_filter }; static const struct netdev_event_work_cmd add_cmd_upper_ips = { .cb = add_netdev_upper_ips, .filter = is_eth_port_of_netdev_filter }; static void ndev_event_unlink(struct netdev_notifier_changeupper_info *changeupper_info, struct netdev_event_work_cmd *cmds) { static const struct netdev_event_work_cmd upper_ips_del_cmd = { .cb = del_netdev_upper_ips, .filter = upper_device_filter }; cmds[0] = upper_ips_del_cmd; cmds[0].ndev = changeupper_info->upper_dev; cmds[1] = add_cmd; } static const struct netdev_event_work_cmd bonding_default_add_cmd = { .cb = add_default_gids, .filter = is_upper_ndev_bond_master_filter }; static void ndev_event_link(struct net_device *event_ndev, struct netdev_notifier_changeupper_info *changeupper_info, struct netdev_event_work_cmd *cmds) { static const struct netdev_event_work_cmd bonding_default_del_cmd = { .cb = del_default_gids, .filter = is_upper_ndev_bond_master_filter }; /* * When a lower netdev is linked to its upper bonding * netdev, delete lower slave netdev's default GIDs. 
*/ cmds[0] = bonding_default_del_cmd; cmds[0].ndev = event_ndev; cmds[0].filter_ndev = changeupper_info->upper_dev; /* Now add bonding upper device default GIDs */ cmds[1] = bonding_default_add_cmd; cmds[1].ndev = changeupper_info->upper_dev; cmds[1].filter_ndev = changeupper_info->upper_dev; /* Now add bonding upper device IP based GIDs */ cmds[2] = add_cmd_upper_ips; cmds[2].ndev = changeupper_info->upper_dev; cmds[2].filter_ndev = changeupper_info->upper_dev; } static void netdevice_event_changeupper(struct net_device *event_ndev, struct netdev_notifier_changeupper_info *changeupper_info, struct netdev_event_work_cmd *cmds) { if (changeupper_info->linking) ndev_event_link(event_ndev, changeupper_info, cmds); else ndev_event_unlink(changeupper_info, cmds); } static const struct netdev_event_work_cmd add_default_gid_cmd = { .cb = add_default_gids, .filter = is_ndev_for_default_gid_filter, }; static int netdevice_event(struct notifier_block *this, unsigned long event, void *ptr) { static const struct netdev_event_work_cmd del_cmd = { .cb = del_netdev_ips, .filter = pass_all_filter}; static const struct netdev_event_work_cmd bonding_default_del_cmd_join = { .cb = del_netdev_default_ips_join, .filter = is_eth_port_inactive_slave_filter }; static const struct netdev_event_work_cmd netdev_del_cmd = { .cb = del_netdev_ips, .filter = is_eth_port_of_netdev_filter }; static const struct netdev_event_work_cmd bonding_event_ips_del_cmd = { .cb = del_netdev_upper_ips, .filter = upper_device_filter}; struct net_device *ndev = netdev_notifier_info_to_dev(ptr); struct netdev_event_work_cmd cmds[ROCE_NETDEV_CALLBACK_SZ] = { {NULL} }; if (ndev->type != ARPHRD_ETHER) return NOTIFY_DONE; switch (event) { case NETDEV_REGISTER: case NETDEV_UP: cmds[0] = bonding_default_del_cmd_join; cmds[1] = add_default_gid_cmd; cmds[2] = add_cmd; break; case NETDEV_UNREGISTER: if (ndev->reg_state < NETREG_UNREGISTERED) cmds[0] = del_cmd; else return NOTIFY_DONE; break; case NETDEV_CHANGEADDR: cmds[0] = netdev_del_cmd; if (ndev->reg_state == NETREG_REGISTERED) { cmds[1] = add_default_gid_cmd; cmds[2] = add_cmd; } break; case NETDEV_CHANGEUPPER: netdevice_event_changeupper(ndev, container_of(ptr, struct netdev_notifier_changeupper_info, info), cmds); break; case NETDEV_BONDING_FAILOVER: cmds[0] = bonding_event_ips_del_cmd; /* Add default GIDs of the bond device */ cmds[1] = bonding_default_add_cmd; /* Add IP based GIDs of the bond device */ cmds[2] = add_cmd_upper_ips; break; default: return NOTIFY_DONE; } return netdevice_queue_work(cmds, ndev); } static void update_gid_event_work_handler(struct work_struct *_work) { struct update_gid_event_work *work = container_of(_work, struct update_gid_event_work, work); ib_enum_all_roce_netdevs(is_eth_port_of_netdev_filter, work->gid_attr.ndev, callback_for_addr_gid_device_scan, work); dev_put(work->gid_attr.ndev); kfree(work); } static int addr_event(struct notifier_block *this, unsigned long event, struct sockaddr *sa, struct net_device *ndev) { struct update_gid_event_work *work; enum gid_op_type gid_op; if (ndev->type != ARPHRD_ETHER) return NOTIFY_DONE; switch (event) { case NETDEV_UP: gid_op = GID_ADD; break; case NETDEV_DOWN: gid_op = GID_DEL; break; default: return NOTIFY_DONE; } work = kmalloc(sizeof(*work), GFP_ATOMIC); if (!work) return NOTIFY_DONE; INIT_WORK(&work->work, update_gid_event_work_handler); rdma_ip2gid(sa, &work->gid); work->gid_op = gid_op; memset(&work->gid_attr, 0, sizeof(work->gid_attr)); dev_hold(ndev); work->gid_attr.ndev = ndev; queue_work(gid_cache_wq, 
&work->work); return NOTIFY_DONE; } static int inetaddr_event(struct notifier_block *this, unsigned long event, void *ptr) { struct sockaddr_in in; struct net_device *ndev; struct in_ifaddr *ifa = ptr; in.sin_family = AF_INET; in.sin_addr.s_addr = ifa->ifa_address; ndev = ifa->ifa_dev->dev; return addr_event(this, event, (struct sockaddr *)&in, ndev); } static int inet6addr_event(struct notifier_block *this, unsigned long event, void *ptr) { struct sockaddr_in6 in6; struct net_device *ndev; struct inet6_ifaddr *ifa6 = ptr; in6.sin6_family = AF_INET6; in6.sin6_addr = ifa6->addr; ndev = ifa6->idev->dev; return addr_event(this, event, (struct sockaddr *)&in6, ndev); } static struct notifier_block nb_netdevice = { .notifier_call = netdevice_event }; static struct notifier_block nb_inetaddr = { .notifier_call = inetaddr_event }; static struct notifier_block nb_inet6addr = { .notifier_call = inet6addr_event }; int __init roce_gid_mgmt_init(void) { gid_cache_wq = alloc_ordered_workqueue("gid-cache-wq", 0); if (!gid_cache_wq) return -ENOMEM; register_inetaddr_notifier(&nb_inetaddr); if (IS_ENABLED(CONFIG_IPV6)) register_inet6addr_notifier(&nb_inet6addr); /* We rely on the netdevice notifier to enumerate all * existing devices in the system. Register this notifier * last to make sure we will not miss any IP add/del * callbacks. */ register_netdevice_notifier(&nb_netdevice); return 0; } void __exit roce_gid_mgmt_cleanup(void) { if (IS_ENABLED(CONFIG_IPV6)) unregister_inet6addr_notifier(&nb_inet6addr); unregister_inetaddr_notifier(&nb_inetaddr); unregister_netdevice_notifier(&nb_netdevice); /* Ensure all gid deletion tasks complete before we go down, * to avoid any reference to free'd memory. By the time * ib-core is removed, all physical devices have been removed, * so no issue with remaining hardware contexts. */ destroy_workqueue(gid_cache_wq); }
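/*
 * Illustrative userspace sketch (not from roce_gid_mgmt.c above):
 * netdevice_event() fills a small fixed-size array of callback commands
 * and hands it to a workqueue, and the worker later runs each callback.
 * The toy below reproduces only that pattern; the names (toy_cmd,
 * toy_queue_work, toy_add_gids, ...) and the per-event command choices
 * are hypothetical, not the kernel's real command set.
 */
#include <stdio.h>
#include <string.h>

#define TOY_CALLBACK_SZ 3

struct toy_cmd {
	void (*cb)(const char *ndev);	/* action to run, NULL terminates */
	const char *ndev;		/* device the action applies to */
};

static void toy_del_gids(const char *ndev) { printf("del GIDs on %s\n", ndev); }
static void toy_add_gids(const char *ndev) { printf("add GIDs on %s\n", ndev); }

/* Stand-in for queue_work(): here the commands just run inline. */
static void toy_queue_work(const struct toy_cmd *cmds)
{
	for (int i = 0; i < TOY_CALLBACK_SZ && cmds[i].cb; i++)
		cmds[i].cb(cmds[i].ndev);
}

static void toy_netdevice_event(const char *event, const char *ndev)
{
	struct toy_cmd cmds[TOY_CALLBACK_SZ];

	memset(cmds, 0, sizeof(cmds));
	if (strcmp(event, "NETDEV_UP") == 0) {
		/* toy actions only; the real code picks filter+callback pairs */
		cmds[0] = (struct toy_cmd){ toy_del_gids, ndev };
		cmds[1] = (struct toy_cmd){ toy_add_gids, ndev };
	} else if (strcmp(event, "NETDEV_UNREGISTER") == 0) {
		cmds[0] = (struct toy_cmd){ toy_del_gids, ndev };
	}
	toy_queue_work(cmds);
}

int main(void)
{
	toy_netdevice_event("NETDEV_UP", "eth0");
	toy_netdevice_event("NETDEV_UNREGISTER", "eth0");
	return 0;
}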
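/*
 * Illustrative sketch (standalone, not kernel code):
 * roce_gid_type_mask_support() above returns a bitmask of the GID types a
 * port supports, and update_gid() walks every set bit, applying the
 * add/del operation once per GID type.  The toy below shows only that
 * bit-walk; the TOY_GID_* names and the supported-type mask are
 * hypothetical.
 */
#include <stdio.h>

enum toy_gid_type { TOY_GID_IB, TOY_GID_ROCE_V1, TOY_GID_ROCE_V2, TOY_GID_TYPE_SIZE };

static const char *const toy_gid_names[TOY_GID_TYPE_SIZE] = {
	"IB", "RoCE v1", "RoCE v2 (UDP encap)",
};

/* Pretend the port supports RoCE v1 and RoCE v2. */
static unsigned long toy_gid_type_mask(void)
{
	return (1UL << TOY_GID_ROCE_V1) | (1UL << TOY_GID_ROCE_V2);
}

static void toy_update_gid(const char *op, unsigned long mask)
{
	for (int i = 0; i < TOY_GID_TYPE_SIZE; i++)
		if (mask & (1UL << i))
			printf("%s GID as type %s\n", op, toy_gid_names[i]);
}

int main(void)
{
	toy_update_gid("ADD", toy_gid_type_mask());
	return 0;
}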
916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 /* SPDX-License-Identifier: GPL-2.0-only */ /* * Copyright 2002-2005, Devicescape Software, Inc. * Copyright 2013-2014 Intel Mobile Communications GmbH * Copyright(c) 2015-2017 Intel Deutschland GmbH * Copyright(c) 2020-2024 Intel Corporation */ #ifndef STA_INFO_H #define STA_INFO_H #include <linux/list.h> #include <linux/types.h> #include <linux/if_ether.h> #include <linux/workqueue.h> #include <linux/average.h> #include <linux/bitfield.h> #include <linux/etherdevice.h> #include <linux/rhashtable.h> #include <linux/u64_stats_sync.h> #include "key.h" /** * enum ieee80211_sta_info_flags - Stations flags * * These flags are used with &struct sta_info's @flags member, but * only indirectly with set_sta_flag() and friends. * * @WLAN_STA_AUTH: Station is authenticated. * @WLAN_STA_ASSOC: Station is associated. * @WLAN_STA_PS_STA: Station is in power-save mode * @WLAN_STA_AUTHORIZED: Station is authorized to send/receive traffic. * This bit is always checked so needs to be enabled for all stations * when virtual port control is not in use. * @WLAN_STA_SHORT_PREAMBLE: Station is capable of receiving short-preamble * frames. * @WLAN_STA_WDS: Station is one of our WDS peers. * @WLAN_STA_CLEAR_PS_FILT: Clear PS filter in hardware (using the * IEEE80211_TX_CTL_CLEAR_PS_FILT control flag) when the next * frame to this station is transmitted. * @WLAN_STA_MFP: Management frame protection is used with this STA. * @WLAN_STA_BLOCK_BA: Used to deny ADDBA requests (both TX and RX) * during suspend/resume and station removal. * @WLAN_STA_PS_DRIVER: driver requires keeping this station in * power-save mode logically to flush frames that might still * be in the queues * @WLAN_STA_PSPOLL: Station sent PS-poll while driver was keeping * station in power-save mode, reply when the driver unblocks. * @WLAN_STA_TDLS_PEER: Station is a TDLS peer. * @WLAN_STA_TDLS_PEER_AUTH: This TDLS peer is authorized to send direct * packets. This means the link is enabled. * @WLAN_STA_TDLS_INITIATOR: We are the initiator of the TDLS link with this * station. * @WLAN_STA_TDLS_CHAN_SWITCH: This TDLS peer supports TDLS channel-switching * @WLAN_STA_TDLS_OFF_CHANNEL: The local STA is currently off-channel with this * TDLS peer * @WLAN_STA_TDLS_WIDER_BW: This TDLS peer supports working on a wider bw on * the BSS base channel. * @WLAN_STA_UAPSD: Station requested unscheduled SP while driver was * keeping station in power-save mode, reply when the driver * unblocks the station. * @WLAN_STA_SP: Station is in a service period, so don't try to * reply to other uAPSD trigger frames or PS-Poll. * @WLAN_STA_4ADDR_EVENT: 4-addr event was already sent for this frame. * @WLAN_STA_INSERTED: This station is inserted into the hash table. 
* @WLAN_STA_RATE_CONTROL: rate control was initialized for this station. * @WLAN_STA_TOFFSET_KNOWN: toffset calculated for this station is valid. * @WLAN_STA_MPSP_OWNER: local STA is owner of a mesh Peer Service Period. * @WLAN_STA_MPSP_RECIPIENT: local STA is recipient of a MPSP. * @WLAN_STA_PS_DELIVER: station woke up, but we're still blocking TX * until pending frames are delivered * @WLAN_STA_USES_ENCRYPTION: This station was configured for encryption, * so drop all packets without a key later. * @WLAN_STA_DECAP_OFFLOAD: This station uses rx decap offload * * @NUM_WLAN_STA_FLAGS: number of defined flags */ enum ieee80211_sta_info_flags { WLAN_STA_AUTH, WLAN_STA_ASSOC, WLAN_STA_PS_STA, WLAN_STA_AUTHORIZED, WLAN_STA_SHORT_PREAMBLE, WLAN_STA_WDS, WLAN_STA_CLEAR_PS_FILT, WLAN_STA_MFP, WLAN_STA_BLOCK_BA, WLAN_STA_PS_DRIVER, WLAN_STA_PSPOLL, WLAN_STA_TDLS_PEER, WLAN_STA_TDLS_PEER_AUTH, WLAN_STA_TDLS_INITIATOR, WLAN_STA_TDLS_CHAN_SWITCH, WLAN_STA_TDLS_OFF_CHANNEL, WLAN_STA_TDLS_WIDER_BW, WLAN_STA_UAPSD, WLAN_STA_SP, WLAN_STA_4ADDR_EVENT, WLAN_STA_INSERTED, WLAN_STA_RATE_CONTROL, WLAN_STA_TOFFSET_KNOWN, WLAN_STA_MPSP_OWNER, WLAN_STA_MPSP_RECIPIENT, WLAN_STA_PS_DELIVER, WLAN_STA_USES_ENCRYPTION, WLAN_STA_DECAP_OFFLOAD, NUM_WLAN_STA_FLAGS, }; #define ADDBA_RESP_INTERVAL HZ #define HT_AGG_MAX_RETRIES 15 #define HT_AGG_BURST_RETRIES 3 #define HT_AGG_RETRIES_PERIOD (15 * HZ) #define HT_AGG_STATE_DRV_READY 0 #define HT_AGG_STATE_RESPONSE_RECEIVED 1 #define HT_AGG_STATE_OPERATIONAL 2 #define HT_AGG_STATE_STOPPING 3 #define HT_AGG_STATE_WANT_START 4 #define HT_AGG_STATE_WANT_STOP 5 #define HT_AGG_STATE_START_CB 6 #define HT_AGG_STATE_STOP_CB 7 #define HT_AGG_STATE_SENT_ADDBA 8 DECLARE_EWMA(avg_signal, 10, 8) enum ieee80211_agg_stop_reason { AGG_STOP_DECLINED, AGG_STOP_LOCAL_REQUEST, AGG_STOP_PEER_REQUEST, AGG_STOP_DESTROY_STA, }; /* Debugfs flags to enable/disable use of RX/TX airtime in scheduler */ #define AIRTIME_USE_TX BIT(0) #define AIRTIME_USE_RX BIT(1) struct airtime_info { u64 rx_airtime; u64 tx_airtime; unsigned long last_active; s32 deficit; atomic_t aql_tx_pending; /* Estimated airtime for frames pending */ u32 aql_limit_low; u32 aql_limit_high; }; void ieee80211_sta_update_pending_airtime(struct ieee80211_local *local, struct sta_info *sta, u8 ac, u16 tx_airtime, bool tx_completed); struct sta_info; /** * struct tid_ampdu_tx - TID aggregation information (Tx). * * @rcu_head: rcu head for freeing structure * @session_timer: check if we keep Tx-ing on the TID (by timeout value) * @addba_resp_timer: timer for peer's response to addba request * @pending: pending frames queue -- use sta's spinlock to protect * @sta: station we are attached to * @dialog_token: dialog token for aggregation session * @timeout: session timeout value to be filled in ADDBA requests * @tid: TID number * @state: session state (see above) * @last_tx: jiffies of last tx activity * @stop_initiator: initiator of a session stop * @tx_stop: TX DelBA frame when stopping * @buf_size: reorder buffer size at receiver * @failed_bar_ssn: ssn of the last failed BAR tx attempt * @bar_pending: BAR needs to be re-sent * @amsdu: support A-MSDU within A-MDPU * @ssn: starting sequence number of the session * * This structure's lifetime is managed by RCU, assignments to * the array holding it must hold the aggregation mutex. * * The TX path can access it under RCU lock-free if, and * only if, the state has the flag %HT_AGG_STATE_OPERATIONAL * set. 
Otherwise, the TX path must also acquire the spinlock * and re-check the state, see comments in the tx code * touching it. */ struct tid_ampdu_tx { struct rcu_head rcu_head; struct timer_list session_timer; struct timer_list addba_resp_timer; struct sk_buff_head pending; struct sta_info *sta; unsigned long state; unsigned long last_tx; u16 timeout; u8 dialog_token; u8 stop_initiator; bool tx_stop; u16 buf_size; u16 ssn; u16 failed_bar_ssn; bool bar_pending; bool amsdu; u8 tid; }; /** * struct tid_ampdu_rx - TID aggregation information (Rx). * * @reorder_buf: buffer to reorder incoming aggregated MPDUs. An MPDU may be an * A-MSDU with individually reported subframes. * @reorder_buf_filtered: bitmap indicating where there are filtered frames in * the reorder buffer that should be ignored when releasing frames * @reorder_time: jiffies when skb was added * @session_timer: check if peer keeps Tx-ing on the TID (by timeout value) * @reorder_timer: releases expired frames from the reorder buffer. * @sta: station we are attached to * @last_rx: jiffies of last rx activity * @head_seq_num: head sequence number in reordering buffer. * @stored_mpdu_num: number of MPDUs in reordering buffer * @ssn: Starting Sequence Number expected to be aggregated. * @buf_size: buffer size for incoming A-MPDUs * @timeout: reset timer value (in TUs). * @tid: TID number * @rcu_head: RCU head used for freeing this struct * @reorder_lock: serializes access to reorder buffer, see below. * @auto_seq: used for offloaded BA sessions to automatically pick head_seq_and * and ssn. * @removed: this session is removed (but might have been found due to RCU) * @started: this session has started (head ssn or higher was received) * * This structure's lifetime is managed by RCU, assignments to * the array holding it must hold the aggregation mutex. * * The @reorder_lock is used to protect the members of this * struct, except for @timeout, @buf_size and @dialog_token, * which are constant across the lifetime of the struct (the * dialog token being used only for debugging). */ struct tid_ampdu_rx { struct rcu_head rcu_head; spinlock_t reorder_lock; u64 reorder_buf_filtered; struct sk_buff_head *reorder_buf; unsigned long *reorder_time; struct sta_info *sta; struct timer_list session_timer; struct timer_list reorder_timer; unsigned long last_rx; u16 head_seq_num; u16 stored_mpdu_num; u16 ssn; u16 buf_size; u16 timeout; u8 tid; u8 auto_seq:1, removed:1, started:1; }; /** * struct sta_ampdu_mlme - STA aggregation information. * * @tid_rx: aggregation info for Rx per TID -- RCU protected * @tid_rx_token: dialog tokens for valid aggregation sessions * @tid_rx_timer_expired: bitmap indicating on which TIDs the * RX timer expired until the work for it runs * @tid_rx_stop_requested: bitmap indicating which BA sessions per TID the * driver requested to close until the work for it runs * @tid_rx_manage_offl: bitmap indicating which BA sessions were requested * to be treated as started/stopped due to offloading * @agg_session_valid: bitmap indicating which TID has a rx BA session open on * @unexpected_agg: bitmap indicating which TID already sent a delBA due to * unexpected aggregation related frames outside a session * @work: work struct for starting/stopping aggregation * @tid_tx: aggregation info for Tx per TID * @tid_start_tx: sessions where start was requested, not just protected * by wiphy mutex but also sta->lock * @last_addba_req_time: timestamp of the last addBA request. * @addba_req_num: number of times addBA request has been sent. 
* @dialog_token_allocator: dialog token enumerator for each new session; */ struct sta_ampdu_mlme { /* rx */ struct tid_ampdu_rx __rcu *tid_rx[IEEE80211_NUM_TIDS]; u8 tid_rx_token[IEEE80211_NUM_TIDS]; unsigned long tid_rx_timer_expired[BITS_TO_LONGS(IEEE80211_NUM_TIDS)]; unsigned long tid_rx_stop_requested[BITS_TO_LONGS(IEEE80211_NUM_TIDS)]; unsigned long tid_rx_manage_offl[BITS_TO_LONGS(2 * IEEE80211_NUM_TIDS)]; unsigned long agg_session_valid[BITS_TO_LONGS(IEEE80211_NUM_TIDS)]; unsigned long unexpected_agg[BITS_TO_LONGS(IEEE80211_NUM_TIDS)]; /* tx */ struct wiphy_work work; struct tid_ampdu_tx __rcu *tid_tx[IEEE80211_NUM_TIDS]; struct tid_ampdu_tx *tid_start_tx[IEEE80211_NUM_TIDS]; unsigned long last_addba_req_time[IEEE80211_NUM_TIDS]; u8 addba_req_num[IEEE80211_NUM_TIDS]; u8 dialog_token_allocator; }; /* Value to indicate no TID reservation */ #define IEEE80211_TID_UNRESERVED 0xff #define IEEE80211_FAST_XMIT_MAX_IV 18 /** * struct ieee80211_fast_tx - TX fastpath information * @key: key to use for hw crypto * @hdr: the 802.11 header to put with the frame * @hdr_len: actual 802.11 header length * @sa_offs: offset of the SA * @da_offs: offset of the DA * @pn_offs: offset where to put PN for crypto (or 0 if not needed) * @band: band this will be transmitted on, for tx_info * @rcu_head: RCU head to free this struct * * This struct is small enough so that the common case (maximum crypto * header length of 8 like for CCMP/GCMP) fits into a single 64-byte * cache line. */ struct ieee80211_fast_tx { struct ieee80211_key *key; u8 hdr_len; u8 sa_offs, da_offs, pn_offs; u8 band; u8 hdr[30 + 2 + IEEE80211_FAST_XMIT_MAX_IV + sizeof(rfc1042_header)] __aligned(2); struct rcu_head rcu_head; }; /** * struct ieee80211_fast_rx - RX fastpath information * @dev: netdevice for reporting the SKB * @vif_type: (P2P-less) interface type of the original sdata (sdata->vif.type) * @vif_addr: interface address * @rfc1042_hdr: copy of the RFC 1042 SNAP header (to have in cache) * @control_port_protocol: control port protocol copied from sdata * @expected_ds_bits: from/to DS bits expected * @icv_len: length of the MIC if present * @key: bool indicating encryption is expected (key is set) * @internal_forward: forward froms internally on AP/VLAN type interfaces * @uses_rss: copy of USES_RSS hw flag * @da_offs: offset of the DA in the header (for header conversion) * @sa_offs: offset of the SA in the header (for header conversion) * @rcu_head: RCU head for freeing this structure */ struct ieee80211_fast_rx { struct net_device *dev; enum nl80211_iftype vif_type; u8 vif_addr[ETH_ALEN] __aligned(2); u8 rfc1042_hdr[6] __aligned(2); __be16 control_port_protocol; __le16 expected_ds_bits; u8 icv_len; u8 key:1, internal_forward:1, uses_rss:1; u8 da_offs, sa_offs; struct rcu_head rcu_head; }; /* we use only values in the range 0-100, so pick a large precision */ DECLARE_EWMA(mesh_fail_avg, 20, 8) DECLARE_EWMA(mesh_tx_rate_avg, 8, 16) /** * struct mesh_sta - mesh STA information * @plink_lock: serialize access to plink fields * @llid: Local link ID * @plid: Peer link ID * @aid: local aid supplied by peer * @reason: Cancel reason on PLINK_HOLDING state * @plink_retries: Retries in establishment * @plink_state: peer link state * @plink_timeout: timeout of peer link * @plink_timer: peer link watch timer * @plink_sta: peer link watch timer's sta_info * @t_offset: timing offset relative to this host * @t_offset_setpoint: reference timing offset of this sta to be used when * calculating clockdrift * @local_pm: local link-specific power 
save mode * @peer_pm: peer-specific power save mode towards local STA * @nonpeer_pm: STA power save mode towards non-peer neighbors * @processed_beacon: set to true after peer rates and capabilities are * processed * @connected_to_gate: true if mesh STA has a path to a mesh gate * @connected_to_as: true if mesh STA has a path to a authentication server * @fail_avg: moving percentage of failed MSDUs * @tx_rate_avg: moving average of tx bitrate */ struct mesh_sta { struct timer_list plink_timer; struct sta_info *plink_sta; s64 t_offset; s64 t_offset_setpoint; spinlock_t plink_lock; u16 llid; u16 plid; u16 aid; u16 reason; u8 plink_retries; bool processed_beacon; bool connected_to_gate; bool connected_to_as; enum nl80211_plink_state plink_state; u32 plink_timeout; /* mesh power save */ enum nl80211_mesh_power_mode local_pm; enum nl80211_mesh_power_mode peer_pm; enum nl80211_mesh_power_mode nonpeer_pm; /* moving percentage of failed MSDUs */ struct ewma_mesh_fail_avg fail_avg; /* moving average of tx bitrate */ struct ewma_mesh_tx_rate_avg tx_rate_avg; }; DECLARE_EWMA(signal, 10, 8) struct ieee80211_sta_rx_stats { unsigned long packets; unsigned long last_rx; unsigned long num_duplicates; unsigned long fragments; unsigned long dropped; int last_signal; u8 chains; s8 chain_signal_last[IEEE80211_MAX_CHAINS]; u32 last_rate; struct u64_stats_sync syncp; u64 bytes; u64 msdu[IEEE80211_NUM_TIDS + 1]; }; /* * IEEE 802.11-2016 (10.6 "Defragmentation") recommends support for "concurrent * reception of at least one MSDU per access category per associated STA" * on APs, or "at least one MSDU per access category" on other interface types. * * This limit can be increased by changing this define, at the cost of slower * frame reassembly and increased memory use while fragments are pending. */ #define IEEE80211_FRAGMENT_MAX 4 struct ieee80211_fragment_entry { struct sk_buff_head skb_list; unsigned long first_frag_time; u16 seq; u16 extra_len; u16 last_frag; u8 rx_queue; u8 check_sequential_pn:1, /* needed for CCMP/GCMP */ is_protected:1; u8 last_pn[6]; /* PN of the last fragment if CCMP was used */ unsigned int key_color; }; struct ieee80211_fragment_cache { struct ieee80211_fragment_entry entries[IEEE80211_FRAGMENT_MAX]; unsigned int next; }; /** * struct link_sta_info - Link STA information * All link specific sta info are stored here for reference. This can be * a single entry for non-MLD STA or multiple entries for MLD STA * @addr: Link MAC address - Can be same as MLD STA mac address and is always * same for non-MLD STA. This is used as key for searching link STA * @link_id: Link ID uniquely identifying the link STA. 
This is 0 for non-MLD * and set to the corresponding vif LinkId for MLD STA * @op_mode_nss: NSS limit as set by operating mode notification, or 0 * @capa_nss: NSS limit as determined by local and peer capabilities * @link_hash_node: hash node for rhashtable * @sta: Points to the STA info * @gtk: group keys negotiated with this station, if any * @tx_stats: TX statistics * @tx_stats.packets: # of packets transmitted * @tx_stats.bytes: # of bytes in all packets transmitted * @tx_stats.last_rate: last TX rate * @tx_stats.msdu: # of transmitted MSDUs per TID * @rx_stats: RX statistics * @rx_stats_avg: averaged RX statistics * @rx_stats_avg.signal: averaged signal * @rx_stats_avg.chain_signal: averaged per-chain signal * @pcpu_rx_stats: per-CPU RX statistics, assigned only if the driver needs * this (by advertising the USES_RSS hw flag) * @status_stats: TX status statistics * @status_stats.filtered: # of filtered frames * @status_stats.retry_failed: # of frames that failed after retry * @status_stats.retry_count: # of retries attempted * @status_stats.lost_packets: # of lost packets * @status_stats.last_pkt_time: timestamp of last ACKed packet * @status_stats.msdu_retries: # of MSDU retries * @status_stats.msdu_failed: # of failed MSDUs * @status_stats.last_ack: last ack timestamp (jiffies) * @status_stats.last_ack_signal: last ACK signal * @status_stats.ack_signal_filled: last ACK signal validity * @status_stats.avg_ack_signal: average ACK signal * @cur_max_bandwidth: maximum bandwidth to use for TX to the station, * taken from HT/VHT capabilities or VHT operating mode notification * @rx_omi_bw_rx: RX OMI bandwidth restriction to apply for RX * @rx_omi_bw_tx: RX OMI bandwidth restriction to apply for TX * @rx_omi_bw_staging: RX OMI bandwidth restriction to apply later * during finalize * @debugfs_dir: debug filesystem directory dentry * @pub: public (driver visible) link STA data * TODO Move other link params from sta_info as required for MLD operation */ struct link_sta_info { u8 addr[ETH_ALEN]; u8 link_id; u8 op_mode_nss, capa_nss; struct rhlist_head link_hash_node; struct sta_info *sta; struct ieee80211_key __rcu *gtk[NUM_DEFAULT_KEYS + NUM_DEFAULT_MGMT_KEYS + NUM_DEFAULT_BEACON_KEYS]; struct ieee80211_sta_rx_stats __percpu *pcpu_rx_stats; /* Updated from RX path only, no locking requirements */ struct ieee80211_sta_rx_stats rx_stats; struct { struct ewma_signal signal; struct ewma_signal chain_signal[IEEE80211_MAX_CHAINS]; } rx_stats_avg; /* Updated from TX status path only, no locking requirements */ struct { unsigned long filtered; unsigned long retry_failed, retry_count; unsigned int lost_packets; unsigned long last_pkt_time; u64 msdu_retries[IEEE80211_NUM_TIDS + 1]; u64 msdu_failed[IEEE80211_NUM_TIDS + 1]; unsigned long last_ack; s8 last_ack_signal; bool ack_signal_filled; struct ewma_avg_signal avg_ack_signal; } status_stats; /* Updated from TX path only, no locking requirements */ struct { u64 packets[IEEE80211_NUM_ACS]; u64 bytes[IEEE80211_NUM_ACS]; struct ieee80211_tx_rate last_rate; struct rate_info last_rate_info; u64 msdu[IEEE80211_NUM_TIDS + 1]; } tx_stats; enum ieee80211_sta_rx_bandwidth cur_max_bandwidth; enum ieee80211_sta_rx_bandwidth rx_omi_bw_rx, rx_omi_bw_tx, rx_omi_bw_staging; #ifdef CONFIG_MAC80211_DEBUGFS struct dentry *debugfs_dir; #endif struct ieee80211_link_sta *pub; }; /** * struct ieee80211_sta_removed_link_stats - Removed link sta data * * keep required accumulated removed link data for stats * * @rx_packets: accumulated packets (MSDUs & MMPDUs) received from * 
this station for removed links * @tx_packets: accumulated packets (MSDUs & MMPDUs) transmitted to * this station for removed links * @rx_bytes: accumulated bytes (size of MPDUs) received from this * station for removed links * @tx_bytes: accumulated bytes (size of MPDUs) transmitted to this * station for removed links * @tx_retries: cumulative retry counts (MPDUs) for removed links * @tx_failed: accumulated number of failed transmissions (MPDUs) * (retries exceeded, no ACK) for removed links * @rx_dropped_misc: accumulated dropped packets for un-specified reason * from this station for removed links * @beacon_loss_count: Number of times beacon loss event has triggered * from this station for removed links. * @expected_throughput: expected throughput in kbps (including 802.11 * headers) towards this station for removed links * @pertid_stats: accumulated per-TID statistics for removed link of * station * @pertid_stats.rx_msdu : accumulated number of received MSDUs towards * this station for removed links. * @pertid_stats.tx_msdu: accumulated number of (attempted) transmitted * MSDUs towards this station for removed links * @pertid_stats.tx_msdu_retries: accumulated number of retries (not * counting the first) for transmitted MSDUs towards this station * for removed links * @pertid_stats.tx_msdu_failed: accumulated number of failed transmitted * MSDUs towards this station for removed links */ struct ieee80211_sta_removed_link_stats { u32 rx_packets; u32 tx_packets; u64 rx_bytes; u64 tx_bytes; u32 tx_retries; u32 tx_failed; u32 rx_dropped_misc; u32 beacon_loss_count; u32 expected_throughput; struct { u64 rx_msdu; u64 tx_msdu; u64 tx_msdu_retries; u64 tx_msdu_failed; } pertid_stats; }; /** * struct sta_info - STA information * * This structure collects information about a station that * mac80211 is communicating with. * * @list: global linked list entry * @free_list: list entry for keeping track of stations to free * @hash_node: hash node for rhashtable * @addr: station's MAC address - duplicated from public part to * let the hash table work with just a single cacheline * @local: pointer to the global information * @sdata: virtual interface this station belongs to * @ptk: peer keys negotiated with this station, if any * @ptk_idx: last installed peer key index * @rate_ctrl: rate control algorithm reference * @rate_ctrl_lock: spinlock used to protect rate control data * (data inside the algorithm, so serializes calls there) * @rate_ctrl_priv: rate control private per-STA pointer * @lock: used for locking all fields that require locking, see comments * in the header file. 
* @drv_deliver_wk: used for delivering frames after driver PS unblocking * @listen_interval: listen interval of this station, when we're acting as AP * @_flags: STA flags, see &enum ieee80211_sta_info_flags, do not use directly * @ps_lock: used for powersave (when mac80211 is the AP) related locking * @ps_tx_buf: buffers (per AC) of frames to transmit to this station * when it leaves power saving state or polls * @tx_filtered: buffers (per AC) of frames we already tried to * transmit but were filtered by hardware due to STA having * entered power saving state, these are also delivered to * the station when it leaves powersave or polls for frames * @driver_buffered_tids: bitmap of TIDs the driver has data buffered on * @txq_buffered_tids: bitmap of TIDs that mac80211 has txq data buffered on * @assoc_at: clock boottime (in ns) of last association * @last_connected: time (in seconds) when a station got connected * @last_seq_ctrl: last received seq/frag number from this STA (per TID * plus one for non-QoS frames) * @tid_seq: per-TID sequence numbers for sending to this STA * @airtime: per-AC struct airtime_info describing airtime statistics for this * station * @airtime_weight: station weight for airtime fairness calculation purposes * @ampdu_mlme: A-MPDU state machine state * @mesh: mesh STA information * @debugfs_dir: debug filesystem directory dentry * @dead: set to true when sta is unlinked * @removed: set to true when sta is being removed from sta_list * @uploaded: set to true when sta is uploaded to the driver * @sta: station information we share with the driver * @sta_state: duplicates information about station state (for debug) * @rcu_head: RCU head used for freeing this station struct * @reserved_tid: reserved TID (if any, otherwise IEEE80211_TID_UNRESERVED) * @amsdu_mesh_control: track the mesh A-MSDU format used by the peer: * * * -1: not yet known * * 0: non-mesh A-MSDU length field * * 1: big-endian mesh A-MSDU length field * * 2: little-endian mesh A-MSDU length field * * @fast_tx: TX fastpath information * @fast_rx: RX fastpath information * @tdls_chandef: a TDLS peer can have a wider chandef that is compatible to * the BSS one. * @frags: fragment cache * @cur: storage for aggregation data * &struct ieee80211_sta points either here or to deflink.agg. * @deflink: This is the default link STA information, for non MLO STA all link * specific STA information is accessed through @deflink or through * link[0] which points to address of @deflink. For MLO Link STA * the first added link STA will point to deflink. * @link: reference to Link Sta entries. For Non MLO STA, except 1st link, * i.e link[0] all links would be assigned to NULL by default and * would access link information via @deflink or link[0]. For MLO * STA, first link STA being added will point its link pointer to * @deflink address and remaining would be allocated and the address * would be assigned to link[link_id] where link_id is the id assigned * by the AP. 
* @rem_link_stats: accumulated removed link stats */ struct sta_info { /* General information, mostly static */ struct list_head list, free_list; struct rcu_head rcu_head; struct rhlist_head hash_node; u8 addr[ETH_ALEN]; struct ieee80211_local *local; struct ieee80211_sub_if_data *sdata; struct ieee80211_key __rcu *ptk[NUM_DEFAULT_KEYS]; u8 ptk_idx; struct rate_control_ref *rate_ctrl; void *rate_ctrl_priv; spinlock_t rate_ctrl_lock; spinlock_t lock; struct ieee80211_fast_tx __rcu *fast_tx; struct ieee80211_fast_rx __rcu *fast_rx; #ifdef CONFIG_MAC80211_MESH struct mesh_sta *mesh; #endif struct work_struct drv_deliver_wk; u16 listen_interval; bool dead; bool removed; bool uploaded; enum ieee80211_sta_state sta_state; /* use the accessors defined below */ unsigned long _flags; /* STA powersave lock and frame queues */ spinlock_t ps_lock; struct sk_buff_head ps_tx_buf[IEEE80211_NUM_ACS]; struct sk_buff_head tx_filtered[IEEE80211_NUM_ACS]; unsigned long driver_buffered_tids; unsigned long txq_buffered_tids; u64 assoc_at; long last_connected; /* Plus 1 for non-QoS frames */ __le16 last_seq_ctrl[IEEE80211_NUM_TIDS + 1]; u16 tid_seq[IEEE80211_QOS_CTL_TID_MASK + 1]; struct airtime_info airtime[IEEE80211_NUM_ACS]; u16 airtime_weight; /* * Aggregation information, locked with lock. */ struct sta_ampdu_mlme ampdu_mlme; #ifdef CONFIG_MAC80211_DEBUGFS struct dentry *debugfs_dir; #endif u8 reserved_tid; s8 amsdu_mesh_control; struct cfg80211_chan_def tdls_chandef; struct ieee80211_fragment_cache frags; struct ieee80211_sta_aggregates cur; struct link_sta_info deflink; struct link_sta_info __rcu *link[IEEE80211_MLD_MAX_NUM_LINKS]; struct ieee80211_sta_removed_link_stats rem_link_stats; /* keep last! */ struct ieee80211_sta sta; }; static inline int ieee80211_tdls_sta_link_id(struct sta_info *sta) { /* TDLS STA can only have a single link */ return sta->sta.valid_links ? 
__ffs(sta->sta.valid_links) : 0; } static inline enum nl80211_plink_state sta_plink_state(struct sta_info *sta) { #ifdef CONFIG_MAC80211_MESH return sta->mesh->plink_state; #endif return NL80211_PLINK_LISTEN; } static inline void set_sta_flag(struct sta_info *sta, enum ieee80211_sta_info_flags flag) { WARN_ON(flag == WLAN_STA_AUTH || flag == WLAN_STA_ASSOC || flag == WLAN_STA_AUTHORIZED); set_bit(flag, &sta->_flags); } static inline void clear_sta_flag(struct sta_info *sta, enum ieee80211_sta_info_flags flag) { WARN_ON(flag == WLAN_STA_AUTH || flag == WLAN_STA_ASSOC || flag == WLAN_STA_AUTHORIZED); clear_bit(flag, &sta->_flags); } static inline int test_sta_flag(struct sta_info *sta, enum ieee80211_sta_info_flags flag) { return test_bit(flag, &sta->_flags); } static inline int test_and_clear_sta_flag(struct sta_info *sta, enum ieee80211_sta_info_flags flag) { WARN_ON(flag == WLAN_STA_AUTH || flag == WLAN_STA_ASSOC || flag == WLAN_STA_AUTHORIZED); return test_and_clear_bit(flag, &sta->_flags); } static inline int test_and_set_sta_flag(struct sta_info *sta, enum ieee80211_sta_info_flags flag) { WARN_ON(flag == WLAN_STA_AUTH || flag == WLAN_STA_ASSOC || flag == WLAN_STA_AUTHORIZED); return test_and_set_bit(flag, &sta->_flags); } int sta_info_move_state(struct sta_info *sta, enum ieee80211_sta_state new_state); static inline void sta_info_pre_move_state(struct sta_info *sta, enum ieee80211_sta_state new_state) { int ret; WARN_ON_ONCE(test_sta_flag(sta, WLAN_STA_INSERTED)); ret = sta_info_move_state(sta, new_state); WARN_ON_ONCE(ret); } void ieee80211_assign_tid_tx(struct sta_info *sta, int tid, struct tid_ampdu_tx *tid_tx); #define rcu_dereference_protected_tid_tx(sta, tid) \ rcu_dereference_protected((sta)->ampdu_mlme.tid_tx[tid], \ lockdep_is_held(&(sta)->lock) || \ lockdep_is_held(&(sta)->local->hw.wiphy->mtx)); /* Maximum number of frames to buffer per power saving station per AC */ #define STA_MAX_TX_BUFFER 64 /* Minimum buffered frame expiry time. If STA uses listen interval that is * smaller than this value, the minimum value here is used instead. */ #define STA_TX_BUFFER_EXPIRE (10 * HZ) /* How often station data is cleaned up (e.g., expiration of buffered frames) */ #define STA_INFO_CLEANUP_INTERVAL (10 * HZ) struct rhlist_head *sta_info_hash_lookup(struct ieee80211_local *local, const u8 *addr); /* * Get a STA info, must be under RCU read lock. */ struct sta_info *sta_info_get(struct ieee80211_sub_if_data *sdata, const u8 *addr); struct sta_info *sta_info_get_bss(struct ieee80211_sub_if_data *sdata, const u8 *addr); /* user must hold wiphy mutex or be in RCU critical section */ struct sta_info *sta_info_get_by_addrs(struct ieee80211_local *local, const u8 *sta_addr, const u8 *vif_addr); #define for_each_sta_info(local, _addr, _sta, _tmp) \ rhl_for_each_entry_rcu(_sta, _tmp, \ sta_info_hash_lookup(local, _addr), hash_node) struct rhlist_head *link_sta_info_hash_lookup(struct ieee80211_local *local, const u8 *addr); #define for_each_link_sta_info(local, _addr, _sta, _tmp) \ rhl_for_each_entry_rcu(_sta, _tmp, \ link_sta_info_hash_lookup(local, _addr), \ link_hash_node) struct link_sta_info * link_sta_info_get_bss(struct ieee80211_sub_if_data *sdata, const u8 *addr); /* * Get STA info by index, BROKEN! */ struct sta_info *sta_info_get_by_idx(struct ieee80211_sub_if_data *sdata, int idx); /* * Create a new STA info, caller owns returned structure * until sta_info_insert(). 
*/ struct sta_info *sta_info_alloc(struct ieee80211_sub_if_data *sdata, const u8 *addr, gfp_t gfp); struct sta_info *sta_info_alloc_with_link(struct ieee80211_sub_if_data *sdata, const u8 *mld_addr, unsigned int link_id, const u8 *link_addr, gfp_t gfp); void sta_info_free(struct ieee80211_local *local, struct sta_info *sta); /* * Insert STA info into hash table/list, returns zero or a * -EEXIST if (if the same MAC address is already present). * * Calling the non-rcu version makes the caller relinquish, * the _rcu version calls read_lock_rcu() and must be called * without it held. */ int sta_info_insert(struct sta_info *sta); int sta_info_insert_rcu(struct sta_info *sta) __acquires(RCU); int __must_check __sta_info_destroy(struct sta_info *sta); int sta_info_destroy_addr(struct ieee80211_sub_if_data *sdata, const u8 *addr); int sta_info_destroy_addr_bss(struct ieee80211_sub_if_data *sdata, const u8 *addr); void sta_info_recalc_tim(struct sta_info *sta); int sta_info_init(struct ieee80211_local *local); void sta_info_stop(struct ieee80211_local *local); /** * __sta_info_flush - flush matching STA entries from the STA table * * Return: the number of removed STA entries. * * @sdata: sdata to remove all stations from * @vlans: if the given interface is an AP interface, also flush VLANs * @link_id: if given (>=0), all those STA entries using @link_id only * will be removed. If -1 is passed, all STA entries will be * removed. * @do_not_flush_sta: a station that shouldn't be flushed. */ int __sta_info_flush(struct ieee80211_sub_if_data *sdata, bool vlans, int link_id, struct sta_info *do_not_flush_sta); /** * sta_info_flush - flush matching STA entries from the STA table * * Return: the number of removed STA entries. * * @sdata: sdata to remove all stations from * @link_id: if given (>=0), all those STA entries using @link_id only * will be removed. If -1 is passed, all STA entries will be * removed. 
*/ static inline int sta_info_flush(struct ieee80211_sub_if_data *sdata, int link_id) { return __sta_info_flush(sdata, false, link_id, NULL); } void sta_set_rate_info_tx(struct sta_info *sta, const struct ieee80211_tx_rate *rate, struct rate_info *rinfo); void sta_set_sinfo(struct sta_info *sta, struct station_info *sinfo, bool tidstats); void sta_set_accumulated_removed_links_sinfo(struct sta_info *sta, struct station_info *sinfo); u32 sta_get_expected_throughput(struct sta_info *sta); void ieee80211_sta_expire(struct ieee80211_sub_if_data *sdata, unsigned long exp_time); int ieee80211_sta_allocate_link(struct sta_info *sta, unsigned int link_id); void ieee80211_sta_free_link(struct sta_info *sta, unsigned int link_id); int ieee80211_sta_activate_link(struct sta_info *sta, unsigned int link_id); void ieee80211_sta_remove_link(struct sta_info *sta, unsigned int link_id); void ieee80211_sta_ps_deliver_wakeup(struct sta_info *sta); void ieee80211_sta_ps_deliver_poll_response(struct sta_info *sta); void ieee80211_sta_ps_deliver_uapsd(struct sta_info *sta); unsigned long ieee80211_sta_last_active(struct sta_info *sta, int link_id); void ieee80211_sta_set_max_amsdu_subframes(struct sta_info *sta, const u8 *ext_capab, unsigned int ext_capab_len); void __ieee80211_sta_recalc_aggregates(struct sta_info *sta, u16 active_links); enum sta_stats_type { STA_STATS_RATE_TYPE_INVALID = 0, STA_STATS_RATE_TYPE_LEGACY, STA_STATS_RATE_TYPE_HT, STA_STATS_RATE_TYPE_VHT, STA_STATS_RATE_TYPE_HE, STA_STATS_RATE_TYPE_S1G, STA_STATS_RATE_TYPE_EHT, }; #define STA_STATS_FIELD_HT_MCS GENMASK( 7, 0) #define STA_STATS_FIELD_LEGACY_IDX GENMASK( 3, 0) #define STA_STATS_FIELD_LEGACY_BAND GENMASK( 7, 4) #define STA_STATS_FIELD_VHT_MCS GENMASK( 3, 0) #define STA_STATS_FIELD_VHT_NSS GENMASK( 7, 4) #define STA_STATS_FIELD_HE_MCS GENMASK( 3, 0) #define STA_STATS_FIELD_HE_NSS GENMASK( 7, 4) #define STA_STATS_FIELD_EHT_MCS GENMASK( 3, 0) #define STA_STATS_FIELD_EHT_NSS GENMASK( 7, 4) #define STA_STATS_FIELD_BW GENMASK(12, 8) #define STA_STATS_FIELD_SGI GENMASK(13, 13) #define STA_STATS_FIELD_TYPE GENMASK(16, 14) #define STA_STATS_FIELD_HE_RU GENMASK(19, 17) #define STA_STATS_FIELD_HE_GI GENMASK(21, 20) #define STA_STATS_FIELD_HE_DCM GENMASK(22, 22) #define STA_STATS_FIELD_EHT_RU GENMASK(20, 17) #define STA_STATS_FIELD_EHT_GI GENMASK(22, 21) #define STA_STATS_FIELD(_n, _v) FIELD_PREP(STA_STATS_FIELD_ ## _n, _v) #define STA_STATS_GET(_n, _v) FIELD_GET(STA_STATS_FIELD_ ## _n, _v) #define STA_STATS_RATE_INVALID 0 static inline u32 sta_stats_encode_rate(struct ieee80211_rx_status *s) { u32 r; r = STA_STATS_FIELD(BW, s->bw); if (s->enc_flags & RX_ENC_FLAG_SHORT_GI) r |= STA_STATS_FIELD(SGI, 1); switch (s->encoding) { case RX_ENC_VHT: r |= STA_STATS_FIELD(TYPE, STA_STATS_RATE_TYPE_VHT); r |= STA_STATS_FIELD(VHT_NSS, s->nss); r |= STA_STATS_FIELD(VHT_MCS, s->rate_idx); break; case RX_ENC_HT: r |= STA_STATS_FIELD(TYPE, STA_STATS_RATE_TYPE_HT); r |= STA_STATS_FIELD(HT_MCS, s->rate_idx); break; case RX_ENC_LEGACY: r |= STA_STATS_FIELD(TYPE, STA_STATS_RATE_TYPE_LEGACY); r |= STA_STATS_FIELD(LEGACY_BAND, s->band); r |= STA_STATS_FIELD(LEGACY_IDX, s->rate_idx); break; case RX_ENC_HE: r |= STA_STATS_FIELD(TYPE, STA_STATS_RATE_TYPE_HE); r |= STA_STATS_FIELD(HE_NSS, s->nss); r |= STA_STATS_FIELD(HE_MCS, s->rate_idx); r |= STA_STATS_FIELD(HE_GI, s->he_gi); r |= STA_STATS_FIELD(HE_RU, s->he_ru); r |= STA_STATS_FIELD(HE_DCM, s->he_dcm); break; case RX_ENC_EHT: r |= STA_STATS_FIELD(TYPE, STA_STATS_RATE_TYPE_EHT); r |= STA_STATS_FIELD(EHT_NSS, s->nss); 
r |= STA_STATS_FIELD(EHT_MCS, s->rate_idx); r |= STA_STATS_FIELD(EHT_GI, s->eht.gi); r |= STA_STATS_FIELD(EHT_RU, s->eht.ru); break; default: WARN_ON(1); return STA_STATS_RATE_INVALID; } return r; } #endif /* STA_INFO_H */
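To make the STA_STATS_FIELD()/STA_STATS_GET() encoding above easier to follow, here is a small self-contained userspace sketch (not kernel code) that packs and unpacks a VHT rate the same way. The field layout values are taken from the defines above; the MASK/PREP/GET helpers and the sample MCS/NSS/bandwidth values are illustrative stand-ins for GENMASK()/FIELD_PREP()/FIELD_GET().

/*
 * Illustrative userspace sketch: mask-based packing of a rate into one
 * 32-bit word, mirroring sta_stats_encode_rate() for the VHT case.
 */
#include <stdint.h>
#include <stdio.h>

/* GENMASK(h, l) equivalent for 32-bit words */
#define MASK(h, l)	(((~0u) >> (31 - (h))) & ~((1u << (l)) - 1u))
#define PREP(m, v)	(((v) << __builtin_ctz(m)) & (m))
#define GET(m, w)	(((w) & (m)) >> __builtin_ctz(m))

#define F_VHT_MCS	MASK(3, 0)	/* STA_STATS_FIELD_VHT_MCS */
#define F_VHT_NSS	MASK(7, 4)	/* STA_STATS_FIELD_VHT_NSS */
#define F_BW		MASK(12, 8)	/* STA_STATS_FIELD_BW */
#define F_TYPE		MASK(16, 14)	/* STA_STATS_FIELD_TYPE */
#define TYPE_VHT	3		/* STA_STATS_RATE_TYPE_VHT */

int main(void)
{
	/* encode: VHT MCS 9, 2 spatial streams, bandwidth index 2 (assumed) */
	uint32_t r = PREP(F_TYPE, TYPE_VHT) | PREP(F_VHT_NSS, 2) |
		     PREP(F_VHT_MCS, 9) | PREP(F_BW, 2);

	/* decode the same word again */
	printf("type=%u nss=%u mcs=%u bw=%u\n",
	       GET(F_TYPE, r), GET(F_VHT_NSS, r),
	       GET(F_VHT_MCS, r), GET(F_BW, r));
	return 0;
}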
// SPDX-License-Identifier: GPL-2.0-or-later
/* Signature verification with an asymmetric key
 *
 * See Documentation/crypto/asymmetric-keys.rst
 *
 * Copyright (C) 2012 Red Hat, Inc. All Rights Reserved.
 * Written by David Howells (dhowells@redhat.com)
 */

#define pr_fmt(fmt) "SIG: "fmt
#include <keys/asymmetric-subtype.h>
#include <linux/export.h>
#include <linux/err.h>
#include <linux/slab.h>
#include <linux/keyctl.h>
#include <crypto/public_key.h>
#include <keys/user-type.h>
#include "asymmetric_keys.h"

/*
 * Destroy a public key signature.
 */
void public_key_signature_free(struct public_key_signature *sig)
{
	int i;

	if (sig) {
		for (i = 0; i < ARRAY_SIZE(sig->auth_ids); i++)
			kfree(sig->auth_ids[i]);
		kfree(sig->s);
		kfree(sig->digest);
		kfree(sig);
	}
}
EXPORT_SYMBOL_GPL(public_key_signature_free);

/**
 * query_asymmetric_key - Get information about an asymmetric key.
 * @params: Various parameters.
 * @info: Where to put the information.
 */
int query_asymmetric_key(const struct kernel_pkey_params *params,
			 struct kernel_pkey_query *info)
{
	const struct asymmetric_key_subtype *subtype;
	struct key *key = params->key;
	int ret;

	pr_devel("==>%s()\n", __func__);

	if (key->type != &key_type_asymmetric)
		return -EINVAL;
	subtype = asymmetric_key_subtype(key);
	if (!subtype || !key->payload.data[0])
		return -EINVAL;
	if (!subtype->query)
		return -ENOTSUPP;

	ret = subtype->query(params, info);

	pr_devel("<==%s() = %d\n", __func__, ret);
	return ret;
}
EXPORT_SYMBOL_GPL(query_asymmetric_key);

/**
 * verify_signature - Initiate the use of an asymmetric key to verify a signature
 * @key: The asymmetric key to verify against
 * @sig: The signature to check
 *
 * Returns 0 if successful or else an error.
 */
int verify_signature(const struct key *key,
		     const struct public_key_signature *sig)
{
	const struct asymmetric_key_subtype *subtype;
	int ret;

	pr_devel("==>%s()\n", __func__);

	if (key->type != &key_type_asymmetric)
		return -EINVAL;
	subtype = asymmetric_key_subtype(key);
	if (!subtype || !key->payload.data[0])
		return -EINVAL;
	if (!subtype->verify_signature)
		return -ENOTSUPP;

	ret = subtype->verify_signature(key, sig);

	pr_devel("<==%s() = %d\n", __func__, ret);
	return ret;
}
EXPORT_SYMBOL_GPL(verify_signature);
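As a hedged illustration of how the verify_signature() entry point above is typically driven, the sketch below shows a hypothetical kernel caller handing it a detached RSA/SHA-256 signature over a precomputed digest. The helper name, buffers and algorithm string choices are assumptions for illustration; verify_signature() itself and the public_key_signature fields used are the API shown in this file and in crypto/public_key.h.

/*
 * Hypothetical caller sketch: verify a detached signature against an
 * already-looked-up asymmetric key. Key lookup and buffer ownership are
 * out of scope here.
 */
#include <linux/types.h>
#include <linux/key.h>
#include <crypto/public_key.h>

static int example_check_sig(struct key *key,
			     const u8 *digest, u32 digest_size,
			     const u8 *sig_data, u32 sig_size)
{
	struct public_key_signature sig = {
		.pkey_algo	= "rsa",	/* assumed key type */
		.hash_algo	= "sha256",	/* assumed digest algorithm */
		.encoding	= "pkcs1",	/* assumed signature encoding */
		.digest		= (u8 *)digest,
		.digest_size	= digest_size,
		.s		= (u8 *)sig_data,
		.s_size		= sig_size,
	};

	/* Dispatches to the key subtype's ->verify_signature() hook. */
	return verify_signature(key, &sig);
}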
// SPDX-License-Identifier: GPL-2.0
/* sysfs entries for device PM */
#include <linux/device.h>
#include <linux/kobject.h>
#include <linux/string.h>
#include <linux/export.h>
#include <linux/pm_qos.h>
#include <linux/pm_runtime.h>
#include <linux/atomic.h>
#include <linux/jiffies.h>
#include "power.h" /* * control - Report/change current runtime PM setting of the device * * Runtime power management of a device can be blocked with the help of * this attribute. All devices have one of the following two values for * the power/control file: * * + "auto\n" to allow the device to be power managed at run time; * + "on\n" to prevent the device from being power managed at run time; * * The default for all devices is "auto", which means that devices may be * subject to automatic power management, depending on their drivers. * Changing this attribute to "on" prevents the driver from power managing * the device at run time. Doing that while the device is suspended causes * it to be woken up. * * wakeup - Report/change current wakeup option for device * * Some devices support "wakeup" events, which are hardware signals * used to activate devices from suspended or low power states. Such * devices have one of three values for the sysfs power/wakeup file: * * + "enabled\n" to issue the events; * + "disabled\n" not to do so; or * + "\n" for temporary or permanent inability to issue wakeup. * * (For example, unconfigured USB devices can't issue wakeups.) * * Familiar examples of devices that can issue wakeup events include * keyboards and mice (both PS2 and USB styles), power buttons, modems, * "Wake-On-LAN" Ethernet links, GPIO lines, and more. Some events * will wake the entire system from a suspend state; others may just * wake up the device (if the system as a whole is already active). * Some wakeup events use normal IRQ lines; other use special out * of band signaling. * * It is the responsibility of device drivers to enable (or disable) * wakeup signaling as part of changing device power states, respecting * the policy choices provided through the driver model. * * Devices may not be able to generate wakeup events from all power * states. Also, the events may be ignored in some configurations; * for example, they might need help from other devices that aren't * active, or which may have wakeup disabled. Some drivers rely on * wakeup events internally (unless they are disabled), keeping * their hardware in low power modes whenever they're unused. This * saves runtime power, without requiring system-wide sleep states. * * async - Report/change current async suspend setting for the device * * Asynchronous suspend and resume of the device during system-wide power * state transitions can be enabled by writing "enabled" to this file. * Analogously, if "disabled" is written to this file, the device will be * suspended and resumed synchronously. * * All devices have one of the following two values for power/async: * * + "enabled\n" to permit the asynchronous suspend/resume of the device; * + "disabled\n" to forbid it; * * NOTE: It generally is unsafe to permit the asynchronous suspend/resume * of a device unless it is certain that all of the PM dependencies of the * device are known to the PM core. However, for some devices this * attribute is set to "enabled" by bus type code or device drivers and in * that cases it should be safe to leave the default value. * * autosuspend_delay_ms - Report/change a device's autosuspend_delay value * * Some drivers don't want to carry out a runtime suspend as soon as a * device becomes idle; they want it always to remain idle for some period * of time before suspending it. This period is the autosuspend_delay * value (expressed in milliseconds) and it can be controlled by the user. 
* If the value is negative then the device will never be runtime * suspended. * * NOTE: The autosuspend_delay_ms attribute and the autosuspend_delay * value are used only if the driver calls pm_runtime_use_autosuspend(). * * wakeup_count - Report the number of wakeup events related to the device */ const char power_group_name[] = "power"; EXPORT_SYMBOL_GPL(power_group_name); static const char ctrl_auto[] = "auto"; static const char ctrl_on[] = "on"; static ssize_t control_show(struct device *dev, struct device_attribute *attr, char *buf) { return sysfs_emit(buf, "%s\n", dev->power.runtime_auto ? ctrl_auto : ctrl_on); } static ssize_t control_store(struct device * dev, struct device_attribute *attr, const char * buf, size_t n) { device_lock(dev); if (sysfs_streq(buf, ctrl_auto)) pm_runtime_allow(dev); else if (sysfs_streq(buf, ctrl_on)) pm_runtime_forbid(dev); else n = -EINVAL; device_unlock(dev); return n; } static DEVICE_ATTR_RW(control); static ssize_t runtime_active_time_show(struct device *dev, struct device_attribute *attr, char *buf) { u64 tmp = pm_runtime_active_time(dev); do_div(tmp, NSEC_PER_MSEC); return sysfs_emit(buf, "%llu\n", tmp); } static DEVICE_ATTR_RO(runtime_active_time); static ssize_t runtime_suspended_time_show(struct device *dev, struct device_attribute *attr, char *buf) { u64 tmp = pm_runtime_suspended_time(dev); do_div(tmp, NSEC_PER_MSEC); return sysfs_emit(buf, "%llu\n", tmp); } static DEVICE_ATTR_RO(runtime_suspended_time); static ssize_t runtime_status_show(struct device *dev, struct device_attribute *attr, char *buf) { const char *output; if (dev->power.runtime_error) { output = "error"; } else if (dev->power.disable_depth) { output = "unsupported"; } else { switch (dev->power.runtime_status) { case RPM_SUSPENDED: output = "suspended"; break; case RPM_SUSPENDING: output = "suspending"; break; case RPM_RESUMING: output = "resuming"; break; case RPM_ACTIVE: output = "active"; break; default: return -EIO; } } return sysfs_emit(buf, "%s\n", output); } static DEVICE_ATTR_RO(runtime_status); static ssize_t autosuspend_delay_ms_show(struct device *dev, struct device_attribute *attr, char *buf) { if (!dev->power.use_autosuspend) return -EIO; return sysfs_emit(buf, "%d\n", dev->power.autosuspend_delay); } static ssize_t autosuspend_delay_ms_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t n) { long delay; if (!dev->power.use_autosuspend) return -EIO; if (kstrtol(buf, 10, &delay) != 0 || delay != (int) delay) return -EINVAL; device_lock(dev); pm_runtime_set_autosuspend_delay(dev, delay); device_unlock(dev); return n; } static DEVICE_ATTR_RW(autosuspend_delay_ms); static ssize_t pm_qos_resume_latency_us_show(struct device *dev, struct device_attribute *attr, char *buf) { s32 value = dev_pm_qos_requested_resume_latency(dev); if (value == 0) return sysfs_emit(buf, "n/a\n"); if (value == PM_QOS_RESUME_LATENCY_NO_CONSTRAINT) value = 0; return sysfs_emit(buf, "%d\n", value); } static ssize_t pm_qos_resume_latency_us_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t n) { s32 value; int ret; if (!kstrtos32(buf, 0, &value)) { /* * Prevent users from writing negative or "no constraint" values * directly. */ if (value < 0 || value == PM_QOS_RESUME_LATENCY_NO_CONSTRAINT) return -EINVAL; if (value == 0) value = PM_QOS_RESUME_LATENCY_NO_CONSTRAINT; } else if (sysfs_streq(buf, "n/a")) { value = 0; } else { return -EINVAL; } ret = dev_pm_qos_update_request(dev->power.qos->resume_latency_req, value); return ret < 0 ? 
ret : n; } static DEVICE_ATTR_RW(pm_qos_resume_latency_us); static ssize_t pm_qos_latency_tolerance_us_show(struct device *dev, struct device_attribute *attr, char *buf) { s32 value = dev_pm_qos_get_user_latency_tolerance(dev); if (value < 0) return sysfs_emit(buf, "%s\n", "auto"); if (value == PM_QOS_LATENCY_ANY) return sysfs_emit(buf, "%s\n", "any"); return sysfs_emit(buf, "%d\n", value); } static ssize_t pm_qos_latency_tolerance_us_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t n) { s32 value; int ret; if (kstrtos32(buf, 0, &value) == 0) { /* Users can't write negative values directly */ if (value < 0) return -EINVAL; } else { if (sysfs_streq(buf, "auto")) value = PM_QOS_LATENCY_TOLERANCE_NO_CONSTRAINT; else if (sysfs_streq(buf, "any")) value = PM_QOS_LATENCY_ANY; else return -EINVAL; } ret = dev_pm_qos_update_user_latency_tolerance(dev, value); return ret < 0 ? ret : n; } static DEVICE_ATTR_RW(pm_qos_latency_tolerance_us); static ssize_t pm_qos_no_power_off_show(struct device *dev, struct device_attribute *attr, char *buf) { return sysfs_emit(buf, "%d\n", !!(dev_pm_qos_requested_flags(dev) & PM_QOS_FLAG_NO_POWER_OFF)); } static ssize_t pm_qos_no_power_off_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t n) { int ret; if (kstrtoint(buf, 0, &ret)) return -EINVAL; if (ret != 0 && ret != 1) return -EINVAL; ret = dev_pm_qos_update_flags(dev, PM_QOS_FLAG_NO_POWER_OFF, ret); return ret < 0 ? ret : n; } static DEVICE_ATTR_RW(pm_qos_no_power_off); #ifdef CONFIG_PM_SLEEP static const char _enabled[] = "enabled"; static const char _disabled[] = "disabled"; static ssize_t wakeup_show(struct device *dev, struct device_attribute *attr, char *buf) { return sysfs_emit(buf, "%s\n", device_can_wakeup(dev) ? (device_may_wakeup(dev) ? 
_enabled : _disabled) : ""); } static ssize_t wakeup_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t n) { if (!device_can_wakeup(dev)) return -EINVAL; if (sysfs_streq(buf, _enabled)) device_set_wakeup_enable(dev, 1); else if (sysfs_streq(buf, _disabled)) device_set_wakeup_enable(dev, 0); else return -EINVAL; return n; } static DEVICE_ATTR_RW(wakeup); static ssize_t wakeup_count_show(struct device *dev, struct device_attribute *attr, char *buf) { unsigned long count; bool enabled = false; spin_lock_irq(&dev->power.lock); if (dev->power.wakeup) { count = dev->power.wakeup->wakeup_count; enabled = true; } spin_unlock_irq(&dev->power.lock); if (!enabled) return sysfs_emit(buf, "\n"); return sysfs_emit(buf, "%lu\n", count); } static DEVICE_ATTR_RO(wakeup_count); static ssize_t wakeup_active_count_show(struct device *dev, struct device_attribute *attr, char *buf) { unsigned long count; bool enabled = false; spin_lock_irq(&dev->power.lock); if (dev->power.wakeup) { count = dev->power.wakeup->active_count; enabled = true; } spin_unlock_irq(&dev->power.lock); if (!enabled) return sysfs_emit(buf, "\n"); return sysfs_emit(buf, "%lu\n", count); } static DEVICE_ATTR_RO(wakeup_active_count); static ssize_t wakeup_abort_count_show(struct device *dev, struct device_attribute *attr, char *buf) { unsigned long count; bool enabled = false; spin_lock_irq(&dev->power.lock); if (dev->power.wakeup) { count = dev->power.wakeup->wakeup_count; enabled = true; } spin_unlock_irq(&dev->power.lock); if (!enabled) return sysfs_emit(buf, "\n"); return sysfs_emit(buf, "%lu\n", count); } static DEVICE_ATTR_RO(wakeup_abort_count); static ssize_t wakeup_expire_count_show(struct device *dev, struct device_attribute *attr, char *buf) { unsigned long count; bool enabled = false; spin_lock_irq(&dev->power.lock); if (dev->power.wakeup) { count = dev->power.wakeup->expire_count; enabled = true; } spin_unlock_irq(&dev->power.lock); if (!enabled) return sysfs_emit(buf, "\n"); return sysfs_emit(buf, "%lu\n", count); } static DEVICE_ATTR_RO(wakeup_expire_count); static ssize_t wakeup_active_show(struct device *dev, struct device_attribute *attr, char *buf) { unsigned int active; bool enabled = false; spin_lock_irq(&dev->power.lock); if (dev->power.wakeup) { active = dev->power.wakeup->active; enabled = true; } spin_unlock_irq(&dev->power.lock); if (!enabled) return sysfs_emit(buf, "\n"); return sysfs_emit(buf, "%u\n", active); } static DEVICE_ATTR_RO(wakeup_active); static ssize_t wakeup_total_time_ms_show(struct device *dev, struct device_attribute *attr, char *buf) { s64 msec; bool enabled = false; spin_lock_irq(&dev->power.lock); if (dev->power.wakeup) { msec = ktime_to_ms(dev->power.wakeup->total_time); enabled = true; } spin_unlock_irq(&dev->power.lock); if (!enabled) return sysfs_emit(buf, "\n"); return sysfs_emit(buf, "%lld\n", msec); } static DEVICE_ATTR_RO(wakeup_total_time_ms); static ssize_t wakeup_max_time_ms_show(struct device *dev, struct device_attribute *attr, char *buf) { s64 msec; bool enabled = false; spin_lock_irq(&dev->power.lock); if (dev->power.wakeup) { msec = ktime_to_ms(dev->power.wakeup->max_time); enabled = true; } spin_unlock_irq(&dev->power.lock); if (!enabled) return sysfs_emit(buf, "\n"); return sysfs_emit(buf, "%lld\n", msec); } static DEVICE_ATTR_RO(wakeup_max_time_ms); static ssize_t wakeup_last_time_ms_show(struct device *dev, struct device_attribute *attr, char *buf) { s64 msec; bool enabled = false; spin_lock_irq(&dev->power.lock); if (dev->power.wakeup) { msec = 
ktime_to_ms(dev->power.wakeup->last_time); enabled = true; } spin_unlock_irq(&dev->power.lock); if (!enabled) return sysfs_emit(buf, "\n"); return sysfs_emit(buf, "%lld\n", msec); } static DEVICE_ATTR_RO(wakeup_last_time_ms); #ifdef CONFIG_PM_AUTOSLEEP static ssize_t wakeup_prevent_sleep_time_ms_show(struct device *dev, struct device_attribute *attr, char *buf) { s64 msec; bool enabled = false; spin_lock_irq(&dev->power.lock); if (dev->power.wakeup) { msec = ktime_to_ms(dev->power.wakeup->prevent_sleep_time); enabled = true; } spin_unlock_irq(&dev->power.lock); if (!enabled) return sysfs_emit(buf, "\n"); return sysfs_emit(buf, "%lld\n", msec); } static DEVICE_ATTR_RO(wakeup_prevent_sleep_time_ms); #endif /* CONFIG_PM_AUTOSLEEP */ static inline int dpm_sysfs_wakeup_change_owner(struct device *dev, kuid_t kuid, kgid_t kgid) { if (dev->power.wakeup && dev->power.wakeup->dev) return device_change_owner(dev->power.wakeup->dev, kuid, kgid); return 0; } #else /* CONFIG_PM_SLEEP */ static inline int dpm_sysfs_wakeup_change_owner(struct device *dev, kuid_t kuid, kgid_t kgid) { return 0; } #endif #ifdef CONFIG_PM_ADVANCED_DEBUG static ssize_t runtime_usage_show(struct device *dev, struct device_attribute *attr, char *buf) { return sysfs_emit(buf, "%d\n", atomic_read(&dev->power.usage_count)); } static DEVICE_ATTR_RO(runtime_usage); static ssize_t runtime_active_kids_show(struct device *dev, struct device_attribute *attr, char *buf) { return sysfs_emit(buf, "%d\n", dev->power.ignore_children ? 0 : atomic_read(&dev->power.child_count)); } static DEVICE_ATTR_RO(runtime_active_kids); static ssize_t runtime_enabled_show(struct device *dev, struct device_attribute *attr, char *buf) { const char *output; if (dev->power.disable_depth && !dev->power.runtime_auto) output = "disabled & forbidden"; else if (dev->power.disable_depth) output = "disabled"; else if (!dev->power.runtime_auto) output = "forbidden"; else output = "enabled"; return sysfs_emit(buf, "%s\n", output); } static DEVICE_ATTR_RO(runtime_enabled); #ifdef CONFIG_PM_SLEEP static ssize_t async_show(struct device *dev, struct device_attribute *attr, char *buf) { return sysfs_emit(buf, "%s\n", device_async_suspend_enabled(dev) ? 
_enabled : _disabled); } static ssize_t async_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t n) { if (sysfs_streq(buf, _enabled)) device_enable_async_suspend(dev); else if (sysfs_streq(buf, _disabled)) device_disable_async_suspend(dev); else return -EINVAL; return n; } static DEVICE_ATTR_RW(async); #endif /* CONFIG_PM_SLEEP */ #endif /* CONFIG_PM_ADVANCED_DEBUG */ static struct attribute *power_attrs[] = { #if defined(CONFIG_PM_ADVANCED_DEBUG) && defined(CONFIG_PM_SLEEP) &dev_attr_async.attr, #endif NULL, }; static const struct attribute_group pm_attr_group = { .name = power_group_name, .attrs = power_attrs, }; static struct attribute *wakeup_attrs[] = { #ifdef CONFIG_PM_SLEEP &dev_attr_wakeup.attr, &dev_attr_wakeup_count.attr, &dev_attr_wakeup_active_count.attr, &dev_attr_wakeup_abort_count.attr, &dev_attr_wakeup_expire_count.attr, &dev_attr_wakeup_active.attr, &dev_attr_wakeup_total_time_ms.attr, &dev_attr_wakeup_max_time_ms.attr, &dev_attr_wakeup_last_time_ms.attr, #ifdef CONFIG_PM_AUTOSLEEP &dev_attr_wakeup_prevent_sleep_time_ms.attr, #endif #endif NULL, }; static const struct attribute_group pm_wakeup_attr_group = { .name = power_group_name, .attrs = wakeup_attrs, }; static struct attribute *runtime_attrs[] = { &dev_attr_runtime_status.attr, &dev_attr_control.attr, &dev_attr_runtime_suspended_time.attr, &dev_attr_runtime_active_time.attr, &dev_attr_autosuspend_delay_ms.attr, #ifdef CONFIG_PM_ADVANCED_DEBUG &dev_attr_runtime_usage.attr, &dev_attr_runtime_active_kids.attr, &dev_attr_runtime_enabled.attr, #endif NULL, }; static const struct attribute_group pm_runtime_attr_group = { .name = power_group_name, .attrs = runtime_attrs, }; static struct attribute *pm_qos_resume_latency_attrs[] = { &dev_attr_pm_qos_resume_latency_us.attr, NULL, }; static const struct attribute_group pm_qos_resume_latency_attr_group = { .name = power_group_name, .attrs = pm_qos_resume_latency_attrs, }; static struct attribute *pm_qos_latency_tolerance_attrs[] = { &dev_attr_pm_qos_latency_tolerance_us.attr, NULL, }; static const struct attribute_group pm_qos_latency_tolerance_attr_group = { .name = power_group_name, .attrs = pm_qos_latency_tolerance_attrs, }; static struct attribute *pm_qos_flags_attrs[] = { &dev_attr_pm_qos_no_power_off.attr, NULL, }; static const struct attribute_group pm_qos_flags_attr_group = { .name = power_group_name, .attrs = pm_qos_flags_attrs, }; int dpm_sysfs_add(struct device *dev) { int rc; /* No need to create PM sysfs if explicitly disabled. 
*/ if (device_pm_not_required(dev)) return 0; rc = sysfs_create_group(&dev->kobj, &pm_attr_group); if (rc) return rc; if (!pm_runtime_has_no_callbacks(dev)) { rc = sysfs_merge_group(&dev->kobj, &pm_runtime_attr_group); if (rc) goto err_out; } if (device_can_wakeup(dev)) { rc = sysfs_merge_group(&dev->kobj, &pm_wakeup_attr_group); if (rc) goto err_runtime; } if (dev->power.set_latency_tolerance) { rc = sysfs_merge_group(&dev->kobj, &pm_qos_latency_tolerance_attr_group); if (rc) goto err_wakeup; } rc = pm_wakeup_source_sysfs_add(dev); if (rc) goto err_latency; return 0; err_latency: sysfs_unmerge_group(&dev->kobj, &pm_qos_latency_tolerance_attr_group); err_wakeup: sysfs_unmerge_group(&dev->kobj, &pm_wakeup_attr_group); err_runtime: sysfs_unmerge_group(&dev->kobj, &pm_runtime_attr_group); err_out: sysfs_remove_group(&dev->kobj, &pm_attr_group); return rc; } int dpm_sysfs_change_owner(struct device *dev, kuid_t kuid, kgid_t kgid) { int rc; if (device_pm_not_required(dev)) return 0; rc = sysfs_group_change_owner(&dev->kobj, &pm_attr_group, kuid, kgid); if (rc) return rc; if (!pm_runtime_has_no_callbacks(dev)) { rc = sysfs_group_change_owner( &dev->kobj, &pm_runtime_attr_group, kuid, kgid); if (rc) return rc; } if (device_can_wakeup(dev)) { rc = sysfs_group_change_owner(&dev->kobj, &pm_wakeup_attr_group, kuid, kgid); if (rc) return rc; rc = dpm_sysfs_wakeup_change_owner(dev, kuid, kgid); if (rc) return rc; } if (dev->power.set_latency_tolerance) { rc = sysfs_group_change_owner( &dev->kobj, &pm_qos_latency_tolerance_attr_group, kuid, kgid); if (rc) return rc; } return 0; } int wakeup_sysfs_add(struct device *dev) { int ret = sysfs_merge_group(&dev->kobj, &pm_wakeup_attr_group); if (!ret) kobject_uevent(&dev->kobj, KOBJ_CHANGE); return ret; } void wakeup_sysfs_remove(struct device *dev) { sysfs_unmerge_group(&dev->kobj, &pm_wakeup_attr_group); kobject_uevent(&dev->kobj, KOBJ_CHANGE); } int pm_qos_sysfs_add_resume_latency(struct device *dev) { return sysfs_merge_group(&dev->kobj, &pm_qos_resume_latency_attr_group); } void pm_qos_sysfs_remove_resume_latency(struct device *dev) { sysfs_unmerge_group(&dev->kobj, &pm_qos_resume_latency_attr_group); } int pm_qos_sysfs_add_flags(struct device *dev) { return sysfs_merge_group(&dev->kobj, &pm_qos_flags_attr_group); } void pm_qos_sysfs_remove_flags(struct device *dev) { sysfs_unmerge_group(&dev->kobj, &pm_qos_flags_attr_group); } int pm_qos_sysfs_add_latency_tolerance(struct device *dev) { return sysfs_merge_group(&dev->kobj, &pm_qos_latency_tolerance_attr_group); } void pm_qos_sysfs_remove_latency_tolerance(struct device *dev) { sysfs_unmerge_group(&dev->kobj, &pm_qos_latency_tolerance_attr_group); } void rpm_sysfs_remove(struct device *dev) { sysfs_unmerge_group(&dev->kobj, &pm_runtime_attr_group); } void dpm_sysfs_remove(struct device *dev) { if (device_pm_not_required(dev)) return; sysfs_unmerge_group(&dev->kobj, &pm_qos_latency_tolerance_attr_group); dev_pm_qos_constraints_destroy(dev); rpm_sysfs_remove(dev); sysfs_unmerge_group(&dev->kobj, &pm_wakeup_attr_group); sysfs_remove_group(&dev->kobj, &pm_attr_group); }
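The comment block at the top of this file notes that autosuspend_delay_ms only takes effect once the driver has opted in via pm_runtime_use_autosuspend(). The sketch below shows a hypothetical probe path wiring that up; the function name and the 2000 ms initial delay are assumptions, while the pm_runtime_*() calls are the standard runtime-PM API.

/*
 * Hypothetical driver-side sketch: enable autosuspend so that the
 * power/autosuspend_delay_ms attribute becomes meaningful for this device.
 */
#include <linux/device.h>
#include <linux/pm_runtime.h>

static int example_probe(struct device *dev)
{
	/* Opt in to autosuspend; userspace may then tune the delay via
	 * the device's power/autosuspend_delay_ms attribute. */
	pm_runtime_set_autosuspend_delay(dev, 2000);	/* assumed default */
	pm_runtime_use_autosuspend(dev);
	pm_runtime_enable(dev);

	/* After finishing work, drop the usage count but defer the actual
	 * suspend until the autosuspend delay has expired. */
	pm_runtime_mark_last_busy(dev);
	pm_runtime_put_autosuspend(dev);
	return 0;
}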
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Linux I2C core
 *
 * Copyright (C) 1995-99 Simon G. Vogl
 * With some changes from Kyösti Mälkki <kmalkki@cc.hut.fi>
 * Mux support by Rodolfo Giometti <giometti@enneenne.com> and
 * Michael Lawnick <michael.lawnick.ext@nsn.com>
 *
 * Copyright (C) 2013-2017 Wolfram Sang <wsa@kernel.org>
 */

#define pr_fmt(fmt) "i2c-core: " fmt

#include <dt-bindings/i2c/i2c.h>
#include <linux/acpi.h>
#include <linux/clk/clk-conf.h>
#include <linux/completion.h>
#include <linux/debugfs.h>
#include <linux/delay.h>
#include <linux/err.h>
#include <linux/errno.h>
#include <linux/gpio/consumer.h>
#include <linux/i2c.h>
#include <linux/i2c-smbus.h>
#include <linux/idr.h>
#include <linux/init.h>
#include <linux/interrupt.h>
#include <linux/irq.h>
#include <linux/jump_label.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/of_device.h>
#include <linux/of.h>
#include <linux/pinctrl/consumer.h>
#include <linux/pinctrl/devinfo.h>
#include <linux/pm_domain.h>
#include <linux/pm_runtime.h>
#include <linux/pm_wakeirq.h>
#include <linux/property.h>
#include <linux/rwsem.h>
#include <linux/slab.h>
#include <linux/string_choices.h>

#include "i2c-core.h"

#define CREATE_TRACE_POINTS
#include <trace/events/i2c.h>

#define I2C_ADDR_OFFSET_TEN_BIT	0xa000
#define I2C_ADDR_OFFSET_SLAVE	0x1000
#define I2C_ADDR_7BITS_MAX	0x77
#define I2C_ADDR_7BITS_COUNT	(I2C_ADDR_7BITS_MAX + 1)
#define I2C_ADDR_DEVICE_ID	0x7c

/*
 * core_lock protects i2c_adapter_idr, and guarantees that device detection
 * and deletion of detected devices are serialized
 */
static DEFINE_MUTEX(core_lock);
static DEFINE_IDR(i2c_adapter_idr);

static int i2c_detect(struct i2c_adapter *adapter, struct i2c_driver *driver);

static DEFINE_STATIC_KEY_FALSE(i2c_trace_msg_key);
static bool is_registered;

static struct dentry *i2c_debugfs_root;

int
i2c_transfer_trace_reg(void) { static_branch_inc(&i2c_trace_msg_key); return 0; } void i2c_transfer_trace_unreg(void) { static_branch_dec(&i2c_trace_msg_key); } const char *i2c_freq_mode_string(u32 bus_freq_hz) { switch (bus_freq_hz) { case I2C_MAX_STANDARD_MODE_FREQ: return "Standard Mode (100 kHz)"; case I2C_MAX_FAST_MODE_FREQ: return "Fast Mode (400 kHz)"; case I2C_MAX_FAST_MODE_PLUS_FREQ: return "Fast Mode Plus (1.0 MHz)"; case I2C_MAX_TURBO_MODE_FREQ: return "Turbo Mode (1.4 MHz)"; case I2C_MAX_HIGH_SPEED_MODE_FREQ: return "High Speed Mode (3.4 MHz)"; case I2C_MAX_ULTRA_FAST_MODE_FREQ: return "Ultra Fast Mode (5.0 MHz)"; default: return "Unknown Mode"; } } EXPORT_SYMBOL_GPL(i2c_freq_mode_string); const struct i2c_device_id *i2c_match_id(const struct i2c_device_id *id, const struct i2c_client *client) { if (!(id && client)) return NULL; while (id->name[0]) { if (strcmp(client->name, id->name) == 0) return id; id++; } return NULL; } EXPORT_SYMBOL_GPL(i2c_match_id); const void *i2c_get_match_data(const struct i2c_client *client) { struct i2c_driver *driver = to_i2c_driver(client->dev.driver); const struct i2c_device_id *match; const void *data; data = device_get_match_data(&client->dev); if (!data) { match = i2c_match_id(driver->id_table, client); if (!match) return NULL; data = (const void *)match->driver_data; } return data; } EXPORT_SYMBOL(i2c_get_match_data); static int i2c_device_match(struct device *dev, const struct device_driver *drv) { struct i2c_client *client = i2c_verify_client(dev); const struct i2c_driver *driver; /* Attempt an OF style match */ if (i2c_of_match_device(drv->of_match_table, client)) return 1; /* Then ACPI style match */ if (acpi_driver_match_device(dev, drv)) return 1; driver = to_i2c_driver(drv); /* Finally an I2C match */ if (i2c_match_id(driver->id_table, client)) return 1; return 0; } static int i2c_device_uevent(const struct device *dev, struct kobj_uevent_env *env) { const struct i2c_client *client = to_i2c_client(dev); int rc; rc = of_device_uevent_modalias(dev, env); if (rc != -ENODEV) return rc; rc = acpi_device_uevent_modalias(dev, env); if (rc != -ENODEV) return rc; return add_uevent_var(env, "MODALIAS=%s%s", I2C_MODULE_PREFIX, client->name); } /* i2c bus recovery routines */ static int get_scl_gpio_value(struct i2c_adapter *adap) { return gpiod_get_value_cansleep(adap->bus_recovery_info->scl_gpiod); } static void set_scl_gpio_value(struct i2c_adapter *adap, int val) { gpiod_set_value_cansleep(adap->bus_recovery_info->scl_gpiod, val); } static int get_sda_gpio_value(struct i2c_adapter *adap) { return gpiod_get_value_cansleep(adap->bus_recovery_info->sda_gpiod); } static void set_sda_gpio_value(struct i2c_adapter *adap, int val) { gpiod_set_value_cansleep(adap->bus_recovery_info->sda_gpiod, val); } static int i2c_generic_bus_free(struct i2c_adapter *adap) { struct i2c_bus_recovery_info *bri = adap->bus_recovery_info; int ret = -EOPNOTSUPP; if (bri->get_bus_free) ret = bri->get_bus_free(adap); else if (bri->get_sda) ret = bri->get_sda(adap); if (ret < 0) return ret; return ret ? 0 : -EBUSY; } /* * We are generating clock pulses. ndelay() determines the duration of the clock pulses. 
* We will generate clock with rate 100 KHz and so duration of both clock levels * is: delay in ns = (10^6 / 100) / 2 */ #define RECOVERY_NDELAY 5000 #define RECOVERY_CLK_CNT 9 int i2c_generic_scl_recovery(struct i2c_adapter *adap) { struct i2c_bus_recovery_info *bri = adap->bus_recovery_info; int i = 0, scl = 1, ret = 0; if (bri->prepare_recovery) bri->prepare_recovery(adap); if (bri->pinctrl) pinctrl_select_state(bri->pinctrl, bri->pins_gpio); /* * If we can set SDA, we will always create a STOP to ensure additional * pulses will do no harm. This is achieved by letting SDA follow SCL * half a cycle later. Check the 'incomplete_write_byte' fault injector * for details. Note that we must honour tsu:sto, 4us, but lets use 5us * here for simplicity. */ bri->set_scl(adap, scl); ndelay(RECOVERY_NDELAY); if (bri->set_sda) bri->set_sda(adap, scl); ndelay(RECOVERY_NDELAY / 2); /* * By this time SCL is high, as we need to give 9 falling-rising edges */ while (i++ < RECOVERY_CLK_CNT * 2) { if (scl) { /* SCL shouldn't be low here */ if (!bri->get_scl(adap)) { dev_err(&adap->dev, "SCL is stuck low, exit recovery\n"); ret = -EBUSY; break; } } scl = !scl; bri->set_scl(adap, scl); /* Creating STOP again, see above */ if (scl) { /* Honour minimum tsu:sto */ ndelay(RECOVERY_NDELAY); } else { /* Honour minimum tf and thd:dat */ ndelay(RECOVERY_NDELAY / 2); } if (bri->set_sda) bri->set_sda(adap, scl); ndelay(RECOVERY_NDELAY / 2); if (scl) { ret = i2c_generic_bus_free(adap); if (ret == 0) break; } } /* If we can't check bus status, assume recovery worked */ if (ret == -EOPNOTSUPP) ret = 0; if (bri->unprepare_recovery) bri->unprepare_recovery(adap); if (bri->pinctrl) pinctrl_select_state(bri->pinctrl, bri->pins_default); return ret; } EXPORT_SYMBOL_GPL(i2c_generic_scl_recovery); int i2c_recover_bus(struct i2c_adapter *adap) { if (!adap->bus_recovery_info) return -EBUSY; dev_dbg(&adap->dev, "Trying i2c bus recovery\n"); return adap->bus_recovery_info->recover_bus(adap); } EXPORT_SYMBOL_GPL(i2c_recover_bus); static void i2c_gpio_init_pinctrl_recovery(struct i2c_adapter *adap) { struct i2c_bus_recovery_info *bri = adap->bus_recovery_info; struct device *dev = &adap->dev; struct pinctrl *p = bri->pinctrl ?: dev_pinctrl(dev->parent); bri->pinctrl = p; /* * we can't change states without pinctrl, so remove the states if * populated */ if (!p) { bri->pins_default = NULL; bri->pins_gpio = NULL; return; } if (!bri->pins_default) { bri->pins_default = pinctrl_lookup_state(p, PINCTRL_STATE_DEFAULT); if (IS_ERR(bri->pins_default)) { dev_dbg(dev, PINCTRL_STATE_DEFAULT " state not found for GPIO recovery\n"); bri->pins_default = NULL; } } if (!bri->pins_gpio) { bri->pins_gpio = pinctrl_lookup_state(p, "gpio"); if (IS_ERR(bri->pins_gpio)) bri->pins_gpio = pinctrl_lookup_state(p, "recovery"); if (IS_ERR(bri->pins_gpio)) { dev_dbg(dev, "no gpio or recovery state found for GPIO recovery\n"); bri->pins_gpio = NULL; } } /* for pinctrl state changes, we need all the information */ if (bri->pins_default && bri->pins_gpio) { dev_info(dev, "using pinctrl states for GPIO recovery"); } else { bri->pinctrl = NULL; bri->pins_default = NULL; bri->pins_gpio = NULL; } } static int i2c_gpio_init_generic_recovery(struct i2c_adapter *adap) { struct i2c_bus_recovery_info *bri = adap->bus_recovery_info; struct device *dev = &adap->dev; struct gpio_desc *gpiod; int ret = 0; /* * don't touch the recovery information if the driver is not using * generic SCL recovery */ if (bri->recover_bus && bri->recover_bus != i2c_generic_scl_recovery) return 0; 
/* * pins might be taken as GPIO, so we should inform pinctrl about * this and move the state to GPIO */ if (bri->pinctrl) pinctrl_select_state(bri->pinctrl, bri->pins_gpio); /* * if there is incomplete or no recovery information, see if generic * GPIO recovery is available */ if (!bri->scl_gpiod) { gpiod = devm_gpiod_get(dev, "scl", GPIOD_OUT_HIGH_OPEN_DRAIN); if (PTR_ERR(gpiod) == -EPROBE_DEFER) { ret = -EPROBE_DEFER; goto cleanup_pinctrl_state; } if (!IS_ERR(gpiod)) { bri->scl_gpiod = gpiod; bri->recover_bus = i2c_generic_scl_recovery; dev_info(dev, "using generic GPIOs for recovery\n"); } } /* SDA GPIOD line is optional, so we care about DEFER only */ if (!bri->sda_gpiod) { /* * We have SCL. Pull SCL low and wait a bit so that SDA glitches * have no effect. */ gpiod_direction_output(bri->scl_gpiod, 0); udelay(10); gpiod = devm_gpiod_get(dev, "sda", GPIOD_IN); /* Wait a bit in case of a SDA glitch, and then release SCL. */ udelay(10); gpiod_direction_output(bri->scl_gpiod, 1); if (PTR_ERR(gpiod) == -EPROBE_DEFER) { ret = -EPROBE_DEFER; goto cleanup_pinctrl_state; } if (!IS_ERR(gpiod)) bri->sda_gpiod = gpiod; } cleanup_pinctrl_state: /* change the state of the pins back to their default state */ if (bri->pinctrl) pinctrl_select_state(bri->pinctrl, bri->pins_default); return ret; } static int i2c_gpio_init_recovery(struct i2c_adapter *adap) { i2c_gpio_init_pinctrl_recovery(adap); return i2c_gpio_init_generic_recovery(adap); } static int i2c_init_recovery(struct i2c_adapter *adap) { struct i2c_bus_recovery_info *bri = adap->bus_recovery_info; bool is_error_level = true; char *err_str; if (!bri) return 0; if (i2c_gpio_init_recovery(adap) == -EPROBE_DEFER) return -EPROBE_DEFER; if (!bri->recover_bus) { err_str = "no suitable method provided"; is_error_level = false; goto err; } if (bri->scl_gpiod && bri->recover_bus == i2c_generic_scl_recovery) { bri->get_scl = get_scl_gpio_value; bri->set_scl = set_scl_gpio_value; if (bri->sda_gpiod) { bri->get_sda = get_sda_gpio_value; /* FIXME: add proper flag instead of '0' once available */ if (gpiod_get_direction(bri->sda_gpiod) == 0) bri->set_sda = set_sda_gpio_value; } } else if (bri->recover_bus == i2c_generic_scl_recovery) { /* Generic SCL recovery */ if (!bri->set_scl || !bri->get_scl) { err_str = "no {get|set}_scl() found"; goto err; } if (!bri->set_sda && !bri->get_sda) { err_str = "either get_sda() or set_sda() needed"; goto err; } } return 0; err: if (is_error_level) dev_err(&adap->dev, "Not using recovery: %s\n", err_str); else dev_dbg(&adap->dev, "Not using recovery: %s\n", err_str); adap->bus_recovery_info = NULL; return -EINVAL; } static int i2c_smbus_host_notify_to_irq(const struct i2c_client *client) { struct i2c_adapter *adap = client->adapter; unsigned int irq; if (!adap->host_notify_domain) return -ENXIO; if (client->flags & I2C_CLIENT_TEN) return -EINVAL; irq = irq_create_mapping(adap->host_notify_domain, client->addr); return irq > 0 ? 
irq : -ENXIO; } static int i2c_device_probe(struct device *dev) { struct fwnode_handle *fwnode = dev_fwnode(dev); struct i2c_client *client = i2c_verify_client(dev); struct i2c_driver *driver; bool do_power_on; int status; if (!client) return 0; client->irq = client->init_irq; if (!client->irq) { int irq = -ENOENT; if (client->flags & I2C_CLIENT_HOST_NOTIFY) { dev_dbg(dev, "Using Host Notify IRQ\n"); /* Keep adapter active when Host Notify is required */ pm_runtime_get_sync(&client->adapter->dev); irq = i2c_smbus_host_notify_to_irq(client); } else if (is_of_node(fwnode)) { irq = fwnode_irq_get_byname(fwnode, "irq"); if (irq == -EINVAL || irq == -ENODATA) irq = fwnode_irq_get(fwnode, 0); } else if (is_acpi_device_node(fwnode)) { bool wake_capable; irq = i2c_acpi_get_irq(client, &wake_capable); if (irq > 0 && wake_capable) client->flags |= I2C_CLIENT_WAKE; } if (irq == -EPROBE_DEFER) { status = dev_err_probe(dev, irq, "can't get irq\n"); goto put_sync_adapter; } if (irq < 0) irq = 0; client->irq = irq; } driver = to_i2c_driver(dev->driver); /* * An I2C ID table is not mandatory, if and only if, a suitable OF * or ACPI ID table is supplied for the probing device. */ if (!driver->id_table && !acpi_driver_match_device(dev, dev->driver) && !i2c_of_match_device(dev->driver->of_match_table, client)) { status = -ENODEV; goto put_sync_adapter; } if (client->flags & I2C_CLIENT_WAKE) { int wakeirq; wakeirq = fwnode_irq_get_byname(fwnode, "wakeup"); if (wakeirq == -EPROBE_DEFER) { status = dev_err_probe(dev, wakeirq, "can't get wakeirq\n"); goto put_sync_adapter; } device_init_wakeup(&client->dev, true); if (wakeirq > 0 && wakeirq != client->irq) status = dev_pm_set_dedicated_wake_irq(dev, wakeirq); else if (client->irq > 0) status = dev_pm_set_wake_irq(dev, client->irq); else status = 0; if (status) dev_warn(&client->dev, "failed to set up wakeup irq\n"); } dev_dbg(dev, "probe\n"); status = of_clk_set_defaults(to_of_node(fwnode), false); if (status < 0) goto err_clear_wakeup_irq; do_power_on = !i2c_acpi_waive_d0_probe(dev); status = dev_pm_domain_attach(&client->dev, PD_FLAG_DETACH_POWER_OFF | (do_power_on ? PD_FLAG_ATTACH_POWER_ON : 0)); if (status) goto err_clear_wakeup_irq; client->devres_group_id = devres_open_group(&client->dev, NULL, GFP_KERNEL); if (!client->devres_group_id) { status = -ENOMEM; goto err_clear_wakeup_irq; } client->debugfs = debugfs_create_dir(dev_name(&client->dev), client->adapter->debugfs); if (driver->probe) status = driver->probe(client); else status = -EINVAL; /* * Note that we are not closing the devres group opened above so * even resources that were attached to the device after probe is * run are released when i2c_device_remove() is executed. This is * needed as some drivers would allocate additional resources, * for example when updating firmware. 
*/ if (status) goto err_release_driver_resources; return 0; err_release_driver_resources: debugfs_remove_recursive(client->debugfs); devres_release_group(&client->dev, client->devres_group_id); err_clear_wakeup_irq: dev_pm_clear_wake_irq(&client->dev); device_init_wakeup(&client->dev, false); put_sync_adapter: if (client->flags & I2C_CLIENT_HOST_NOTIFY) pm_runtime_put_sync(&client->adapter->dev); return status; } static void i2c_device_remove(struct device *dev) { struct i2c_client *client = to_i2c_client(dev); struct i2c_driver *driver; driver = to_i2c_driver(dev->driver); if (driver->remove) { dev_dbg(dev, "remove\n"); driver->remove(client); } debugfs_remove_recursive(client->debugfs); devres_release_group(&client->dev, client->devres_group_id); dev_pm_clear_wake_irq(&client->dev); device_init_wakeup(&client->dev, false); client->irq = 0; if (client->flags & I2C_CLIENT_HOST_NOTIFY) pm_runtime_put(&client->adapter->dev); } static void i2c_device_shutdown(struct device *dev) { struct i2c_client *client = i2c_verify_client(dev); struct i2c_driver *driver; if (!client || !dev->driver) return; driver = to_i2c_driver(dev->driver); if (driver->shutdown) driver->shutdown(client); else if (client->irq > 0) disable_irq(client->irq); } static void i2c_client_dev_release(struct device *dev) { kfree(to_i2c_client(dev)); } static ssize_t name_show(struct device *dev, struct device_attribute *attr, char *buf) { return sprintf(buf, "%s\n", dev->type == &i2c_client_type ? to_i2c_client(dev)->name : to_i2c_adapter(dev)->name); } static DEVICE_ATTR_RO(name); static ssize_t modalias_show(struct device *dev, struct device_attribute *attr, char *buf) { struct i2c_client *client = to_i2c_client(dev); int len; len = of_device_modalias(dev, buf, PAGE_SIZE); if (len != -ENODEV) return len; len = acpi_device_modalias(dev, buf, PAGE_SIZE - 1); if (len != -ENODEV) return len; return sprintf(buf, "%s%s\n", I2C_MODULE_PREFIX, client->name); } static DEVICE_ATTR_RO(modalias); static struct attribute *i2c_dev_attrs[] = { &dev_attr_name.attr, /* modalias helps coldplug: modprobe $(cat .../modalias) */ &dev_attr_modalias.attr, NULL }; ATTRIBUTE_GROUPS(i2c_dev); const struct bus_type i2c_bus_type = { .name = "i2c", .match = i2c_device_match, .probe = i2c_device_probe, .remove = i2c_device_remove, .shutdown = i2c_device_shutdown, }; EXPORT_SYMBOL_GPL(i2c_bus_type); const struct device_type i2c_client_type = { .groups = i2c_dev_groups, .uevent = i2c_device_uevent, .release = i2c_client_dev_release, }; EXPORT_SYMBOL_GPL(i2c_client_type); /** * i2c_verify_client - return parameter as i2c_client, or NULL * @dev: device, probably from some driver model iterator * * When traversing the driver model tree, perhaps using driver model * iterators like @device_for_each_child(), you can't assume very much * about the nodes you find. Use this function to avoid oopses caused * by wrongly treating some non-I2C device as an i2c_client. */ struct i2c_client *i2c_verify_client(struct device *dev) { return (dev->type == &i2c_client_type) ? 
to_i2c_client(dev) : NULL; } EXPORT_SYMBOL(i2c_verify_client); /* Return a unique address which takes the flags of the client into account */ static unsigned short i2c_encode_flags_to_addr(struct i2c_client *client) { unsigned short addr = client->addr; /* For some client flags, add an arbitrary offset to avoid collisions */ if (client->flags & I2C_CLIENT_TEN) addr |= I2C_ADDR_OFFSET_TEN_BIT; if (client->flags & I2C_CLIENT_SLAVE) addr |= I2C_ADDR_OFFSET_SLAVE; return addr; } /* This is a permissive address validity check, I2C address map constraints * are purposely not enforced, except for the general call address. */ static int i2c_check_addr_validity(unsigned int addr, unsigned short flags) { if (flags & I2C_CLIENT_TEN) { /* 10-bit address, all values are valid */ if (addr > 0x3ff) return -EINVAL; } else { /* 7-bit address, reject the general call address */ if (addr == 0x00 || addr > 0x7f) return -EINVAL; } return 0; } /* And this is a strict address validity check, used when probing. If a * device uses a reserved address, then it shouldn't be probed. 7-bit * addressing is assumed, 10-bit address devices are rare and should be * explicitly enumerated. */ int i2c_check_7bit_addr_validity_strict(unsigned short addr) { /* * Reserved addresses per I2C specification: * 0x00 General call address / START byte * 0x01 CBUS address * 0x02 Reserved for different bus format * 0x03 Reserved for future purposes * 0x04-0x07 Hs-mode master code * 0x78-0x7b 10-bit slave addressing * 0x7c-0x7f Reserved for future purposes */ if (addr < 0x08 || addr > 0x77) return -EINVAL; return 0; } static int __i2c_check_addr_busy(struct device *dev, void *addrp) { struct i2c_client *client = i2c_verify_client(dev); int addr = *(int *)addrp; if (client && i2c_encode_flags_to_addr(client) == addr) return -EBUSY; return 0; } /* walk up mux tree */ static int i2c_check_mux_parents(struct i2c_adapter *adapter, int addr) { struct i2c_adapter *parent = i2c_parent_is_i2c_adapter(adapter); int result; result = device_for_each_child(&adapter->dev, &addr, __i2c_check_addr_busy); if (!result && parent) result = i2c_check_mux_parents(parent, addr); return result; } /* recurse down mux tree */ static int i2c_check_mux_children(struct device *dev, void *addrp) { int result; if (dev->type == &i2c_adapter_type) result = device_for_each_child(dev, addrp, i2c_check_mux_children); else result = __i2c_check_addr_busy(dev, addrp); return result; } static int i2c_check_addr_busy(struct i2c_adapter *adapter, int addr) { struct i2c_adapter *parent = i2c_parent_is_i2c_adapter(adapter); int result = 0; if (parent) result = i2c_check_mux_parents(parent, addr); if (!result) result = device_for_each_child(&adapter->dev, &addr, i2c_check_mux_children); return result; } /** * i2c_adapter_lock_bus - Get exclusive access to an I2C bus segment * @adapter: Target I2C bus segment * @flags: I2C_LOCK_ROOT_ADAPTER locks the root i2c adapter, I2C_LOCK_SEGMENT * locks only this branch in the adapter tree */ static void i2c_adapter_lock_bus(struct i2c_adapter *adapter, unsigned int flags) { rt_mutex_lock_nested(&adapter->bus_lock, i2c_adapter_depth(adapter)); } /** * i2c_adapter_trylock_bus - Try to get exclusive access to an I2C bus segment * @adapter: Target I2C bus segment * @flags: I2C_LOCK_ROOT_ADAPTER trylocks the root i2c adapter, I2C_LOCK_SEGMENT * trylocks only this branch in the adapter tree */ static int i2c_adapter_trylock_bus(struct i2c_adapter *adapter, unsigned int flags) { return rt_mutex_trylock(&adapter->bus_lock); } /** * 
i2c_adapter_unlock_bus - Release exclusive access to an I2C bus segment * @adapter: Target I2C bus segment * @flags: I2C_LOCK_ROOT_ADAPTER unlocks the root i2c adapter, I2C_LOCK_SEGMENT * unlocks only this branch in the adapter tree */ static void i2c_adapter_unlock_bus(struct i2c_adapter *adapter, unsigned int flags) { rt_mutex_unlock(&adapter->bus_lock); } static void i2c_dev_set_name(struct i2c_adapter *adap, struct i2c_client *client, struct i2c_board_info const *info) { struct acpi_device *adev = ACPI_COMPANION(&client->dev); if (info && info->dev_name) { dev_set_name(&client->dev, "i2c-%s", info->dev_name); return; } if (adev) { dev_set_name(&client->dev, "i2c-%s", acpi_dev_name(adev)); return; } dev_set_name(&client->dev, "%d-%04x", i2c_adapter_id(adap), i2c_encode_flags_to_addr(client)); } int i2c_dev_irq_from_resources(const struct resource *resources, unsigned int num_resources) { struct irq_data *irqd; int i; for (i = 0; i < num_resources; i++) { const struct resource *r = &resources[i]; if (resource_type(r) != IORESOURCE_IRQ) continue; if (r->flags & IORESOURCE_BITS) { irqd = irq_get_irq_data(r->start); if (!irqd) break; irqd_set_trigger_type(irqd, r->flags & IORESOURCE_BITS); } return r->start; } return 0; } /* * Serialize device instantiation in case it can be instantiated explicitly * and by auto-detection */ static int i2c_lock_addr(struct i2c_adapter *adap, unsigned short addr, unsigned short flags) { if (!(flags & I2C_CLIENT_TEN) && test_and_set_bit(addr, adap->addrs_in_instantiation)) return -EBUSY; return 0; } static void i2c_unlock_addr(struct i2c_adapter *adap, unsigned short addr, unsigned short flags) { if (!(flags & I2C_CLIENT_TEN)) clear_bit(addr, adap->addrs_in_instantiation); } /** * i2c_new_client_device - instantiate an i2c device * @adap: the adapter managing the device * @info: describes one I2C device; bus_num is ignored * Context: can sleep * * Create an i2c device. Binding is handled through driver model * probe()/remove() methods. A driver may be bound to this device when we * return from this function, or any later moment (e.g. maybe hotplugging will * load the driver module). This call is not appropriate for use by mainboard * initialization logic, which usually runs during an arch_initcall() long * before any i2c_adapter could exist. * * This returns the new i2c client, which may be saved for later use with * i2c_unregister_device(); or an ERR_PTR to describe the error. */ struct i2c_client * i2c_new_client_device(struct i2c_adapter *adap, struct i2c_board_info const *info) { struct fwnode_handle *fwnode = info->fwnode; struct i2c_client *client; bool need_put = false; int status; client = kzalloc(sizeof *client, GFP_KERNEL); if (!client) return ERR_PTR(-ENOMEM); client->adapter = adap; client->dev.platform_data = info->platform_data; client->flags = info->flags; client->addr = info->addr; client->init_irq = info->irq; if (!client->init_irq) client->init_irq = i2c_dev_irq_from_resources(info->resources, info->num_resources); strscpy(client->name, info->type, sizeof(client->name)); status = i2c_check_addr_validity(client->addr, client->flags); if (status) { dev_err(&adap->dev, "Invalid %d-bit I2C address 0x%02hx\n", client->flags & I2C_CLIENT_TEN ? 
10 : 7, client->addr); goto out_err_silent; } status = i2c_lock_addr(adap, client->addr, client->flags); if (status) goto out_err_silent; /* Check for address business */ status = i2c_check_addr_busy(adap, i2c_encode_flags_to_addr(client)); if (status) goto out_err; client->dev.parent = &client->adapter->dev; client->dev.bus = &i2c_bus_type; client->dev.type = &i2c_client_type; device_enable_async_suspend(&client->dev); device_set_node(&client->dev, fwnode_handle_get(fwnode)); if (info->swnode) { status = device_add_software_node(&client->dev, info->swnode); if (status) { dev_err(&adap->dev, "Failed to add software node to client %s: %d\n", client->name, status); goto out_err_put_fwnode; } } i2c_dev_set_name(adap, client, info); status = device_register(&client->dev); if (status) goto out_remove_swnode; dev_dbg(&adap->dev, "client [%s] registered with bus id %s\n", client->name, dev_name(&client->dev)); i2c_unlock_addr(adap, client->addr, client->flags); return client; out_remove_swnode: device_remove_software_node(&client->dev); need_put = true; out_err_put_fwnode: fwnode_handle_put(fwnode); out_err: dev_err(&adap->dev, "Failed to register i2c client %s at 0x%02x (%d)\n", client->name, client->addr, status); i2c_unlock_addr(adap, client->addr, client->flags); out_err_silent: if (need_put) put_device(&client->dev); else kfree(client); return ERR_PTR(status); } EXPORT_SYMBOL_GPL(i2c_new_client_device); /** * i2c_unregister_device - reverse effect of i2c_new_*_device() * @client: value returned from i2c_new_*_device() * Context: can sleep */ void i2c_unregister_device(struct i2c_client *client) { struct fwnode_handle *fwnode; if (IS_ERR_OR_NULL(client)) return; fwnode = dev_fwnode(&client->dev); if (is_of_node(fwnode)) of_node_clear_flag(to_of_node(fwnode), OF_POPULATED); else if (is_acpi_device_node(fwnode)) acpi_device_clear_enumerated(to_acpi_device_node(fwnode)); /* * If the primary fwnode is a software node it is free-ed by * device_remove_software_node() below, avoid double-free. */ if (!is_software_node(fwnode)) fwnode_handle_put(fwnode); device_remove_software_node(&client->dev); device_unregister(&client->dev); } EXPORT_SYMBOL_GPL(i2c_unregister_device); /** * i2c_find_device_by_fwnode() - find an i2c_client for the fwnode * @fwnode: &struct fwnode_handle corresponding to the &struct i2c_client * * Look up and return the &struct i2c_client corresponding to the @fwnode. * If no client can be found, or @fwnode is NULL, this returns NULL. * * The user must call put_device(&client->dev) once done with the i2c client. */ struct i2c_client *i2c_find_device_by_fwnode(struct fwnode_handle *fwnode) { struct i2c_client *client; struct device *dev; if (!fwnode) return NULL; dev = bus_find_device_by_fwnode(&i2c_bus_type, fwnode); if (!dev) return NULL; client = i2c_verify_client(dev); if (!client) put_device(dev); return client; } EXPORT_SYMBOL(i2c_find_device_by_fwnode); static const struct i2c_device_id dummy_id[] = { { "dummy", }, { "smbus_host_notify", }, { } }; static int dummy_probe(struct i2c_client *client) { return 0; } static struct i2c_driver dummy_driver = { .driver.name = "dummy", .probe = dummy_probe, .id_table = dummy_id, }; /** * i2c_new_dummy_device - return a new i2c device bound to a dummy driver * @adapter: the adapter managing the device * @address: seven bit address to be used * Context: can sleep * * This returns an I2C client bound to the "dummy" driver, intended for use * with devices that consume multiple addresses. 
Examples of such chips * include various EEPROMS (like 24c04 and 24c08 models). * * These dummy devices have two main uses. First, most I2C and SMBus calls * except i2c_transfer() need a client handle; the dummy will be that handle. * And second, this prevents the specified address from being bound to a * different driver. * * This returns the new i2c client, which should be saved for later use with * i2c_unregister_device(); or an ERR_PTR to describe the error. */ struct i2c_client *i2c_new_dummy_device(struct i2c_adapter *adapter, u16 address) { struct i2c_board_info info = { I2C_BOARD_INFO("dummy", address), }; return i2c_new_client_device(adapter, &info); } EXPORT_SYMBOL_GPL(i2c_new_dummy_device); static void devm_i2c_release_dummy(void *client) { i2c_unregister_device(client); } /** * devm_i2c_new_dummy_device - return a new i2c device bound to a dummy driver * @dev: device the managed resource is bound to * @adapter: the adapter managing the device * @address: seven bit address to be used * Context: can sleep * * This is the device-managed version of @i2c_new_dummy_device. It returns the * new i2c client or an ERR_PTR in case of an error. */ struct i2c_client *devm_i2c_new_dummy_device(struct device *dev, struct i2c_adapter *adapter, u16 address) { struct i2c_client *client; int ret; client = i2c_new_dummy_device(adapter, address); if (IS_ERR(client)) return client; ret = devm_add_action_or_reset(dev, devm_i2c_release_dummy, client); if (ret) return ERR_PTR(ret); return client; } EXPORT_SYMBOL_GPL(devm_i2c_new_dummy_device); /** * i2c_new_ancillary_device - Helper to get the instantiated secondary address * and create the associated device * @client: Handle to the primary client * @name: Handle to specify which secondary address to get * @default_addr: Used as a fallback if no secondary address was specified * Context: can sleep * * I2C clients can be composed of multiple I2C slaves bound together in a single * component. The I2C client driver then binds to the master I2C slave and needs * to create I2C dummy clients to communicate with all the other slaves. * * This function creates and returns an I2C dummy client whose I2C address is * retrieved from the platform firmware based on the given slave name. If no * address is specified by the firmware default_addr is used. * * On DT-based platforms the address is retrieved from the "reg" property entry * cell whose "reg-names" value matches the slave name. * * This returns the new i2c client, which should be saved for later use with * i2c_unregister_device(); or an ERR_PTR to describe the error. 
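 *
 * A minimal usage sketch (the "rtc" name and the 0x6f fallback address are
 * purely illustrative, not taken from any real binding):
 *
 *	rtc = i2c_new_ancillary_device(client, "rtc", 0x6f);
 *	if (IS_ERR(rtc))
 *		return PTR_ERR(rtc);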
*/ struct i2c_client *i2c_new_ancillary_device(struct i2c_client *client, const char *name, u16 default_addr) { struct device_node *np = client->dev.of_node; u32 addr = default_addr; int i; i = of_property_match_string(np, "reg-names", name); if (i >= 0) of_property_read_u32_index(np, "reg", i, &addr); dev_dbg(&client->adapter->dev, "Address for %s : 0x%x\n", name, addr); return i2c_new_dummy_device(client->adapter, addr); } EXPORT_SYMBOL_GPL(i2c_new_ancillary_device); /* ------------------------------------------------------------------------- */ /* I2C bus adapters -- one roots each I2C or SMBUS segment */ static void i2c_adapter_dev_release(struct device *dev) { struct i2c_adapter *adap = to_i2c_adapter(dev); complete(&adap->dev_released); } unsigned int i2c_adapter_depth(struct i2c_adapter *adapter) { unsigned int depth = 0; struct device *parent; for (parent = adapter->dev.parent; parent; parent = parent->parent) if (parent->type == &i2c_adapter_type) depth++; WARN_ONCE(depth >= MAX_LOCKDEP_SUBCLASSES, "adapter depth exceeds lockdep subclass limit\n"); return depth; } EXPORT_SYMBOL_GPL(i2c_adapter_depth); /* * Let users instantiate I2C devices through sysfs. This can be used when * platform initialization code doesn't contain the proper data for * whatever reason. Also useful for drivers that do device detection and * detection fails, either because the device uses an unexpected address, * or this is a compatible device with different ID register values. * * Parameter checking may look overzealous, but we really don't want * the user to provide incorrect parameters. */ static ssize_t new_device_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct i2c_adapter *adap = to_i2c_adapter(dev); struct i2c_board_info info; struct i2c_client *client; char *blank, end; int res; memset(&info, 0, sizeof(struct i2c_board_info)); blank = strchr(buf, ' '); if (!blank) { dev_err(dev, "%s: Missing parameters\n", "new_device"); return -EINVAL; } if (blank - buf > I2C_NAME_SIZE - 1) { dev_err(dev, "%s: Invalid device name\n", "new_device"); return -EINVAL; } memcpy(info.type, buf, blank - buf); /* Parse remaining parameters, reject extra parameters */ res = sscanf(++blank, "%hi%c", &info.addr, &end); if (res < 1) { dev_err(dev, "%s: Can't parse I2C address\n", "new_device"); return -EINVAL; } if (res > 1 && end != '\n') { dev_err(dev, "%s: Extra parameters\n", "new_device"); return -EINVAL; } if ((info.addr & I2C_ADDR_OFFSET_TEN_BIT) == I2C_ADDR_OFFSET_TEN_BIT) { info.addr &= ~I2C_ADDR_OFFSET_TEN_BIT; info.flags |= I2C_CLIENT_TEN; } if (info.addr & I2C_ADDR_OFFSET_SLAVE) { info.addr &= ~I2C_ADDR_OFFSET_SLAVE; info.flags |= I2C_CLIENT_SLAVE; } client = i2c_new_client_device(adap, &info); if (IS_ERR(client)) return PTR_ERR(client); /* Keep track of the added device */ mutex_lock(&adap->userspace_clients_lock); list_add_tail(&client->detected, &adap->userspace_clients); mutex_unlock(&adap->userspace_clients_lock); dev_info(dev, "%s: Instantiated device %s at 0x%02hx\n", "new_device", info.type, info.addr); return count; } static DEVICE_ATTR_WO(new_device); /* * And of course let the users delete the devices they instantiated, if * they got it wrong. This interface can only be used to delete devices * instantiated by i2c_sysfs_new_device above. This guarantees that we * don't delete devices to which some kernel code still has references. * * Parameter checking may look overzealous, but we really don't want * the user to delete the wrong device. 
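 *
 * For reference, the attribute pair is normally driven from userspace along
 * these lines (bus number, device name and address are examples only):
 *
 *	echo eeprom 0x50 > /sys/bus/i2c/devices/i2c-3/new_device
 *	echo 0x50 > /sys/bus/i2c/devices/i2c-3/delete_device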
*/ static ssize_t delete_device_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct i2c_adapter *adap = to_i2c_adapter(dev); struct i2c_client *client, *next; unsigned short addr; char end; int res; /* Parse parameters, reject extra parameters */ res = sscanf(buf, "%hi%c", &addr, &end); if (res < 1) { dev_err(dev, "%s: Can't parse I2C address\n", "delete_device"); return -EINVAL; } if (res > 1 && end != '\n') { dev_err(dev, "%s: Extra parameters\n", "delete_device"); return -EINVAL; } /* Make sure the device was added through sysfs */ res = -ENOENT; mutex_lock_nested(&adap->userspace_clients_lock, i2c_adapter_depth(adap)); list_for_each_entry_safe(client, next, &adap->userspace_clients, detected) { if (i2c_encode_flags_to_addr(client) == addr) { dev_info(dev, "%s: Deleting device %s at 0x%02hx\n", "delete_device", client->name, client->addr); list_del(&client->detected); i2c_unregister_device(client); res = count; break; } } mutex_unlock(&adap->userspace_clients_lock); if (res < 0) dev_err(dev, "%s: Can't find device in list\n", "delete_device"); return res; } static DEVICE_ATTR_IGNORE_LOCKDEP(delete_device, S_IWUSR, NULL, delete_device_store); static struct attribute *i2c_adapter_attrs[] = { &dev_attr_name.attr, &dev_attr_new_device.attr, &dev_attr_delete_device.attr, NULL }; ATTRIBUTE_GROUPS(i2c_adapter); const struct device_type i2c_adapter_type = { .groups = i2c_adapter_groups, .release = i2c_adapter_dev_release, }; EXPORT_SYMBOL_GPL(i2c_adapter_type); /** * i2c_verify_adapter - return parameter as i2c_adapter or NULL * @dev: device, probably from some driver model iterator * * When traversing the driver model tree, perhaps using driver model * iterators like @device_for_each_child(), you can't assume very much * about the nodes you find. Use this function to avoid oopses caused * by wrongly treating some non-I2C device as an i2c_adapter. */ struct i2c_adapter *i2c_verify_adapter(struct device *dev) { return (dev->type == &i2c_adapter_type) ? 
to_i2c_adapter(dev) : NULL; } EXPORT_SYMBOL(i2c_verify_adapter); static void i2c_scan_static_board_info(struct i2c_adapter *adapter) { struct i2c_devinfo *devinfo; down_read(&__i2c_board_lock); list_for_each_entry(devinfo, &__i2c_board_list, list) { if (devinfo->busnum == adapter->nr && IS_ERR(i2c_new_client_device(adapter, &devinfo->board_info))) dev_err(&adapter->dev, "Can't create device at 0x%02x\n", devinfo->board_info.addr); } up_read(&__i2c_board_lock); } static int i2c_do_add_adapter(struct i2c_driver *driver, struct i2c_adapter *adap) { /* Detect supported devices on that bus, and instantiate them */ i2c_detect(adap, driver); return 0; } static int __process_new_adapter(struct device_driver *d, void *data) { return i2c_do_add_adapter(to_i2c_driver(d), data); } static const struct i2c_lock_operations i2c_adapter_lock_ops = { .lock_bus = i2c_adapter_lock_bus, .trylock_bus = i2c_adapter_trylock_bus, .unlock_bus = i2c_adapter_unlock_bus, }; static void i2c_host_notify_irq_teardown(struct i2c_adapter *adap) { struct irq_domain *domain = adap->host_notify_domain; irq_hw_number_t hwirq; if (!domain) return; for (hwirq = 0 ; hwirq < I2C_ADDR_7BITS_COUNT ; hwirq++) irq_dispose_mapping(irq_find_mapping(domain, hwirq)); irq_domain_remove(domain); adap->host_notify_domain = NULL; } static int i2c_host_notify_irq_map(struct irq_domain *h, unsigned int virq, irq_hw_number_t hw_irq_num) { irq_set_chip_and_handler(virq, &dummy_irq_chip, handle_simple_irq); return 0; } static const struct irq_domain_ops i2c_host_notify_irq_ops = { .map = i2c_host_notify_irq_map, }; static int i2c_setup_host_notify_irq_domain(struct i2c_adapter *adap) { struct irq_domain *domain; if (!i2c_check_functionality(adap, I2C_FUNC_SMBUS_HOST_NOTIFY)) return 0; domain = irq_domain_create_linear(adap->dev.parent->fwnode, I2C_ADDR_7BITS_COUNT, &i2c_host_notify_irq_ops, adap); if (!domain) return -ENOMEM; adap->host_notify_domain = domain; return 0; } /** * i2c_handle_smbus_host_notify - Forward a Host Notify event to the correct * I2C client. * @adap: the adapter * @addr: the I2C address of the notifying device * Context: can't sleep * * Helper function to be called from an I2C bus driver's interrupt * handler. It will schedule the Host Notify IRQ. 
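 *
 * A sketch of how a bus driver might forward the event from its interrupt
 * handler (the foo_* names, register and bit layout are hypothetical):
 *
 *	static irqreturn_t foo_i2c_isr(int irq, void *dev_id)
 *	{
 *		struct foo_i2c *bus = dev_id;
 *		u32 stat = readl(bus->base + FOO_STATUS);
 *
 *		if (stat & FOO_HOST_NOTIFY)
 *			i2c_handle_smbus_host_notify(&bus->adap,
 *						     FOO_NOTIFY_ADDR(stat));
 *
 *		return IRQ_HANDLED;
 *	}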
*/ int i2c_handle_smbus_host_notify(struct i2c_adapter *adap, unsigned short addr) { int irq; if (!adap) return -EINVAL; dev_dbg(&adap->dev, "Detected HostNotify from address 0x%02x", addr); irq = irq_find_mapping(adap->host_notify_domain, addr); if (irq <= 0) return -ENXIO; generic_handle_irq_safe(irq); return 0; } EXPORT_SYMBOL_GPL(i2c_handle_smbus_host_notify); static int i2c_register_adapter(struct i2c_adapter *adap) { int res = -EINVAL; /* Can't register until after driver model init */ if (WARN_ON(!is_registered)) { res = -EAGAIN; goto out_list; } /* Sanity checks */ if (WARN(!adap->name[0], "i2c adapter has no name")) goto out_list; if (!adap->algo) { pr_err("adapter '%s': no algo supplied!\n", adap->name); goto out_list; } if (!adap->lock_ops) adap->lock_ops = &i2c_adapter_lock_ops; adap->locked_flags = 0; rt_mutex_init(&adap->bus_lock); rt_mutex_init(&adap->mux_lock); mutex_init(&adap->userspace_clients_lock); INIT_LIST_HEAD(&adap->userspace_clients); /* Set default timeout to 1 second if not already set */ if (adap->timeout == 0) adap->timeout = HZ; /* register soft irqs for Host Notify */ res = i2c_setup_host_notify_irq_domain(adap); if (res) { pr_err("adapter '%s': can't create Host Notify IRQs (%d)\n", adap->name, res); goto out_list; } dev_set_name(&adap->dev, "i2c-%d", adap->nr); adap->dev.bus = &i2c_bus_type; adap->dev.type = &i2c_adapter_type; device_initialize(&adap->dev); /* * This adapter can be used as a parent immediately after device_add(), * setup runtime-pm (especially ignore-children) before hand. */ device_enable_async_suspend(&adap->dev); pm_runtime_no_callbacks(&adap->dev); pm_suspend_ignore_children(&adap->dev, true); pm_runtime_enable(&adap->dev); res = device_add(&adap->dev); if (res) { pr_err("adapter '%s': can't register device (%d)\n", adap->name, res); put_device(&adap->dev); goto out_list; } adap->debugfs = debugfs_create_dir(dev_name(&adap->dev), i2c_debugfs_root); res = i2c_setup_smbus_alert(adap); if (res) goto out_reg; res = i2c_init_recovery(adap); if (res == -EPROBE_DEFER) goto out_reg; dev_dbg(&adap->dev, "adapter [%s] registered\n", adap->name); /* create pre-declared device nodes */ of_i2c_register_devices(adap); i2c_acpi_install_space_handler(adap); i2c_acpi_register_devices(adap); if (adap->nr < __i2c_first_dynamic_bus_num) i2c_scan_static_board_info(adap); /* Notify drivers */ mutex_lock(&core_lock); bus_for_each_drv(&i2c_bus_type, NULL, adap, __process_new_adapter); mutex_unlock(&core_lock); return 0; out_reg: debugfs_remove_recursive(adap->debugfs); init_completion(&adap->dev_released); device_unregister(&adap->dev); wait_for_completion(&adap->dev_released); out_list: mutex_lock(&core_lock); idr_remove(&i2c_adapter_idr, adap->nr); mutex_unlock(&core_lock); return res; } /** * __i2c_add_numbered_adapter - i2c_add_numbered_adapter where nr is never -1 * @adap: the adapter to register (with adap->nr initialized) * Context: can sleep * * See i2c_add_numbered_adapter() for details. */ static int __i2c_add_numbered_adapter(struct i2c_adapter *adap) { int id; mutex_lock(&core_lock); id = idr_alloc(&i2c_adapter_idr, adap, adap->nr, adap->nr + 1, GFP_KERNEL); mutex_unlock(&core_lock); if (WARN(id < 0, "couldn't get idr")) return id == -ENOSPC ? -EBUSY : id; return i2c_register_adapter(adap); } /** * i2c_add_adapter - declare i2c adapter, use dynamic bus number * @adapter: the adapter to add * Context: can sleep * * This routine is used to declare an I2C adapter when its bus number * doesn't matter or when its bus number is specified by an dt alias. 
* Examples of cases when the bus number doesn't matter: I2C adapters * dynamically added by USB links or PCI plugin cards. * * When this returns zero, a new bus number was allocated and stored * in adap->nr, and the specified adapter became available for clients. * Otherwise, a negative errno value is returned. */ int i2c_add_adapter(struct i2c_adapter *adapter) { struct device *dev = &adapter->dev; int id; id = of_alias_get_id(dev->of_node, "i2c"); if (id >= 0) { adapter->nr = id; return __i2c_add_numbered_adapter(adapter); } mutex_lock(&core_lock); id = idr_alloc(&i2c_adapter_idr, adapter, __i2c_first_dynamic_bus_num, 0, GFP_KERNEL); mutex_unlock(&core_lock); if (WARN(id < 0, "couldn't get idr")) return id; adapter->nr = id; return i2c_register_adapter(adapter); } EXPORT_SYMBOL(i2c_add_adapter); /** * i2c_add_numbered_adapter - declare i2c adapter, use static bus number * @adap: the adapter to register (with adap->nr initialized) * Context: can sleep * * This routine is used to declare an I2C adapter when its bus number * matters. For example, use it for I2C adapters from system-on-chip CPUs, * or otherwise built into the system's mainboard, and where i2c_board_info * is used to properly configure I2C devices. * * If the requested bus number is set to -1, then this function will behave * identically to i2c_add_adapter, and will dynamically assign a bus number. * * If no devices have been pre-declared for this bus, then be sure to * register the adapter before any dynamically allocated ones. Otherwise * the required bus ID may not be available. * * When this returns zero, the specified adapter became available for * clients using the bus number provided in adap->nr. Also, the table * of I2C devices pre-declared using i2c_register_board_info() is scanned, * and the appropriate driver model device nodes are created. Otherwise, a * negative errno value is returned. */ int i2c_add_numbered_adapter(struct i2c_adapter *adap) { if (adap->nr == -1) /* -1 means dynamically assign bus id */ return i2c_add_adapter(adap); return __i2c_add_numbered_adapter(adap); } EXPORT_SYMBOL_GPL(i2c_add_numbered_adapter); static void i2c_do_del_adapter(struct i2c_driver *driver, struct i2c_adapter *adapter) { struct i2c_client *client, *_n; /* Remove the devices we created ourselves as the result of hardware * probing (using a driver's detect method) */ list_for_each_entry_safe(client, _n, &driver->clients, detected) { if (client->adapter == adapter) { dev_dbg(&adapter->dev, "Removing %s at 0x%x\n", client->name, client->addr); list_del(&client->detected); i2c_unregister_device(client); } } } static int __unregister_client(struct device *dev, void *dummy) { struct i2c_client *client = i2c_verify_client(dev); if (client && strcmp(client->name, "dummy")) i2c_unregister_device(client); return 0; } static int __unregister_dummy(struct device *dev, void *dummy) { struct i2c_client *client = i2c_verify_client(dev); i2c_unregister_device(client); return 0; } static int __process_removed_adapter(struct device_driver *d, void *data) { i2c_do_del_adapter(to_i2c_driver(d), data); return 0; } /** * i2c_del_adapter - unregister I2C adapter * @adap: the adapter being unregistered * Context: can sleep * * This unregisters an I2C adapter which was previously registered * by @i2c_add_adapter or @i2c_add_numbered_adapter. 
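 *
 * A typical pairing in a bus driver's remove path (the foo_* names are
 * illustrative only):
 *
 *	static void foo_i2c_remove(struct platform_device *pdev)
 *	{
 *		struct foo_i2c *bus = platform_get_drvdata(pdev);
 *
 *		i2c_del_adapter(&bus->adap);
 *	}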
*/ void i2c_del_adapter(struct i2c_adapter *adap) { struct i2c_adapter *found; struct i2c_client *client, *next; /* First make sure that this adapter was ever added */ mutex_lock(&core_lock); found = idr_find(&i2c_adapter_idr, adap->nr); mutex_unlock(&core_lock); if (found != adap) { pr_debug("attempting to delete unregistered adapter [%s]\n", adap->name); return; } i2c_acpi_remove_space_handler(adap); /* Tell drivers about this removal */ mutex_lock(&core_lock); bus_for_each_drv(&i2c_bus_type, NULL, adap, __process_removed_adapter); mutex_unlock(&core_lock); /* Remove devices instantiated from sysfs */ mutex_lock_nested(&adap->userspace_clients_lock, i2c_adapter_depth(adap)); list_for_each_entry_safe(client, next, &adap->userspace_clients, detected) { dev_dbg(&adap->dev, "Removing %s at 0x%x\n", client->name, client->addr); list_del(&client->detected); i2c_unregister_device(client); } mutex_unlock(&adap->userspace_clients_lock); /* Detach any active clients. This can't fail, thus we do not * check the returned value. This is a two-pass process, because * we can't remove the dummy devices during the first pass: they * could have been instantiated by real devices wishing to clean * them up properly, so we give them a chance to do that first. */ device_for_each_child(&adap->dev, NULL, __unregister_client); device_for_each_child(&adap->dev, NULL, __unregister_dummy); /* device name is gone after device_unregister */ dev_dbg(&adap->dev, "adapter [%s] unregistered\n", adap->name); pm_runtime_disable(&adap->dev); i2c_host_notify_irq_teardown(adap); debugfs_remove_recursive(adap->debugfs); /* wait until all references to the device are gone * * FIXME: This is old code and should ideally be replaced by an * alternative which results in decoupling the lifetime of the struct * device from the i2c_adapter, like spi or netdev do. Any solution * should be thoroughly tested with DEBUG_KOBJECT_RELEASE enabled! */ init_completion(&adap->dev_released); device_unregister(&adap->dev); wait_for_completion(&adap->dev_released); /* free bus id */ mutex_lock(&core_lock); idr_remove(&i2c_adapter_idr, adap->nr); mutex_unlock(&core_lock); /* Clear the device structure in case this adapter is ever going to be added again */ memset(&adap->dev, 0, sizeof(adap->dev)); } EXPORT_SYMBOL(i2c_del_adapter); static void devm_i2c_del_adapter(void *adapter) { i2c_del_adapter(adapter); } /** * devm_i2c_add_adapter - device-managed variant of i2c_add_adapter() * @dev: managing device for adding this I2C adapter * @adapter: the adapter to add * Context: can sleep * * Add adapter with dynamic bus number, same with i2c_add_adapter() * but the adapter will be auto deleted on driver detach. */ int devm_i2c_add_adapter(struct device *dev, struct i2c_adapter *adapter) { int ret; ret = i2c_add_adapter(adapter); if (ret) return ret; return devm_add_action_or_reset(dev, devm_i2c_del_adapter, adapter); } EXPORT_SYMBOL_GPL(devm_i2c_add_adapter); static int i2c_dev_or_parent_fwnode_match(struct device *dev, const void *data) { if (dev_fwnode(dev) == data) return 1; if (dev->parent && dev_fwnode(dev->parent) == data) return 1; return 0; } /** * i2c_find_adapter_by_fwnode() - find an i2c_adapter for the fwnode * @fwnode: &struct fwnode_handle corresponding to the &struct i2c_adapter * * Look up and return the &struct i2c_adapter corresponding to the @fwnode. * If no adapter can be found, or @fwnode is NULL, this returns NULL. * * The user must call put_device(&adapter->dev) once done with the i2c adapter. 
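 *
 * A usage sketch (error handling trimmed, acquisition of the fwnode not
 * shown):
 *
 *	adapter = i2c_find_adapter_by_fwnode(fwnode);
 *	if (!adapter)
 *		return -ENODEV;
 *	... talk to devices on the adapter ...
 *	put_device(&adapter->dev);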
*/ struct i2c_adapter *i2c_find_adapter_by_fwnode(struct fwnode_handle *fwnode) { struct i2c_adapter *adapter; struct device *dev; if (!fwnode) return NULL; dev = bus_find_device(&i2c_bus_type, NULL, fwnode, i2c_dev_or_parent_fwnode_match); if (!dev) return NULL; adapter = i2c_verify_adapter(dev); if (!adapter) put_device(dev); return adapter; } EXPORT_SYMBOL(i2c_find_adapter_by_fwnode); /** * i2c_get_adapter_by_fwnode() - find an i2c_adapter for the fwnode * @fwnode: &struct fwnode_handle corresponding to the &struct i2c_adapter * * Look up and return the &struct i2c_adapter corresponding to the @fwnode, * and increment the adapter module's use count. If no adapter can be found, * or @fwnode is NULL, this returns NULL. * * The user must call i2c_put_adapter(adapter) once done with the i2c adapter. * Note that this is different from i2c_find_adapter_by_node(). */ struct i2c_adapter *i2c_get_adapter_by_fwnode(struct fwnode_handle *fwnode) { struct i2c_adapter *adapter; adapter = i2c_find_adapter_by_fwnode(fwnode); if (!adapter) return NULL; if (!try_module_get(adapter->owner)) { put_device(&adapter->dev); adapter = NULL; } return adapter; } EXPORT_SYMBOL(i2c_get_adapter_by_fwnode); static void i2c_parse_timing(struct device *dev, char *prop_name, u32 *cur_val_p, u32 def_val, bool use_def) { int ret; ret = device_property_read_u32(dev, prop_name, cur_val_p); if (ret && use_def) *cur_val_p = def_val; dev_dbg(dev, "%s: %u\n", prop_name, *cur_val_p); } /** * i2c_parse_fw_timings - get I2C related timing parameters from firmware * @dev: The device to scan for I2C timing properties * @t: the i2c_timings struct to be filled with values * @use_defaults: bool to use sane defaults derived from the I2C specification * when properties are not found, otherwise don't update * * Scan the device for the generic I2C properties describing timing parameters * for the signal and fill the given struct with the results. If a property was * not found and use_defaults was true, then maximum timings are assumed which * are derived from the I2C specification. If use_defaults is not used, the * results will be as before, so drivers can apply their own defaults before * calling this helper. The latter is mainly intended for avoiding regressions * of existing drivers which want to switch to this function. New drivers * almost always should use the defaults. */ void i2c_parse_fw_timings(struct device *dev, struct i2c_timings *t, bool use_defaults) { bool u = use_defaults; u32 d; i2c_parse_timing(dev, "clock-frequency", &t->bus_freq_hz, I2C_MAX_STANDARD_MODE_FREQ, u); d = t->bus_freq_hz <= I2C_MAX_STANDARD_MODE_FREQ ? 1000 : t->bus_freq_hz <= I2C_MAX_FAST_MODE_FREQ ? 300 : 120; i2c_parse_timing(dev, "i2c-scl-rising-time-ns", &t->scl_rise_ns, d, u); d = t->bus_freq_hz <= I2C_MAX_FAST_MODE_FREQ ? 
300 : 120; i2c_parse_timing(dev, "i2c-scl-falling-time-ns", &t->scl_fall_ns, d, u); i2c_parse_timing(dev, "i2c-scl-internal-delay-ns", &t->scl_int_delay_ns, 0, u); i2c_parse_timing(dev, "i2c-sda-falling-time-ns", &t->sda_fall_ns, t->scl_fall_ns, u); i2c_parse_timing(dev, "i2c-sda-hold-time-ns", &t->sda_hold_ns, 0, u); i2c_parse_timing(dev, "i2c-digital-filter-width-ns", &t->digital_filter_width_ns, 0, u); i2c_parse_timing(dev, "i2c-analog-filter-cutoff-frequency", &t->analog_filter_cutoff_freq_hz, 0, u); } EXPORT_SYMBOL_GPL(i2c_parse_fw_timings); /* ------------------------------------------------------------------------- */ int i2c_for_each_dev(void *data, int (*fn)(struct device *dev, void *data)) { int res; mutex_lock(&core_lock); res = bus_for_each_dev(&i2c_bus_type, NULL, data, fn); mutex_unlock(&core_lock); return res; } EXPORT_SYMBOL_GPL(i2c_for_each_dev); static int __process_new_driver(struct device *dev, void *data) { if (dev->type != &i2c_adapter_type) return 0; return i2c_do_add_adapter(data, to_i2c_adapter(dev)); } /* * An i2c_driver is used with one or more i2c_client (device) nodes to access * i2c slave chips, on a bus instance associated with some i2c_adapter. */ int i2c_register_driver(struct module *owner, struct i2c_driver *driver) { int res; /* Can't register until after driver model init */ if (WARN_ON(!is_registered)) return -EAGAIN; /* add the driver to the list of i2c drivers in the driver core */ driver->driver.owner = owner; driver->driver.bus = &i2c_bus_type; INIT_LIST_HEAD(&driver->clients); /* When registration returns, the driver core * will have called probe() for all matching-but-unbound devices. */ res = driver_register(&driver->driver); if (res) return res; pr_debug("driver [%s] registered\n", driver->driver.name); /* Walk the adapters that are already present */ i2c_for_each_dev(driver, __process_new_driver); return 0; } EXPORT_SYMBOL(i2c_register_driver); static int __process_removed_driver(struct device *dev, void *data) { if (dev->type == &i2c_adapter_type) i2c_do_del_adapter(data, to_i2c_adapter(dev)); return 0; } /** * i2c_del_driver - unregister I2C driver * @driver: the driver being unregistered * Context: can sleep */ void i2c_del_driver(struct i2c_driver *driver) { i2c_for_each_dev(driver, __process_removed_driver); driver_unregister(&driver->driver); pr_debug("driver [%s] unregistered\n", driver->driver.name); } EXPORT_SYMBOL(i2c_del_driver); /* ------------------------------------------------------------------------- */ struct i2c_cmd_arg { unsigned cmd; void *arg; }; static int i2c_cmd(struct device *dev, void *_arg) { struct i2c_client *client = i2c_verify_client(dev); struct i2c_cmd_arg *arg = _arg; struct i2c_driver *driver; if (!client || !client->dev.driver) return 0; driver = to_i2c_driver(client->dev.driver); if (driver->command) driver->command(client, arg->cmd, arg->arg); return 0; } void i2c_clients_command(struct i2c_adapter *adap, unsigned int cmd, void *arg) { struct i2c_cmd_arg cmd_arg; cmd_arg.cmd = cmd; cmd_arg.arg = arg; device_for_each_child(&adap->dev, &cmd_arg, i2c_cmd); } EXPORT_SYMBOL(i2c_clients_command); static int __init i2c_init(void) { int retval; retval = of_alias_get_highest_id("i2c"); down_write(&__i2c_board_lock); if (retval >= __i2c_first_dynamic_bus_num) __i2c_first_dynamic_bus_num = retval + 1; up_write(&__i2c_board_lock); retval = bus_register(&i2c_bus_type); if (retval) return retval; is_registered = true; i2c_debugfs_root = debugfs_create_dir("i2c", NULL); retval = i2c_add_driver(&dummy_driver); if (retval) 
goto class_err; if (IS_ENABLED(CONFIG_OF_DYNAMIC)) WARN_ON(of_reconfig_notifier_register(&i2c_of_notifier)); if (IS_ENABLED(CONFIG_ACPI)) WARN_ON(acpi_reconfig_notifier_register(&i2c_acpi_notifier)); return 0; class_err: is_registered = false; bus_unregister(&i2c_bus_type); return retval; } static void __exit i2c_exit(void) { if (IS_ENABLED(CONFIG_ACPI)) WARN_ON(acpi_reconfig_notifier_unregister(&i2c_acpi_notifier)); if (IS_ENABLED(CONFIG_OF_DYNAMIC)) WARN_ON(of_reconfig_notifier_unregister(&i2c_of_notifier)); i2c_del_driver(&dummy_driver); debugfs_remove_recursive(i2c_debugfs_root); bus_unregister(&i2c_bus_type); tracepoint_synchronize_unregister(); } /* We must initialize early, because some subsystems register i2c drivers * in subsys_initcall() code, but are linked (and initialized) before i2c. */ postcore_initcall(i2c_init); module_exit(i2c_exit); /* ---------------------------------------------------- * the functional interface to the i2c busses. * ---------------------------------------------------- */ /* Check if val is exceeding the quirk IFF quirk is non 0 */ #define i2c_quirk_exceeded(val, quirk) ((quirk) && ((val) > (quirk))) static int i2c_quirk_error(struct i2c_adapter *adap, struct i2c_msg *msg, char *err_msg) { dev_err_ratelimited(&adap->dev, "adapter quirk: %s (addr 0x%04x, size %u, %s)\n", err_msg, msg->addr, msg->len, str_read_write(msg->flags & I2C_M_RD)); return -EOPNOTSUPP; } static int i2c_check_for_quirks(struct i2c_adapter *adap, struct i2c_msg *msgs, int num) { const struct i2c_adapter_quirks *q = adap->quirks; int max_num = q->max_num_msgs, i; bool do_len_check = true; if (q->flags & I2C_AQ_COMB) { max_num = 2; /* special checks for combined messages */ if (num == 2) { if (q->flags & I2C_AQ_COMB_WRITE_FIRST && msgs[0].flags & I2C_M_RD) return i2c_quirk_error(adap, &msgs[0], "1st comb msg must be write"); if (q->flags & I2C_AQ_COMB_READ_SECOND && !(msgs[1].flags & I2C_M_RD)) return i2c_quirk_error(adap, &msgs[1], "2nd comb msg must be read"); if (q->flags & I2C_AQ_COMB_SAME_ADDR && msgs[0].addr != msgs[1].addr) return i2c_quirk_error(adap, &msgs[0], "comb msg only to same addr"); if (i2c_quirk_exceeded(msgs[0].len, q->max_comb_1st_msg_len)) return i2c_quirk_error(adap, &msgs[0], "msg too long"); if (i2c_quirk_exceeded(msgs[1].len, q->max_comb_2nd_msg_len)) return i2c_quirk_error(adap, &msgs[1], "msg too long"); do_len_check = false; } } if (i2c_quirk_exceeded(num, max_num)) return i2c_quirk_error(adap, &msgs[0], "too many messages"); for (i = 0; i < num; i++) { u16 len = msgs[i].len; if (msgs[i].flags & I2C_M_RD) { if (do_len_check && i2c_quirk_exceeded(len, q->max_read_len)) return i2c_quirk_error(adap, &msgs[i], "msg too long"); if (q->flags & I2C_AQ_NO_ZERO_LEN_READ && len == 0) return i2c_quirk_error(adap, &msgs[i], "no zero length"); } else { if (do_len_check && i2c_quirk_exceeded(len, q->max_write_len)) return i2c_quirk_error(adap, &msgs[i], "msg too long"); if (q->flags & I2C_AQ_NO_ZERO_LEN_WRITE && len == 0) return i2c_quirk_error(adap, &msgs[i], "no zero length"); } } return 0; } /** * __i2c_transfer - unlocked flavor of i2c_transfer * @adap: Handle to I2C bus * @msgs: One or more messages to execute before STOP is issued to * terminate the operation; each message begins with a START. * @num: Number of messages to be executed. * * Returns negative errno, else the number of messages executed. * * Adapter lock must be held when calling this function. No debug logging * takes place. 
*/ int __i2c_transfer(struct i2c_adapter *adap, struct i2c_msg *msgs, int num) { unsigned long orig_jiffies; int ret, try; if (!adap->algo->master_xfer) { dev_dbg(&adap->dev, "I2C level transfers not supported\n"); return -EOPNOTSUPP; } if (WARN_ON(!msgs || num < 1)) return -EINVAL; ret = __i2c_check_suspended(adap); if (ret) return ret; if (adap->quirks && i2c_check_for_quirks(adap, msgs, num)) return -EOPNOTSUPP; /* * i2c_trace_msg_key gets enabled when tracepoint i2c_transfer gets * enabled. This is an efficient way of keeping the for-loop from * being executed when not needed. */ if (static_branch_unlikely(&i2c_trace_msg_key)) { int i; for (i = 0; i < num; i++) if (msgs[i].flags & I2C_M_RD) trace_i2c_read(adap, &msgs[i], i); else trace_i2c_write(adap, &msgs[i], i); } /* Retry automatically on arbitration loss */ orig_jiffies = jiffies; for (ret = 0, try = 0; try <= adap->retries; try++) { if (i2c_in_atomic_xfer_mode() && adap->algo->master_xfer_atomic) ret = adap->algo->master_xfer_atomic(adap, msgs, num); else ret = adap->algo->master_xfer(adap, msgs, num); if (ret != -EAGAIN) break; if (time_after(jiffies, orig_jiffies + adap->timeout)) break; } if (static_branch_unlikely(&i2c_trace_msg_key)) { int i; for (i = 0; i < ret; i++) if (msgs[i].flags & I2C_M_RD) trace_i2c_reply(adap, &msgs[i], i); trace_i2c_result(adap, num, ret); } return ret; } EXPORT_SYMBOL(__i2c_transfer); /** * i2c_transfer - execute a single or combined I2C message * @adap: Handle to I2C bus * @msgs: One or more messages to execute before STOP is issued to * terminate the operation; each message begins with a START. * @num: Number of messages to be executed. * * Returns negative errno, else the number of messages executed. * * Note that there is no requirement that each message be sent to * the same slave address, although that is the most common model. */ int i2c_transfer(struct i2c_adapter *adap, struct i2c_msg *msgs, int num) { int ret; /* REVISIT the fault reporting model here is weak: * * - When we get an error after receiving N bytes from a slave, * there is no way to report "N". * * - When we get a NAK after transmitting N bytes to a slave, * there is no way to report "N" ... or to let the master * continue executing the rest of this combined message, if * that's the appropriate response. * * - When for example "num" is two and we successfully complete * the first message but get an error part way through the * second, it's unclear whether that should be reported as * one (discarding status on the second message) or errno * (discarding status on the first one). */ ret = __i2c_lock_bus_helper(adap); if (ret) return ret; ret = __i2c_transfer(adap, msgs, num); i2c_unlock_bus(adap, I2C_LOCK_SEGMENT); return ret; } EXPORT_SYMBOL(i2c_transfer); /** * i2c_transfer_buffer_flags - issue a single I2C message transferring data * to/from a buffer * @client: Handle to slave device * @buf: Where the data is stored * @count: How many bytes to transfer, must be less than 64k since msg.len is u16 * @flags: The flags to be used for the message, e.g. I2C_M_RD for reads * * Returns negative errno, or else the number of bytes transferred. */ int i2c_transfer_buffer_flags(const struct i2c_client *client, char *buf, int count, u16 flags) { int ret; struct i2c_msg msg = { .addr = client->addr, .flags = flags | (client->flags & I2C_M_TEN), .len = count, .buf = buf, }; ret = i2c_transfer(client->adapter, &msg, 1); /* * If everything went ok (i.e. 1 msg transferred), return #bytes * transferred, else error code. */ return (ret == 1) ? 
count : ret; } EXPORT_SYMBOL(i2c_transfer_buffer_flags); /** * i2c_get_device_id - get manufacturer, part id and die revision of a device * @client: The device to query * @id: The queried information * * Returns negative errno on error, zero on success. */ int i2c_get_device_id(const struct i2c_client *client, struct i2c_device_identity *id) { struct i2c_adapter *adap = client->adapter; union i2c_smbus_data raw_id; int ret; if (!i2c_check_functionality(adap, I2C_FUNC_SMBUS_READ_I2C_BLOCK)) return -EOPNOTSUPP; raw_id.block[0] = 3; ret = i2c_smbus_xfer(adap, I2C_ADDR_DEVICE_ID, 0, I2C_SMBUS_READ, client->addr << 1, I2C_SMBUS_I2C_BLOCK_DATA, &raw_id); if (ret) return ret; id->manufacturer_id = (raw_id.block[1] << 4) | (raw_id.block[2] >> 4); id->part_id = ((raw_id.block[2] & 0xf) << 5) | (raw_id.block[3] >> 3); id->die_revision = raw_id.block[3] & 0x7; return 0; } EXPORT_SYMBOL_GPL(i2c_get_device_id); /** * i2c_client_get_device_id - get the driver match table entry of a device * @client: the device to query. The device must be bound to a driver * * Returns a pointer to the matching entry if found, NULL otherwise. */ const struct i2c_device_id *i2c_client_get_device_id(const struct i2c_client *client) { const struct i2c_driver *drv = to_i2c_driver(client->dev.driver); return i2c_match_id(drv->id_table, client); } EXPORT_SYMBOL_GPL(i2c_client_get_device_id); /* ---------------------------------------------------- * the i2c address scanning function * Will not work for 10-bit addresses! * ---------------------------------------------------- */ /* * Legacy default probe function, mostly relevant for SMBus. The default * probe method is a quick write, but it is known to corrupt the 24RF08 * EEPROMs due to a state machine bug, and could also irreversibly * write-protect some EEPROMs, so for address ranges 0x30-0x37 and 0x50-0x5f, * we use a short byte read instead. Also, some bus drivers don't implement * quick write, so we fallback to a byte read in that case too. * On x86, there is another special case for FSC hardware monitoring chips, * which want regular byte reads (address 0x73.) Fortunately, these are the * only known chips using this I2C address on PC hardware. * Returns 1 if probe succeeded, 0 if not. 
*/ static int i2c_default_probe(struct i2c_adapter *adap, unsigned short addr) { int err; union i2c_smbus_data dummy; #ifdef CONFIG_X86 if (addr == 0x73 && (adap->class & I2C_CLASS_HWMON) && i2c_check_functionality(adap, I2C_FUNC_SMBUS_READ_BYTE_DATA)) err = i2c_smbus_xfer(adap, addr, 0, I2C_SMBUS_READ, 0, I2C_SMBUS_BYTE_DATA, &dummy); else #endif if (!((addr & ~0x07) == 0x30 || (addr & ~0x0f) == 0x50) && i2c_check_functionality(adap, I2C_FUNC_SMBUS_QUICK)) err = i2c_smbus_xfer(adap, addr, 0, I2C_SMBUS_WRITE, 0, I2C_SMBUS_QUICK, NULL); else if (i2c_check_functionality(adap, I2C_FUNC_SMBUS_READ_BYTE)) err = i2c_smbus_xfer(adap, addr, 0, I2C_SMBUS_READ, 0, I2C_SMBUS_BYTE, &dummy); else { dev_warn(&adap->dev, "No suitable probing method supported for address 0x%02X\n", addr); err = -EOPNOTSUPP; } return err >= 0; } static int i2c_detect_address(struct i2c_client *temp_client, struct i2c_driver *driver) { struct i2c_board_info info; struct i2c_adapter *adapter = temp_client->adapter; int addr = temp_client->addr; int err; /* Make sure the address is valid */ err = i2c_check_7bit_addr_validity_strict(addr); if (err) { dev_warn(&adapter->dev, "Invalid probe address 0x%02x\n", addr); return err; } /* Skip if already in use (7 bit, no need to encode flags) */ if (i2c_check_addr_busy(adapter, addr)) return 0; /* Make sure there is something at this address */ if (!i2c_default_probe(adapter, addr)) return 0; /* Finally call the custom detection function */ memset(&info, 0, sizeof(struct i2c_board_info)); info.addr = addr; err = driver->detect(temp_client, &info); if (err) { /* -ENODEV is returned if the detection fails. We catch it here as this isn't an error. */ return err == -ENODEV ? 0 : err; } /* Consistency check */ if (info.type[0] == '\0') { dev_err(&adapter->dev, "%s detection function provided no name for 0x%x\n", driver->driver.name, addr); } else { struct i2c_client *client; /* Detection succeeded, instantiate the device */ if (adapter->class & I2C_CLASS_DEPRECATED) dev_warn(&adapter->dev, "This adapter will soon drop class based instantiation of devices. " "Please make sure client 0x%02x gets instantiated by other means. " "Check 'Documentation/i2c/instantiating-devices.rst' for details.\n", info.addr); dev_dbg(&adapter->dev, "Creating %s at 0x%02x\n", info.type, info.addr); client = i2c_new_client_device(adapter, &info); if (!IS_ERR(client)) list_add_tail(&client->detected, &driver->clients); else dev_err(&adapter->dev, "Failed creating %s at 0x%02x\n", info.type, info.addr); } return 0; } static int i2c_detect(struct i2c_adapter *adapter, struct i2c_driver *driver) { const unsigned short *address_list; struct i2c_client *temp_client; int i, err = 0; address_list = driver->address_list; if (!driver->detect || !address_list) return 0; /* Warn that the adapter lost class based instantiation */ if (adapter->class == I2C_CLASS_DEPRECATED) { dev_dbg(&adapter->dev, "This adapter dropped support for I2C classes and won't auto-detect %s devices anymore. 
" "If you need it, check 'Documentation/i2c/instantiating-devices.rst' for alternatives.\n", driver->driver.name); return 0; } /* Stop here if the classes do not match */ if (!(adapter->class & driver->class)) return 0; /* Set up a temporary client to help detect callback */ temp_client = kzalloc(sizeof(*temp_client), GFP_KERNEL); if (!temp_client) return -ENOMEM; temp_client->adapter = adapter; for (i = 0; address_list[i] != I2C_CLIENT_END; i += 1) { dev_dbg(&adapter->dev, "found normal entry for adapter %d, addr 0x%02x\n", i2c_adapter_id(adapter), address_list[i]); temp_client->addr = address_list[i]; err = i2c_detect_address(temp_client, driver); if (unlikely(err)) break; } kfree(temp_client); return err; } int i2c_probe_func_quick_read(struct i2c_adapter *adap, unsigned short addr) { return i2c_smbus_xfer(adap, addr, 0, I2C_SMBUS_READ, 0, I2C_SMBUS_QUICK, NULL) >= 0; } EXPORT_SYMBOL_GPL(i2c_probe_func_quick_read); struct i2c_client * i2c_new_scanned_device(struct i2c_adapter *adap, struct i2c_board_info *info, unsigned short const *addr_list, int (*probe)(struct i2c_adapter *adap, unsigned short addr)) { int i; if (!probe) probe = i2c_default_probe; for (i = 0; addr_list[i] != I2C_CLIENT_END; i++) { /* Check address validity */ if (i2c_check_7bit_addr_validity_strict(addr_list[i]) < 0) { dev_warn(&adap->dev, "Invalid 7-bit address 0x%02x\n", addr_list[i]); continue; } /* Check address availability (7 bit, no need to encode flags) */ if (i2c_check_addr_busy(adap, addr_list[i])) { dev_dbg(&adap->dev, "Address 0x%02x already in use, not probing\n", addr_list[i]); continue; } /* Test address responsiveness */ if (probe(adap, addr_list[i])) break; } if (addr_list[i] == I2C_CLIENT_END) { dev_dbg(&adap->dev, "Probing failed, no device found\n"); return ERR_PTR(-ENODEV); } info->addr = addr_list[i]; return i2c_new_client_device(adap, info); } EXPORT_SYMBOL_GPL(i2c_new_scanned_device); struct i2c_adapter *i2c_get_adapter(int nr) { struct i2c_adapter *adapter; mutex_lock(&core_lock); adapter = idr_find(&i2c_adapter_idr, nr); if (!adapter) goto exit; if (try_module_get(adapter->owner)) get_device(&adapter->dev); else adapter = NULL; exit: mutex_unlock(&core_lock); return adapter; } EXPORT_SYMBOL(i2c_get_adapter); void i2c_put_adapter(struct i2c_adapter *adap) { if (!adap) return; module_put(adap->owner); /* Should be last, otherwise we risk use-after-free with 'adap' */ put_device(&adap->dev); } EXPORT_SYMBOL(i2c_put_adapter); /** * i2c_get_dma_safe_msg_buf() - get a DMA safe buffer for the given i2c_msg * @msg: the message to be checked * @threshold: the minimum number of bytes for which using DMA makes sense. * Should at least be 1. * * Return: NULL if a DMA safe buffer was not obtained. Use msg->buf with PIO. * Or a valid pointer to be used with DMA. After use, release it by * calling i2c_put_dma_safe_msg_buf(). * * This function must only be called from process context! 
*/ u8 *i2c_get_dma_safe_msg_buf(struct i2c_msg *msg, unsigned int threshold) { /* also skip 0-length msgs for bogus thresholds of 0 */ if (!threshold) pr_debug("DMA buffer for addr=0x%02x with length 0 is bogus\n", msg->addr); if (msg->len < threshold || msg->len == 0) return NULL; if (msg->flags & I2C_M_DMA_SAFE) return msg->buf; pr_debug("using bounce buffer for addr=0x%02x, len=%d\n", msg->addr, msg->len); if (msg->flags & I2C_M_RD) return kzalloc(msg->len, GFP_KERNEL); else return kmemdup(msg->buf, msg->len, GFP_KERNEL); } EXPORT_SYMBOL_GPL(i2c_get_dma_safe_msg_buf); /** * i2c_put_dma_safe_msg_buf - release DMA safe buffer and sync with i2c_msg * @buf: the buffer obtained from i2c_get_dma_safe_msg_buf(). May be NULL. * @msg: the message which the buffer corresponds to * @xferred: bool saying if the message was transferred */ void i2c_put_dma_safe_msg_buf(u8 *buf, struct i2c_msg *msg, bool xferred) { if (!buf || buf == msg->buf) return; if (xferred && msg->flags & I2C_M_RD) memcpy(msg->buf, buf, msg->len); kfree(buf); } EXPORT_SYMBOL_GPL(i2c_put_dma_safe_msg_buf); MODULE_AUTHOR("Simon G. Vogl <simon@tk.uni-linz.ac.at>"); MODULE_DESCRIPTION("I2C-Bus main module"); MODULE_LICENSE("GPL");
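As a brief illustration of the message-based API above: a client driver usually issues a combined write-then-read transaction through i2c_transfer(). The sketch below is not part of the file; the read_reg() helper and the one-byte register layout are hypothetical, while i2c_transfer(), struct i2c_msg and I2C_M_RD come from the code above.

/*
 * Hedged sketch: combined write-then-read via i2c_transfer().
 * read_reg() and the register layout are illustrative only.
 */
static int read_reg(struct i2c_client *client, u8 reg, u8 *val)
{
	struct i2c_msg msgs[2] = {
		{	/* message 0: write the register address */
			.addr  = client->addr,
			.flags = 0,
			.len   = 1,
			.buf   = &reg,
		},
		{	/* message 1: read one byte back */
			.addr  = client->addr,
			.flags = I2C_M_RD,
			.len   = 1,
			.buf   = val,
		},
	};
	int ret;

	/* Returns the number of messages executed, or a negative errno */
	ret = i2c_transfer(client->adapter, msgs, ARRAY_SIZE(msgs));
	if (ret < 0)
		return ret;
	return ret == ARRAY_SIZE(msgs) ? 0 : -EIO;
}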
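A second sketch, for the DMA-safe helpers documented at the end of the file: an adapter driver that wants to DMA a message can bounce it through i2c_get_dma_safe_msg_buf() and hand it back with i2c_put_dma_safe_msg_buf(). The example_dma_xfer() function and the 8-byte threshold are assumptions for illustration, not kernel code.

/* Hedged sketch of the i2c_get/put_dma_safe_msg_buf() pattern */
static int example_dma_xfer(struct i2c_adapter *adap, struct i2c_msg *msg)
{
	bool xferred = false;
	u8 *buf;

	/* Bounce only messages of at least 8 bytes; shorter ones are not worth DMA */
	buf = i2c_get_dma_safe_msg_buf(msg, 8);
	if (!buf)
		return -EINVAL;	/* a real driver would fall back to PIO on msg->buf */

	/*
	 * A real driver would program its DMA engine with 'buf' here and set
	 * xferred = true once the hardware signalled completion.
	 */

	/* Copies read data back into msg->buf (when xferred) and frees any bounce buffer */
	i2c_put_dma_safe_msg_buf(buf, msg, xferred);
	return 0;
}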
// SPDX-License-Identifier: GPL-2.0-only /* * Copyright (c) 2015, Sony Mobile Communications Inc. * Copyright (c) 2013, The Linux Foundation. All rights reserved. 
*/ #include <linux/module.h> #include <linux/netlink.h> #include <linux/qrtr.h> #include <linux/termios.h> /* For TIOCINQ/OUTQ */ #include <linux/spinlock.h> #include <linux/wait.h> #include <net/sock.h> #include "qrtr.h" #define QRTR_PROTO_VER_1 1 #define QRTR_PROTO_VER_2 3 /* auto-bind range */ #define QRTR_MIN_EPH_SOCKET 0x4000 #define QRTR_MAX_EPH_SOCKET 0x7fff #define QRTR_EPH_PORT_RANGE \ XA_LIMIT(QRTR_MIN_EPH_SOCKET, QRTR_MAX_EPH_SOCKET) #define QRTR_PORT_CTRL_LEGACY 0xffff /** * struct qrtr_hdr_v1 - (I|R)PCrouter packet header version 1 * @version: protocol version * @type: packet type; one of QRTR_TYPE_* * @src_node_id: source node * @src_port_id: source port * @confirm_rx: boolean; whether a resume-tx packet should be send in reply * @size: length of packet, excluding this header * @dst_node_id: destination node * @dst_port_id: destination port */ struct qrtr_hdr_v1 { __le32 version; __le32 type; __le32 src_node_id; __le32 src_port_id; __le32 confirm_rx; __le32 size; __le32 dst_node_id; __le32 dst_port_id; } __packed; /** * struct qrtr_hdr_v2 - (I|R)PCrouter packet header later versions * @version: protocol version * @type: packet type; one of QRTR_TYPE_* * @flags: bitmask of QRTR_FLAGS_* * @optlen: length of optional header data * @size: length of packet, excluding this header and optlen * @src_node_id: source node * @src_port_id: source port * @dst_node_id: destination node * @dst_port_id: destination port */ struct qrtr_hdr_v2 { u8 version; u8 type; u8 flags; u8 optlen; __le32 size; __le16 src_node_id; __le16 src_port_id; __le16 dst_node_id; __le16 dst_port_id; }; #define QRTR_FLAGS_CONFIRM_RX BIT(0) struct qrtr_cb { u32 src_node; u32 src_port; u32 dst_node; u32 dst_port; u8 type; u8 confirm_rx; }; #define QRTR_HDR_MAX_SIZE max_t(size_t, sizeof(struct qrtr_hdr_v1), \ sizeof(struct qrtr_hdr_v2)) struct qrtr_sock { /* WARNING: sk must be the first member */ struct sock sk; struct sockaddr_qrtr us; struct sockaddr_qrtr peer; }; static inline struct qrtr_sock *qrtr_sk(struct sock *sk) { BUILD_BUG_ON(offsetof(struct qrtr_sock, sk) != 0); return container_of(sk, struct qrtr_sock, sk); } static unsigned int qrtr_local_nid = 1; /* for node ids */ static RADIX_TREE(qrtr_nodes, GFP_ATOMIC); static DEFINE_SPINLOCK(qrtr_nodes_lock); /* broadcast list */ static LIST_HEAD(qrtr_all_nodes); /* lock for qrtr_all_nodes and node reference */ static DEFINE_MUTEX(qrtr_node_lock); /* local port allocation management */ static DEFINE_XARRAY_ALLOC(qrtr_ports); /** * struct qrtr_node - endpoint node * @ep_lock: lock for endpoint management and callbacks * @ep: endpoint * @ref: reference count for node * @nid: node id * @qrtr_tx_flow: tree of qrtr_tx_flow, keyed by node << 32 | port * @qrtr_tx_lock: lock for qrtr_tx_flow inserts * @rx_queue: receive queue * @item: list item for broadcast list */ struct qrtr_node { struct mutex ep_lock; struct qrtr_endpoint *ep; struct kref ref; unsigned int nid; struct radix_tree_root qrtr_tx_flow; struct mutex qrtr_tx_lock; /* for qrtr_tx_flow */ struct sk_buff_head rx_queue; struct list_head item; }; /** * struct qrtr_tx_flow - tx flow control * @resume_tx: waiters for a resume tx from the remote * @pending: number of waiting senders * @tx_failed: indicates that a message with confirm_rx flag was lost */ struct qrtr_tx_flow { struct wait_queue_head resume_tx; int pending; int tx_failed; }; #define QRTR_TX_FLOW_HIGH 10 #define QRTR_TX_FLOW_LOW 5 static int qrtr_local_enqueue(struct qrtr_node *node, struct sk_buff *skb, int type, struct sockaddr_qrtr *from, struct 
sockaddr_qrtr *to); static int qrtr_bcast_enqueue(struct qrtr_node *node, struct sk_buff *skb, int type, struct sockaddr_qrtr *from, struct sockaddr_qrtr *to); static struct qrtr_sock *qrtr_port_lookup(int port); static void qrtr_port_put(struct qrtr_sock *ipc); /* Release node resources and free the node. * * Do not call directly, use qrtr_node_release. To be used with * kref_put_mutex. As such, the node mutex is expected to be locked on call. */ static void __qrtr_node_release(struct kref *kref) { struct qrtr_node *node = container_of(kref, struct qrtr_node, ref); struct radix_tree_iter iter; struct qrtr_tx_flow *flow; unsigned long flags; void __rcu **slot; spin_lock_irqsave(&qrtr_nodes_lock, flags); /* If the node is a bridge for other nodes, there are possibly * multiple entries pointing to our released node, delete them all. */ radix_tree_for_each_slot(slot, &qrtr_nodes, &iter, 0) { if (*slot == node) radix_tree_iter_delete(&qrtr_nodes, &iter, slot); } spin_unlock_irqrestore(&qrtr_nodes_lock, flags); list_del(&node->item); mutex_unlock(&qrtr_node_lock); skb_queue_purge(&node->rx_queue); /* Free tx flow counters */ radix_tree_for_each_slot(slot, &node->qrtr_tx_flow, &iter, 0) { flow = *slot; radix_tree_iter_delete(&node->qrtr_tx_flow, &iter, slot); kfree(flow); } kfree(node); } /* Increment reference to node. */ static struct qrtr_node *qrtr_node_acquire(struct qrtr_node *node) { if (node) kref_get(&node->ref); return node; } /* Decrement reference to node and release as necessary. */ static void qrtr_node_release(struct qrtr_node *node) { if (!node) return; kref_put_mutex(&node->ref, __qrtr_node_release, &qrtr_node_lock); } /** * qrtr_tx_resume() - reset flow control counter * @node: qrtr_node that the QRTR_TYPE_RESUME_TX packet arrived on * @skb: resume_tx packet */ static void qrtr_tx_resume(struct qrtr_node *node, struct sk_buff *skb) { struct qrtr_ctrl_pkt *pkt = (struct qrtr_ctrl_pkt *)skb->data; u64 remote_node = le32_to_cpu(pkt->client.node); u32 remote_port = le32_to_cpu(pkt->client.port); struct qrtr_tx_flow *flow; unsigned long key; key = remote_node << 32 | remote_port; rcu_read_lock(); flow = radix_tree_lookup(&node->qrtr_tx_flow, key); rcu_read_unlock(); if (flow) { spin_lock(&flow->resume_tx.lock); flow->pending = 0; spin_unlock(&flow->resume_tx.lock); wake_up_interruptible_all(&flow->resume_tx); } consume_skb(skb); } /** * qrtr_tx_wait() - flow control for outgoing packets * @node: qrtr_node that the packet is to be send to * @dest_node: node id of the destination * @dest_port: port number of the destination * @type: type of message * * The flow control scheme is based around the low and high "watermarks". When * the low watermark is passed the confirm_rx flag is set on the outgoing * message, which will trigger the remote to send a control message of the type * QRTR_TYPE_RESUME_TX to reset the counter. If the high watermark is hit * further transmision should be paused. 
* * Return: 1 if confirm_rx should be set, 0 otherwise or errno failure */ static int qrtr_tx_wait(struct qrtr_node *node, int dest_node, int dest_port, int type) { unsigned long key = (u64)dest_node << 32 | dest_port; struct qrtr_tx_flow *flow; int confirm_rx = 0; int ret; /* Never set confirm_rx on non-data packets */ if (type != QRTR_TYPE_DATA) return 0; mutex_lock(&node->qrtr_tx_lock); flow = radix_tree_lookup(&node->qrtr_tx_flow, key); if (!flow) { flow = kzalloc(sizeof(*flow), GFP_KERNEL); if (flow) { init_waitqueue_head(&flow->resume_tx); if (radix_tree_insert(&node->qrtr_tx_flow, key, flow)) { kfree(flow); flow = NULL; } } } mutex_unlock(&node->qrtr_tx_lock); /* Set confirm_rx if we where unable to find and allocate a flow */ if (!flow) return 1; spin_lock_irq(&flow->resume_tx.lock); ret = wait_event_interruptible_locked_irq(flow->resume_tx, flow->pending < QRTR_TX_FLOW_HIGH || flow->tx_failed || !node->ep); if (ret < 0) { confirm_rx = ret; } else if (!node->ep) { confirm_rx = -EPIPE; } else if (flow->tx_failed) { flow->tx_failed = 0; confirm_rx = 1; } else { flow->pending++; confirm_rx = flow->pending == QRTR_TX_FLOW_LOW; } spin_unlock_irq(&flow->resume_tx.lock); return confirm_rx; } /** * qrtr_tx_flow_failed() - flag that tx of confirm_rx flagged messages failed * @node: qrtr_node that the packet is to be send to * @dest_node: node id of the destination * @dest_port: port number of the destination * * Signal that the transmission of a message with confirm_rx flag failed. The * flow's "pending" counter will keep incrementing towards QRTR_TX_FLOW_HIGH, * at which point transmission would stall forever waiting for the resume TX * message associated with the dropped confirm_rx message. * Work around this by marking the flow as having a failed transmission and * cause the next transmission attempt to be sent with the confirm_rx. */ static void qrtr_tx_flow_failed(struct qrtr_node *node, int dest_node, int dest_port) { unsigned long key = (u64)dest_node << 32 | dest_port; struct qrtr_tx_flow *flow; rcu_read_lock(); flow = radix_tree_lookup(&node->qrtr_tx_flow, key); rcu_read_unlock(); if (flow) { spin_lock_irq(&flow->resume_tx.lock); flow->tx_failed = 1; spin_unlock_irq(&flow->resume_tx.lock); } } /* Pass an outgoing packet socket buffer to the endpoint driver. 
*/ static int qrtr_node_enqueue(struct qrtr_node *node, struct sk_buff *skb, int type, struct sockaddr_qrtr *from, struct sockaddr_qrtr *to) { struct qrtr_hdr_v1 *hdr; size_t len = skb->len; int rc, confirm_rx; confirm_rx = qrtr_tx_wait(node, to->sq_node, to->sq_port, type); if (confirm_rx < 0) { kfree_skb(skb); return confirm_rx; } hdr = skb_push(skb, sizeof(*hdr)); hdr->version = cpu_to_le32(QRTR_PROTO_VER_1); hdr->type = cpu_to_le32(type); hdr->src_node_id = cpu_to_le32(from->sq_node); hdr->src_port_id = cpu_to_le32(from->sq_port); if (to->sq_port == QRTR_PORT_CTRL) { hdr->dst_node_id = cpu_to_le32(node->nid); hdr->dst_port_id = cpu_to_le32(QRTR_PORT_CTRL); } else { hdr->dst_node_id = cpu_to_le32(to->sq_node); hdr->dst_port_id = cpu_to_le32(to->sq_port); } hdr->size = cpu_to_le32(len); hdr->confirm_rx = !!confirm_rx; rc = skb_put_padto(skb, ALIGN(len, 4) + sizeof(*hdr)); if (!rc) { mutex_lock(&node->ep_lock); rc = -ENODEV; if (node->ep) rc = node->ep->xmit(node->ep, skb); else kfree_skb(skb); mutex_unlock(&node->ep_lock); } /* Need to ensure that a subsequent message carries the otherwise lost * confirm_rx flag if we dropped this one */ if (rc && confirm_rx) qrtr_tx_flow_failed(node, to->sq_node, to->sq_port); return rc; } /* Lookup node by id. * * callers must release with qrtr_node_release() */ static struct qrtr_node *qrtr_node_lookup(unsigned int nid) { struct qrtr_node *node; unsigned long flags; mutex_lock(&qrtr_node_lock); spin_lock_irqsave(&qrtr_nodes_lock, flags); node = radix_tree_lookup(&qrtr_nodes, nid); node = qrtr_node_acquire(node); spin_unlock_irqrestore(&qrtr_nodes_lock, flags); mutex_unlock(&qrtr_node_lock); return node; } /* Assign node id to node. * * This is mostly useful for automatic node id assignment, based on * the source id in the incoming packet. 
*/ static void qrtr_node_assign(struct qrtr_node *node, unsigned int nid) { unsigned long flags; if (nid == QRTR_EP_NID_AUTO) return; spin_lock_irqsave(&qrtr_nodes_lock, flags); radix_tree_insert(&qrtr_nodes, nid, node); if (node->nid == QRTR_EP_NID_AUTO) node->nid = nid; spin_unlock_irqrestore(&qrtr_nodes_lock, flags); } /** * qrtr_endpoint_post() - post incoming data * @ep: endpoint handle * @data: data pointer * @len: size of data in bytes * * Return: 0 on success; negative error code on failure */ int qrtr_endpoint_post(struct qrtr_endpoint *ep, const void *data, size_t len) { struct qrtr_node *node = ep->node; const struct qrtr_hdr_v1 *v1; const struct qrtr_hdr_v2 *v2; struct qrtr_sock *ipc; struct sk_buff *skb; struct qrtr_cb *cb; size_t size; unsigned int ver; size_t hdrlen; if (len == 0 || len & 3) return -EINVAL; skb = __netdev_alloc_skb(NULL, len, GFP_ATOMIC | __GFP_NOWARN); if (!skb) return -ENOMEM; cb = (struct qrtr_cb *)skb->cb; /* Version field in v1 is little endian, so this works for both cases */ ver = *(u8*)data; switch (ver) { case QRTR_PROTO_VER_1: if (len < sizeof(*v1)) goto err; v1 = data; hdrlen = sizeof(*v1); cb->type = le32_to_cpu(v1->type); cb->src_node = le32_to_cpu(v1->src_node_id); cb->src_port = le32_to_cpu(v1->src_port_id); cb->confirm_rx = !!v1->confirm_rx; cb->dst_node = le32_to_cpu(v1->dst_node_id); cb->dst_port = le32_to_cpu(v1->dst_port_id); size = le32_to_cpu(v1->size); break; case QRTR_PROTO_VER_2: if (len < sizeof(*v2)) goto err; v2 = data; hdrlen = sizeof(*v2) + v2->optlen; cb->type = v2->type; cb->confirm_rx = !!(v2->flags & QRTR_FLAGS_CONFIRM_RX); cb->src_node = le16_to_cpu(v2->src_node_id); cb->src_port = le16_to_cpu(v2->src_port_id); cb->dst_node = le16_to_cpu(v2->dst_node_id); cb->dst_port = le16_to_cpu(v2->dst_port_id); if (cb->src_port == (u16)QRTR_PORT_CTRL) cb->src_port = QRTR_PORT_CTRL; if (cb->dst_port == (u16)QRTR_PORT_CTRL) cb->dst_port = QRTR_PORT_CTRL; size = le32_to_cpu(v2->size); break; default: pr_err("qrtr: Invalid version %d\n", ver); goto err; } if (cb->dst_port == QRTR_PORT_CTRL_LEGACY) cb->dst_port = QRTR_PORT_CTRL; if (!size || len != ALIGN(size, 4) + hdrlen) goto err; if ((cb->type == QRTR_TYPE_NEW_SERVER || cb->type == QRTR_TYPE_RESUME_TX) && size < sizeof(struct qrtr_ctrl_pkt)) goto err; if (cb->dst_port != QRTR_PORT_CTRL && cb->type != QRTR_TYPE_DATA && cb->type != QRTR_TYPE_RESUME_TX) goto err; skb_put_data(skb, data + hdrlen, size); qrtr_node_assign(node, cb->src_node); if (cb->type == QRTR_TYPE_NEW_SERVER) { /* Remote node endpoint can bridge other distant nodes */ const struct qrtr_ctrl_pkt *pkt; pkt = data + hdrlen; qrtr_node_assign(node, le32_to_cpu(pkt->server.node)); } if (cb->type == QRTR_TYPE_RESUME_TX) { qrtr_tx_resume(node, skb); } else { ipc = qrtr_port_lookup(cb->dst_port); if (!ipc) goto err; if (sock_queue_rcv_skb(&ipc->sk, skb)) { qrtr_port_put(ipc); goto err; } qrtr_port_put(ipc); } return 0; err: kfree_skb(skb); return -EINVAL; } EXPORT_SYMBOL_GPL(qrtr_endpoint_post); /** * qrtr_alloc_ctrl_packet() - allocate control packet skb * @pkt: reference to qrtr_ctrl_pkt pointer * @flags: the type of memory to allocate * * Returns newly allocated sk_buff, or NULL on failure * * This function allocates a sk_buff large enough to carry a qrtr_ctrl_pkt and * on success returns a reference to the control packet in @pkt. 
*/ static struct sk_buff *qrtr_alloc_ctrl_packet(struct qrtr_ctrl_pkt **pkt, gfp_t flags) { const int pkt_len = sizeof(struct qrtr_ctrl_pkt); struct sk_buff *skb; skb = alloc_skb(QRTR_HDR_MAX_SIZE + pkt_len, flags); if (!skb) return NULL; skb_reserve(skb, QRTR_HDR_MAX_SIZE); *pkt = skb_put_zero(skb, pkt_len); return skb; } /** * qrtr_endpoint_register() - register a new endpoint * @ep: endpoint to register * @nid: desired node id; may be QRTR_EP_NID_AUTO for auto-assignment * Return: 0 on success; negative error code on failure * * The specified endpoint must have the xmit function pointer set on call. */ int qrtr_endpoint_register(struct qrtr_endpoint *ep, unsigned int nid) { struct qrtr_node *node; if (!ep || !ep->xmit) return -EINVAL; node = kzalloc(sizeof(*node), GFP_KERNEL); if (!node) return -ENOMEM; kref_init(&node->ref); mutex_init(&node->ep_lock); skb_queue_head_init(&node->rx_queue); node->nid = QRTR_EP_NID_AUTO; node->ep = ep; INIT_RADIX_TREE(&node->qrtr_tx_flow, GFP_KERNEL); mutex_init(&node->qrtr_tx_lock); qrtr_node_assign(node, nid); mutex_lock(&qrtr_node_lock); list_add(&node->item, &qrtr_all_nodes); mutex_unlock(&qrtr_node_lock); ep->node = node; return 0; } EXPORT_SYMBOL_GPL(qrtr_endpoint_register); /** * qrtr_endpoint_unregister - unregister endpoint * @ep: endpoint to unregister */ void qrtr_endpoint_unregister(struct qrtr_endpoint *ep) { struct qrtr_node *node = ep->node; struct sockaddr_qrtr src = {AF_QIPCRTR, node->nid, QRTR_PORT_CTRL}; struct sockaddr_qrtr dst = {AF_QIPCRTR, qrtr_local_nid, QRTR_PORT_CTRL}; struct radix_tree_iter iter; struct qrtr_ctrl_pkt *pkt; struct qrtr_tx_flow *flow; struct sk_buff *skb; unsigned long flags; void __rcu **slot; mutex_lock(&node->ep_lock); node->ep = NULL; mutex_unlock(&node->ep_lock); /* Notify the local controller about the event */ spin_lock_irqsave(&qrtr_nodes_lock, flags); radix_tree_for_each_slot(slot, &qrtr_nodes, &iter, 0) { if (*slot != node) continue; src.sq_node = iter.index; skb = qrtr_alloc_ctrl_packet(&pkt, GFP_ATOMIC); if (skb) { pkt->cmd = cpu_to_le32(QRTR_TYPE_BYE); qrtr_local_enqueue(NULL, skb, QRTR_TYPE_BYE, &src, &dst); } } spin_unlock_irqrestore(&qrtr_nodes_lock, flags); /* Wake up any transmitters waiting for resume-tx from the node */ mutex_lock(&node->qrtr_tx_lock); radix_tree_for_each_slot(slot, &node->qrtr_tx_flow, &iter, 0) { flow = *slot; wake_up_interruptible_all(&flow->resume_tx); } mutex_unlock(&node->qrtr_tx_lock); qrtr_node_release(node); ep->node = NULL; } EXPORT_SYMBOL_GPL(qrtr_endpoint_unregister); /* Lookup socket by port. * * Callers must release with qrtr_port_put() */ static struct qrtr_sock *qrtr_port_lookup(int port) { struct qrtr_sock *ipc; if (port == QRTR_PORT_CTRL) port = 0; rcu_read_lock(); ipc = xa_load(&qrtr_ports, port); if (ipc) sock_hold(&ipc->sk); rcu_read_unlock(); return ipc; } /* Release acquired socket. */ static void qrtr_port_put(struct qrtr_sock *ipc) { sock_put(&ipc->sk); } /* Remove port assignment. 
*/ static void qrtr_port_remove(struct qrtr_sock *ipc) { struct qrtr_ctrl_pkt *pkt; struct sk_buff *skb; int port = ipc->us.sq_port; struct sockaddr_qrtr to; to.sq_family = AF_QIPCRTR; to.sq_node = QRTR_NODE_BCAST; to.sq_port = QRTR_PORT_CTRL; skb = qrtr_alloc_ctrl_packet(&pkt, GFP_KERNEL); if (skb) { pkt->cmd = cpu_to_le32(QRTR_TYPE_DEL_CLIENT); pkt->client.node = cpu_to_le32(ipc->us.sq_node); pkt->client.port = cpu_to_le32(ipc->us.sq_port); skb_set_owner_w(skb, &ipc->sk); qrtr_bcast_enqueue(NULL, skb, QRTR_TYPE_DEL_CLIENT, &ipc->us, &to); } if (port == QRTR_PORT_CTRL) port = 0; __sock_put(&ipc->sk); xa_erase(&qrtr_ports, port); /* Ensure that if qrtr_port_lookup() did enter the RCU read section we * wait for it to up increment the refcount */ synchronize_rcu(); } /* Assign port number to socket. * * Specify port in the integer pointed to by port, and it will be adjusted * on return as necesssary. * * Port may be: * 0: Assign ephemeral port in [QRTR_MIN_EPH_SOCKET, QRTR_MAX_EPH_SOCKET] * <QRTR_MIN_EPH_SOCKET: Specified; requires CAP_NET_ADMIN * >QRTR_MIN_EPH_SOCKET: Specified; available to all */ static int qrtr_port_assign(struct qrtr_sock *ipc, int *port) { int rc; if (!*port) { rc = xa_alloc(&qrtr_ports, port, ipc, QRTR_EPH_PORT_RANGE, GFP_KERNEL); } else if (*port < QRTR_MIN_EPH_SOCKET && !capable(CAP_NET_ADMIN)) { rc = -EACCES; } else if (*port == QRTR_PORT_CTRL) { rc = xa_insert(&qrtr_ports, 0, ipc, GFP_KERNEL); } else { rc = xa_insert(&qrtr_ports, *port, ipc, GFP_KERNEL); } if (rc == -EBUSY) return -EADDRINUSE; else if (rc < 0) return rc; sock_hold(&ipc->sk); return 0; } /* Reset all non-control ports */ static void qrtr_reset_ports(void) { struct qrtr_sock *ipc; unsigned long index; rcu_read_lock(); xa_for_each_start(&qrtr_ports, index, ipc, 1) { sock_hold(&ipc->sk); ipc->sk.sk_err = ENETRESET; sk_error_report(&ipc->sk); sock_put(&ipc->sk); } rcu_read_unlock(); } /* Bind socket to address. * * Socket should be locked upon call. */ static int __qrtr_bind(struct socket *sock, const struct sockaddr_qrtr *addr, int zapped) { struct qrtr_sock *ipc = qrtr_sk(sock->sk); struct sock *sk = sock->sk; int port; int rc; /* rebinding ok */ if (!zapped && addr->sq_port == ipc->us.sq_port) return 0; port = addr->sq_port; rc = qrtr_port_assign(ipc, &port); if (rc) return rc; /* unbind previous, if any */ if (!zapped) qrtr_port_remove(ipc); ipc->us.sq_port = port; sock_reset_flag(sk, SOCK_ZAPPED); /* Notify all open ports about the new controller */ if (port == QRTR_PORT_CTRL) qrtr_reset_ports(); return 0; } /* Auto bind to an ephemeral port. */ static int qrtr_autobind(struct socket *sock) { struct sock *sk = sock->sk; struct sockaddr_qrtr addr; if (!sock_flag(sk, SOCK_ZAPPED)) return 0; addr.sq_family = AF_QIPCRTR; addr.sq_node = qrtr_local_nid; addr.sq_port = 0; return __qrtr_bind(sock, &addr, 1); } /* Bind socket to specified sockaddr. */ static int qrtr_bind(struct socket *sock, struct sockaddr_unsized *saddr, int len) { DECLARE_SOCKADDR(struct sockaddr_qrtr *, addr, saddr); struct qrtr_sock *ipc = qrtr_sk(sock->sk); struct sock *sk = sock->sk; int rc; if (len < sizeof(*addr) || addr->sq_family != AF_QIPCRTR) return -EINVAL; if (addr->sq_node != ipc->us.sq_node) return -EINVAL; lock_sock(sk); rc = __qrtr_bind(sock, addr, sock_flag(sk, SOCK_ZAPPED)); release_sock(sk); return rc; } /* Queue packet to local peer socket. 
*/ static int qrtr_local_enqueue(struct qrtr_node *node, struct sk_buff *skb, int type, struct sockaddr_qrtr *from, struct sockaddr_qrtr *to) { struct qrtr_sock *ipc; struct qrtr_cb *cb; ipc = qrtr_port_lookup(to->sq_port); if (!ipc || &ipc->sk == skb->sk) { /* do not send to self */ if (ipc) qrtr_port_put(ipc); kfree_skb(skb); return -ENODEV; } cb = (struct qrtr_cb *)skb->cb; cb->src_node = from->sq_node; cb->src_port = from->sq_port; if (sock_queue_rcv_skb(&ipc->sk, skb)) { qrtr_port_put(ipc); kfree_skb(skb); return -ENOSPC; } qrtr_port_put(ipc); return 0; } /* Queue packet for broadcast. */ static int qrtr_bcast_enqueue(struct qrtr_node *node, struct sk_buff *skb, int type, struct sockaddr_qrtr *from, struct sockaddr_qrtr *to) { struct sk_buff *skbn; mutex_lock(&qrtr_node_lock); list_for_each_entry(node, &qrtr_all_nodes, item) { skbn = pskb_copy(skb, GFP_KERNEL); if (!skbn) break; skb_set_owner_w(skbn, skb->sk); qrtr_node_enqueue(node, skbn, type, from, to); } mutex_unlock(&qrtr_node_lock); qrtr_local_enqueue(NULL, skb, type, from, to); return 0; } static int qrtr_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) { DECLARE_SOCKADDR(struct sockaddr_qrtr *, addr, msg->msg_name); int (*enqueue_fn)(struct qrtr_node *, struct sk_buff *, int, struct sockaddr_qrtr *, struct sockaddr_qrtr *); __le32 qrtr_type = cpu_to_le32(QRTR_TYPE_DATA); struct qrtr_sock *ipc = qrtr_sk(sock->sk); struct sock *sk = sock->sk; struct qrtr_node *node; struct sk_buff *skb; size_t plen; u32 type; int rc; if (msg->msg_flags & ~(MSG_DONTWAIT)) return -EINVAL; if (len > 65535) return -EMSGSIZE; lock_sock(sk); if (addr) { if (msg->msg_namelen < sizeof(*addr)) { release_sock(sk); return -EINVAL; } if (addr->sq_family != AF_QIPCRTR) { release_sock(sk); return -EINVAL; } rc = qrtr_autobind(sock); if (rc) { release_sock(sk); return rc; } } else if (sk->sk_state == TCP_ESTABLISHED) { addr = &ipc->peer; } else { release_sock(sk); return -ENOTCONN; } node = NULL; if (addr->sq_node == QRTR_NODE_BCAST) { if (addr->sq_port != QRTR_PORT_CTRL && qrtr_local_nid != QRTR_NODE_BCAST) { release_sock(sk); return -ENOTCONN; } enqueue_fn = qrtr_bcast_enqueue; } else if (addr->sq_node == ipc->us.sq_node) { enqueue_fn = qrtr_local_enqueue; } else { node = qrtr_node_lookup(addr->sq_node); if (!node) { release_sock(sk); return -ECONNRESET; } enqueue_fn = qrtr_node_enqueue; } plen = (len + 3) & ~3; skb = sock_alloc_send_skb(sk, plen + QRTR_HDR_MAX_SIZE, msg->msg_flags & MSG_DONTWAIT, &rc); if (!skb) { rc = -ENOMEM; goto out_node; } skb_reserve(skb, QRTR_HDR_MAX_SIZE); rc = memcpy_from_msg(skb_put(skb, len), msg, len); if (rc) { kfree_skb(skb); goto out_node; } if (ipc->us.sq_port == QRTR_PORT_CTRL) { if (len < 4) { rc = -EINVAL; kfree_skb(skb); goto out_node; } /* control messages already require the type as 'command' */ skb_copy_bits(skb, 0, &qrtr_type, 4); } type = le32_to_cpu(qrtr_type); rc = enqueue_fn(node, skb, type, &ipc->us, addr); if (rc >= 0) rc = len; out_node: qrtr_node_release(node); release_sock(sk); return rc; } static int qrtr_send_resume_tx(struct qrtr_cb *cb) { struct sockaddr_qrtr remote = { AF_QIPCRTR, cb->src_node, cb->src_port }; struct sockaddr_qrtr local = { AF_QIPCRTR, cb->dst_node, cb->dst_port }; struct qrtr_ctrl_pkt *pkt; struct qrtr_node *node; struct sk_buff *skb; int ret; node = qrtr_node_lookup(remote.sq_node); if (!node) return -EINVAL; skb = qrtr_alloc_ctrl_packet(&pkt, GFP_KERNEL); if (!skb) return -ENOMEM; pkt->cmd = cpu_to_le32(QRTR_TYPE_RESUME_TX); pkt->client.node = cpu_to_le32(cb->dst_node); 
pkt->client.port = cpu_to_le32(cb->dst_port); ret = qrtr_node_enqueue(node, skb, QRTR_TYPE_RESUME_TX, &local, &remote); qrtr_node_release(node); return ret; } static int qrtr_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, int flags) { DECLARE_SOCKADDR(struct sockaddr_qrtr *, addr, msg->msg_name); struct sock *sk = sock->sk; struct sk_buff *skb; struct qrtr_cb *cb; int copied, rc; lock_sock(sk); if (sock_flag(sk, SOCK_ZAPPED)) { release_sock(sk); return -EADDRNOTAVAIL; } skb = skb_recv_datagram(sk, flags, &rc); if (!skb) { release_sock(sk); return rc; } cb = (struct qrtr_cb *)skb->cb; copied = skb->len; if (copied > size) { copied = size; msg->msg_flags |= MSG_TRUNC; } rc = skb_copy_datagram_msg(skb, 0, msg, copied); if (rc < 0) goto out; rc = copied; if (addr) { /* There is an anonymous 2-byte hole after sq_family, * make sure to clear it. */ memset(addr, 0, sizeof(*addr)); addr->sq_family = AF_QIPCRTR; addr->sq_node = cb->src_node; addr->sq_port = cb->src_port; msg->msg_namelen = sizeof(*addr); } out: if (cb->confirm_rx) qrtr_send_resume_tx(cb); skb_free_datagram(sk, skb); release_sock(sk); return rc; } static int qrtr_connect(struct socket *sock, struct sockaddr_unsized *saddr, int len, int flags) { DECLARE_SOCKADDR(struct sockaddr_qrtr *, addr, saddr); struct qrtr_sock *ipc = qrtr_sk(sock->sk); struct sock *sk = sock->sk; int rc; if (len < sizeof(*addr) || addr->sq_family != AF_QIPCRTR) return -EINVAL; lock_sock(sk); sk->sk_state = TCP_CLOSE; sock->state = SS_UNCONNECTED; rc = qrtr_autobind(sock); if (rc) { release_sock(sk); return rc; } ipc->peer = *addr; sock->state = SS_CONNECTED; sk->sk_state = TCP_ESTABLISHED; release_sock(sk); return 0; } static int qrtr_getname(struct socket *sock, struct sockaddr *saddr, int peer) { struct qrtr_sock *ipc = qrtr_sk(sock->sk); struct sockaddr_qrtr qaddr; struct sock *sk = sock->sk; lock_sock(sk); if (peer) { if (sk->sk_state != TCP_ESTABLISHED) { release_sock(sk); return -ENOTCONN; } qaddr = ipc->peer; } else { qaddr = ipc->us; } release_sock(sk); qaddr.sq_family = AF_QIPCRTR; memcpy(saddr, &qaddr, sizeof(qaddr)); return sizeof(qaddr); } static int qrtr_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) { void __user *argp = (void __user *)arg; struct qrtr_sock *ipc = qrtr_sk(sock->sk); struct sock *sk = sock->sk; struct sockaddr_qrtr *sq; struct sk_buff *skb; struct ifreq ifr; long len = 0; int rc = 0; lock_sock(sk); switch (cmd) { case TIOCOUTQ: len = sk->sk_sndbuf - sk_wmem_alloc_get(sk); if (len < 0) len = 0; rc = put_user(len, (int __user *)argp); break; case TIOCINQ: skb = skb_peek(&sk->sk_receive_queue); if (skb) len = skb->len; rc = put_user(len, (int __user *)argp); break; case SIOCGIFADDR: if (get_user_ifreq(&ifr, NULL, argp)) { rc = -EFAULT; break; } sq = (struct sockaddr_qrtr *)&ifr.ifr_addr; *sq = ipc->us; if (put_user_ifreq(&ifr, argp)) { rc = -EFAULT; break; } break; case SIOCADDRT: case SIOCDELRT: case SIOCSIFADDR: case SIOCGIFDSTADDR: case SIOCSIFDSTADDR: case SIOCGIFBRDADDR: case SIOCSIFBRDADDR: case SIOCGIFNETMASK: case SIOCSIFNETMASK: rc = -EINVAL; break; default: rc = -ENOIOCTLCMD; break; } release_sock(sk); return rc; } static int qrtr_release(struct socket *sock) { struct sock *sk = sock->sk; struct qrtr_sock *ipc; if (!sk) return 0; lock_sock(sk); ipc = qrtr_sk(sk); sk->sk_shutdown = SHUTDOWN_MASK; if (!sock_flag(sk, SOCK_DEAD)) sk->sk_state_change(sk); sock_set_flag(sk, SOCK_DEAD); sock_orphan(sk); sock->sk = NULL; if (!sock_flag(sk, SOCK_ZAPPED)) qrtr_port_remove(ipc); 
skb_queue_purge(&sk->sk_receive_queue); release_sock(sk); sock_put(sk); return 0; } static const struct proto_ops qrtr_proto_ops = { .owner = THIS_MODULE, .family = AF_QIPCRTR, .bind = qrtr_bind, .connect = qrtr_connect, .socketpair = sock_no_socketpair, .accept = sock_no_accept, .listen = sock_no_listen, .sendmsg = qrtr_sendmsg, .recvmsg = qrtr_recvmsg, .getname = qrtr_getname, .ioctl = qrtr_ioctl, .gettstamp = sock_gettstamp, .poll = datagram_poll, .shutdown = sock_no_shutdown, .release = qrtr_release, .mmap = sock_no_mmap, }; static struct proto qrtr_proto = { .name = "QIPCRTR", .owner = THIS_MODULE, .obj_size = sizeof(struct qrtr_sock), }; static int qrtr_create(struct net *net, struct socket *sock, int protocol, int kern) { struct qrtr_sock *ipc; struct sock *sk; if (sock->type != SOCK_DGRAM) return -EPROTOTYPE; sk = sk_alloc(net, AF_QIPCRTR, GFP_KERNEL, &qrtr_proto, kern); if (!sk) return -ENOMEM; sock_set_flag(sk, SOCK_ZAPPED); sock_init_data(sock, sk); sock->ops = &qrtr_proto_ops; ipc = qrtr_sk(sk); ipc->us.sq_family = AF_QIPCRTR; ipc->us.sq_node = qrtr_local_nid; ipc->us.sq_port = 0; return 0; } static const struct net_proto_family qrtr_family = { .owner = THIS_MODULE, .family = AF_QIPCRTR, .create = qrtr_create, }; static int __init qrtr_proto_init(void) { int rc; rc = proto_register(&qrtr_proto, 1); if (rc) return rc; rc = sock_register(&qrtr_family); if (rc) goto err_proto; rc = qrtr_ns_init(); if (rc) goto err_sock; return 0; err_sock: sock_unregister(qrtr_family.family); err_proto: proto_unregister(&qrtr_proto); return rc; } postcore_initcall(qrtr_proto_init); static void __exit qrtr_proto_fini(void) { qrtr_ns_remove(); sock_unregister(qrtr_family.family); proto_unregister(&qrtr_proto); } module_exit(qrtr_proto_fini); MODULE_DESCRIPTION("Qualcomm IPC-router driver"); MODULE_LICENSE("GPL v2"); MODULE_ALIAS_NETPROTO(PF_QIPCRTR);
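To show how the endpoint interface above is meant to be used, here is a minimal transport sketch. It relies only on what this file exports: the ->xmit() callback of struct qrtr_endpoint, qrtr_endpoint_register() with QRTR_EP_NID_AUTO, qrtr_endpoint_post() for frames arriving from the link, and qrtr_endpoint_unregister(). The example_* names and the link itself are hypothetical.

/* Hedged sketch of a qrtr transport driver built on the endpoint API above */
struct example_ep {
	struct qrtr_endpoint ep;
	/* transport-specific state (link handle, queues, ...) would live here */
};

static int example_xmit(struct qrtr_endpoint *ep, struct sk_buff *skb)
{
	/* A real transport would push skb->data / skb->len out on its link here */
	consume_skb(skb);
	return 0;
}

static int example_attach(struct example_ep *xep)
{
	xep->ep.xmit = example_xmit;
	/* Let the core learn the node id from incoming packets */
	return qrtr_endpoint_register(&xep->ep, QRTR_EP_NID_AUTO);
}

/* Frames received from the link are handed back to the core like this */
static void example_rx(struct example_ep *xep, const void *data, size_t len)
{
	if (qrtr_endpoint_post(&xep->ep, data, len) < 0)
		pr_debug("example_rx: malformed qrtr frame dropped\n");
}

static void example_detach(struct example_ep *xep)
{
	qrtr_endpoint_unregister(&xep->ep);
}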
/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_PIPE_FS_I_H #define _LINUX_PIPE_FS_I_H #define PIPE_DEF_BUFFERS 16 #define PIPE_BUF_FLAG_LRU 0x01 /* page is on the LRU */ #define PIPE_BUF_FLAG_ATOMIC 0x02 /* was atomically mapped */ #define PIPE_BUF_FLAG_GIFT 0x04 /* page is a gift */ #define PIPE_BUF_FLAG_PACKET 0x08 /* read() as a packet */ #define PIPE_BUF_FLAG_CAN_MERGE 0x10 /* can merge buffers */ #define PIPE_BUF_FLAG_WHOLE 0x20 /* read() must return entire buffer or error */ #ifdef CONFIG_WATCH_QUEUE #define PIPE_BUF_FLAG_LOSS 0x40 /* Message loss happened after this buffer */ #endif /** * struct pipe_buffer - a linux kernel pipe buffer * @page: the page containing the data for the pipe buffer * @offset: offset of data inside the @page * @len: length of data inside the @page * @ops: operations associated with this buffer. See @pipe_buf_operations. * @flags: pipe buffer flags. See above. * @private: private data owned by the ops. **/ struct pipe_buffer { struct page *page; unsigned int offset, len; const struct pipe_buf_operations *ops; unsigned int flags; unsigned long private; }; /* * Really only alpha needs 32-bit fields, but * might as well do it for 64-bit architectures * since that's what we've historically done, * and it makes 'head_tail' always be a simple * 'unsigned long'. 
*/ #ifdef CONFIG_64BIT typedef unsigned int pipe_index_t; #else typedef unsigned short pipe_index_t; #endif /** * struct pipe_index - pipe indeces * @head: The point of buffer production * @tail: The point of buffer consumption * @head_tail: unsigned long union of @head and @tail */ union pipe_index { unsigned long head_tail; struct { pipe_index_t head; pipe_index_t tail; }; }; /** * struct pipe_inode_info - a linux kernel pipe * @mutex: mutex protecting the whole thing * @rd_wait: reader wait point in case of empty pipe * @wr_wait: writer wait point in case of full pipe * @pipe_index: the pipe indeces * @note_loss: The next read() should insert a data-lost message * @max_usage: The maximum number of slots that may be used in the ring * @ring_size: total number of buffers (should be a power of 2) * @nr_accounted: The amount this pipe accounts for in user->pipe_bufs * @tmp_page: cached released page * @readers: number of current readers of this pipe * @writers: number of current writers of this pipe * @files: number of struct file referring this pipe (protected by ->i_lock) * @r_counter: reader counter * @w_counter: writer counter * @poll_usage: is this pipe used for epoll, which has crazy wakeups? * @fasync_readers: reader side fasync * @fasync_writers: writer side fasync * @bufs: the circular array of pipe buffers * @user: the user who created this pipe * @watch_queue: If this pipe is a watch_queue, this is the stuff for that **/ struct pipe_inode_info { struct mutex mutex; wait_queue_head_t rd_wait, wr_wait; union pipe_index; unsigned int max_usage; unsigned int ring_size; unsigned int nr_accounted; unsigned int readers; unsigned int writers; unsigned int files; unsigned int r_counter; unsigned int w_counter; bool poll_usage; #ifdef CONFIG_WATCH_QUEUE bool note_loss; #endif struct page *tmp_page[2]; struct fasync_struct *fasync_readers; struct fasync_struct *fasync_writers; struct pipe_buffer *bufs; struct user_struct *user; #ifdef CONFIG_WATCH_QUEUE struct watch_queue *watch_queue; #endif }; /* * Note on the nesting of these functions: * * ->confirm() * ->try_steal() * * That is, ->try_steal() must be called on a confirmed buffer. See below for * the meaning of each operation. Also see the kerneldoc in fs/pipe.c for the * pipe and generic variants of these hooks. */ struct pipe_buf_operations { /* * ->confirm() verifies that the data in the pipe buffer is there * and that the contents are good. If the pages in the pipe belong * to a file system, we may need to wait for IO completion in this * hook. Returns 0 for good, or a negative error value in case of * error. If not present all pages are considered good. */ int (*confirm)(struct pipe_inode_info *, struct pipe_buffer *); /* * When the contents of this pipe buffer has been completely * consumed by a reader, ->release() is called. */ void (*release)(struct pipe_inode_info *, struct pipe_buffer *); /* * Attempt to take ownership of the pipe buffer and its contents. * ->try_steal() returns %true for success, in which case the contents * of the pipe (the buf->page) is locked and now completely owned by the * caller. The page may then be transferred to a different mapping, the * most often used case is insertion into different file address space * cache. */ bool (*try_steal)(struct pipe_inode_info *, struct pipe_buffer *); /* * Get a reference to the pipe buffer. */ bool (*get)(struct pipe_inode_info *, struct pipe_buffer *); }; /** * pipe_has_watch_queue - Check whether the pipe is a watch_queue, * i.e. 
it was created with O_NOTIFICATION_PIPE * @pipe: The pipe to check * * Return: true if pipe is a watch queue, false otherwise. */ static inline bool pipe_has_watch_queue(const struct pipe_inode_info *pipe) { #ifdef CONFIG_WATCH_QUEUE return pipe->watch_queue != NULL; #else return false; #endif } /** * pipe_occupancy - Return number of slots used in the pipe * @head: The pipe ring head pointer * @tail: The pipe ring tail pointer */ static inline unsigned int pipe_occupancy(unsigned int head, unsigned int tail) { return (pipe_index_t)(head - tail); } /** * pipe_empty - Return true if the pipe is empty * @head: The pipe ring head pointer * @tail: The pipe ring tail pointer */ static inline bool pipe_empty(unsigned int head, unsigned int tail) { return !pipe_occupancy(head, tail); } /** * pipe_full - Return true if the pipe is full * @head: The pipe ring head pointer * @tail: The pipe ring tail pointer * @limit: The maximum amount of slots available. */ static inline bool pipe_full(unsigned int head, unsigned int tail, unsigned int limit) { return pipe_occupancy(head, tail) >= limit; } /** * pipe_is_full - Return true if the pipe is full * @pipe: the pipe */ static inline bool pipe_is_full(const struct pipe_inode_info *pipe) { return pipe_full(pipe->head, pipe->tail, pipe->max_usage); } /** * pipe_is_empty - Return true if the pipe is empty * @pipe: the pipe */ static inline bool pipe_is_empty(const struct pipe_inode_info *pipe) { return pipe_empty(pipe->head, pipe->tail); } /** * pipe_buf_usage - Return how many pipe buffers are in use * @pipe: the pipe */ static inline unsigned int pipe_buf_usage(const struct pipe_inode_info *pipe) { return pipe_occupancy(pipe->head, pipe->tail); } /** * pipe_buf - Return the pipe buffer for the specified slot in the pipe ring * @pipe: The pipe to access * @slot: The slot of interest */ static inline struct pipe_buffer *pipe_buf(const struct pipe_inode_info *pipe, unsigned int slot) { return &pipe->bufs[slot & (pipe->ring_size - 1)]; } /** * pipe_head_buf - Return the pipe buffer at the head of the pipe ring * @pipe: The pipe to access */ static inline struct pipe_buffer *pipe_head_buf(const struct pipe_inode_info *pipe) { return pipe_buf(pipe, pipe->head); } /** * pipe_buf_get - get a reference to a pipe_buffer * @pipe: the pipe that the buffer belongs to * @buf: the buffer to get a reference to * * Return: %true if the reference was successfully obtained. 
*/ static inline __must_check bool pipe_buf_get(struct pipe_inode_info *pipe, struct pipe_buffer *buf) { return buf->ops->get(pipe, buf); } /** * pipe_buf_release - put a reference to a pipe_buffer * @pipe: the pipe that the buffer belongs to * @buf: the buffer to put a reference to */ static inline void pipe_buf_release(struct pipe_inode_info *pipe, struct pipe_buffer *buf) { const struct pipe_buf_operations *ops = buf->ops; buf->ops = NULL; ops->release(pipe, buf); } /** * pipe_buf_confirm - verify contents of the pipe buffer * @pipe: the pipe that the buffer belongs to * @buf: the buffer to confirm */ static inline int pipe_buf_confirm(struct pipe_inode_info *pipe, struct pipe_buffer *buf) { if (!buf->ops->confirm) return 0; return buf->ops->confirm(pipe, buf); } /** * pipe_buf_try_steal - attempt to take ownership of a pipe_buffer * @pipe: the pipe that the buffer belongs to * @buf: the buffer to attempt to steal */ static inline bool pipe_buf_try_steal(struct pipe_inode_info *pipe, struct pipe_buffer *buf) { if (!buf->ops->try_steal) return false; return buf->ops->try_steal(pipe, buf); } /* Differs from PIPE_BUF in that PIPE_SIZE is the length of the actual memory allocation, whereas PIPE_BUF makes atomicity guarantees. */ #define PIPE_SIZE PAGE_SIZE /* Pipe lock and unlock operations */ void pipe_lock(struct pipe_inode_info *); void pipe_unlock(struct pipe_inode_info *); void pipe_double_lock(struct pipe_inode_info *, struct pipe_inode_info *); /* Wait for a pipe to be readable/writable while dropping the pipe lock */ void pipe_wait_readable(struct pipe_inode_info *); void pipe_wait_writable(struct pipe_inode_info *); struct pipe_inode_info *alloc_pipe_info(void); void free_pipe_info(struct pipe_inode_info *); /* Generic pipe buffer ops functions */ bool generic_pipe_buf_get(struct pipe_inode_info *, struct pipe_buffer *); bool generic_pipe_buf_try_steal(struct pipe_inode_info *, struct pipe_buffer *); void generic_pipe_buf_release(struct pipe_inode_info *, struct pipe_buffer *); extern const struct pipe_buf_operations nosteal_pipe_buf_ops; unsigned long account_pipe_buffers(struct user_struct *user, unsigned long old, unsigned long new); bool too_many_pipe_buffers_soft(unsigned long user_bufs); bool too_many_pipe_buffers_hard(unsigned long user_bufs); bool pipe_is_unprivileged_user(void); /* for F_SETPIPE_SZ and F_GETPIPE_SZ */ int pipe_resize_ring(struct pipe_inode_info *pipe, unsigned int nr_slots); long pipe_fcntl(struct file *, unsigned int, unsigned int arg); struct pipe_inode_info *get_pipe_info(struct file *file, bool for_splice); int create_pipe_files(struct file **, int); unsigned int round_pipe_size(unsigned int size); #endif
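The helpers above are the building blocks that the fs/pipe.c read/write paths are written in terms of. As an editorial illustration only (not the actual fs/pipe.c code), a consumer that drains every buffer currently in the ring might look like the sketch below; example_drain_pipe() and the elided copy step are hypothetical, and the ring is assumed to be protected by pipe->mutex as described in the struct documentation.

/* Illustrative sketch: consume all complete buffers in the ring. */
static void example_drain_pipe(struct pipe_inode_info *pipe)
{
	mutex_lock(&pipe->mutex);
	while (!pipe_empty(pipe->head, pipe->tail)) {
		struct pipe_buffer *buf = pipe_buf(pipe, pipe->tail);

		/* Make sure the backing page is up to date before use. */
		if (pipe_buf_confirm(pipe, buf) < 0)
			break;

		/* ... copy buf->len bytes starting at buf->offset here ... */

		/* Drop the buffer's ops/page reference, then free the slot. */
		pipe_buf_release(pipe, buf);
		pipe->tail++;
	}
	mutex_unlock(&pipe->mutex);
}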
/* SPDX-License-Identifier: GPL-2.0-or-later */ #ifndef _NET_NETDEV_LOCK_H #define _NET_NETDEV_LOCK_H #include <linux/lockdep.h> #include <linux/netdevice.h> #include <linux/rtnetlink.h> static inline bool netdev_trylock(struct net_device *dev) { return mutex_trylock(&dev->lock); } static inline void netdev_assert_locked(const struct net_device *dev) { lockdep_assert_held(&dev->lock); } static inline void netdev_assert_locked_or_invisible(const struct net_device *dev) { if (dev->reg_state == NETREG_REGISTERED || dev->reg_state == NETREG_UNREGISTERING) netdev_assert_locked(dev); } static inline bool netdev_need_ops_lock(const struct net_device *dev) { bool ret = dev->request_ops_lock || !!dev->queue_mgmt_ops; #if IS_ENABLED(CONFIG_NET_SHAPER) ret |= !!dev->netdev_ops->net_shaper_ops; #endif return ret; } static inline void netdev_lock_ops(struct net_device *dev) { if (netdev_need_ops_lock(dev)) netdev_lock(dev); } static inline void netdev_unlock_ops(struct net_device *dev) { if (netdev_need_ops_lock(dev)) netdev_unlock(dev); } static inline void netdev_lock_ops_to_full(struct net_device *dev) { if (netdev_need_ops_lock(dev)) netdev_assert_locked(dev); else netdev_lock(dev); } static inline void netdev_unlock_full_to_ops(struct net_device *dev) { if (netdev_need_ops_lock(dev)) netdev_assert_locked(dev); else netdev_unlock(dev); } static inline void netdev_ops_assert_locked(const struct net_device *dev) { if (netdev_need_ops_lock(dev)) lockdep_assert_held(&dev->lock); else ASSERT_RTNL(); } static inline void netdev_ops_assert_locked_or_invisible(const struct net_device *dev) { if (dev->reg_state == NETREG_REGISTERED || dev->reg_state == NETREG_UNREGISTERING) netdev_ops_assert_locked(dev); } static inline void netdev_lock_ops_compat(struct net_device *dev) { if (netdev_need_ops_lock(dev)) netdev_lock(dev); else rtnl_lock(); } static inline void netdev_unlock_ops_compat(struct net_device *dev) { if (netdev_need_ops_lock(dev)) netdev_unlock(dev); else rtnl_unlock(); } static inline int netdev_lock_cmp_fn(const struct lockdep_map *a, const struct lockdep_map *b) { if (a == b) return 0; /* Allow locking multiple devices only under rtnl_lock, * the exact order doesn't matter. * Note that upper devices don't lock their ops, so nesting * mostly happens in batched device removal for now. */ return lockdep_rtnl_is_held() ?
-1 : 1; } #define netdev_lockdep_set_classes(dev) \ { \ static struct lock_class_key qdisc_tx_busylock_key; \ static struct lock_class_key qdisc_xmit_lock_key; \ static struct lock_class_key dev_addr_list_lock_key; \ static struct lock_class_key dev_instance_lock_key; \ unsigned int i; \ \ (dev)->qdisc_tx_busylock = &qdisc_tx_busylock_key; \ lockdep_set_class(&(dev)->addr_list_lock, \ &dev_addr_list_lock_key); \ lockdep_set_class(&(dev)->lock, \ &dev_instance_lock_key); \ lock_set_cmp_fn(&dev->lock, netdev_lock_cmp_fn, NULL); \ for (i = 0; i < (dev)->num_tx_queues; i++) \ lockdep_set_class(&(dev)->_tx[i]._xmit_lock, \ &qdisc_xmit_lock_key); \ } #define netdev_lock_dereference(p, dev) \ rcu_dereference_protected(p, lockdep_is_held(&(dev)->lock)) int netdev_debug_event(struct notifier_block *nb, unsigned long event, void *ptr); #endif
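To show how these helpers combine, here is a hedged sketch of a control-path function that needs the per-device instance lock when the device opts into ops locking and falls back to RTNL otherwise; example_reconfig() is a hypothetical caller, not an API from this header.

/* Illustrative sketch: take whichever lock protects the ops for this device. */
static int example_reconfig(struct net_device *dev)
{
	int err = 0;

	netdev_lock_ops_compat(dev);	/* dev->lock, or rtnl_lock() as fallback */
	netdev_ops_assert_locked(dev);	/* satisfied in either branch */

	/* ... touch state covered by the ops lock here ... */

	netdev_unlock_ops_compat(dev);
	return err;
}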
/* SPDX-License-Identifier: GPL-2.0 */ #ifndef __LINUX_FIND_H_ #define __LINUX_FIND_H_ #ifndef __LINUX_BITMAP_H #error only <linux/bitmap.h> can be included directly #endif #include <linux/bitops.h> unsigned long _find_next_bit(const unsigned long *addr1, unsigned long nbits, unsigned long start); unsigned long _find_next_and_bit(const unsigned long *addr1, const unsigned long *addr2, unsigned long nbits, unsigned long start); unsigned long _find_next_andnot_bit(const unsigned long *addr1, const unsigned long *addr2, unsigned long nbits, unsigned long start); unsigned long _find_next_or_bit(const unsigned long *addr1, const unsigned long *addr2, unsigned long nbits, unsigned long start); unsigned long _find_next_zero_bit(const unsigned long *addr, unsigned long nbits,
unsigned long start); extern unsigned long _find_first_bit(const unsigned long *addr, unsigned long size); unsigned long __find_nth_bit(const unsigned long *addr, unsigned long size, unsigned long n); unsigned long __find_nth_and_bit(const unsigned long *addr1, const unsigned long *addr2, unsigned long size, unsigned long n); unsigned long __find_nth_andnot_bit(const unsigned long *addr1, const unsigned long *addr2, unsigned long size, unsigned long n); unsigned long __find_nth_and_andnot_bit(const unsigned long *addr1, const unsigned long *addr2, const unsigned long *addr3, unsigned long size, unsigned long n); extern unsigned long _find_first_and_bit(const unsigned long *addr1, const unsigned long *addr2, unsigned long size); unsigned long _find_first_andnot_bit(const unsigned long *addr1, const unsigned long *addr2, unsigned long size); unsigned long _find_first_and_and_bit(const unsigned long *addr1, const unsigned long *addr2, const unsigned long *addr3, unsigned long size); extern unsigned long _find_first_zero_bit(const unsigned long *addr, unsigned long size); extern unsigned long _find_last_bit(const unsigned long *addr, unsigned long size); #ifdef __BIG_ENDIAN unsigned long _find_first_zero_bit_le(const unsigned long *addr, unsigned long size); unsigned long _find_next_zero_bit_le(const unsigned long *addr, unsigned long size, unsigned long offset); unsigned long _find_next_bit_le(const unsigned long *addr, unsigned long size, unsigned long offset); #endif unsigned long find_random_bit(const unsigned long *addr, unsigned long size); #ifndef find_next_bit /** * find_next_bit - find the next set bit in a memory region * @addr: The address to base the search on * @size: The bitmap size in bits * @offset: The bitnumber to start searching at * * Returns the bit number for the next set bit * If no bits are set, returns @size. */ static __always_inline unsigned long find_next_bit(const unsigned long *addr, unsigned long size, unsigned long offset) { if (small_const_nbits(size)) { unsigned long val; if (unlikely(offset >= size)) return size; val = *addr & GENMASK(size - 1, offset); return val ? __ffs(val) : size; } return _find_next_bit(addr, size, offset); } #endif #ifndef find_next_and_bit /** * find_next_and_bit - find the next set bit in both memory regions * @addr1: The first address to base the search on * @addr2: The second address to base the search on * @size: The bitmap size in bits * @offset: The bitnumber to start searching at * * Returns the bit number for the next set bit * If no bits are set, returns @size. */ static __always_inline unsigned long find_next_and_bit(const unsigned long *addr1, const unsigned long *addr2, unsigned long size, unsigned long offset) { if (small_const_nbits(size)) { unsigned long val; if (unlikely(offset >= size)) return size; val = *addr1 & *addr2 & GENMASK(size - 1, offset); return val ? __ffs(val) : size; } return _find_next_and_bit(addr1, addr2, size, offset); } #endif #ifndef find_next_andnot_bit /** * find_next_andnot_bit - find the next set bit in *addr1 excluding all the bits * in *addr2 * @addr1: The first address to base the search on * @addr2: The second address to base the search on * @size: The bitmap size in bits * @offset: The bitnumber to start searching at * * Returns the bit number for the next set bit * If no bits are set, returns @size. 
*/ static __always_inline unsigned long find_next_andnot_bit(const unsigned long *addr1, const unsigned long *addr2, unsigned long size, unsigned long offset) { if (small_const_nbits(size)) { unsigned long val; if (unlikely(offset >= size)) return size; val = *addr1 & ~*addr2 & GENMASK(size - 1, offset); return val ? __ffs(val) : size; } return _find_next_andnot_bit(addr1, addr2, size, offset); } #endif #ifndef find_next_or_bit /** * find_next_or_bit - find the next set bit in either memory regions * @addr1: The first address to base the search on * @addr2: The second address to base the search on * @size: The bitmap size in bits * @offset: The bitnumber to start searching at * * Returns the bit number for the next set bit * If no bits are set, returns @size. */ static __always_inline unsigned long find_next_or_bit(const unsigned long *addr1, const unsigned long *addr2, unsigned long size, unsigned long offset) { if (small_const_nbits(size)) { unsigned long val; if (unlikely(offset >= size)) return size; val = (*addr1 | *addr2) & GENMASK(size - 1, offset); return val ? __ffs(val) : size; } return _find_next_or_bit(addr1, addr2, size, offset); } #endif #ifndef find_next_zero_bit /** * find_next_zero_bit - find the next cleared bit in a memory region * @addr: The address to base the search on * @size: The bitmap size in bits * @offset: The bitnumber to start searching at * * Returns the bit number of the next zero bit * If no bits are zero, returns @size. */ static __always_inline unsigned long find_next_zero_bit(const unsigned long *addr, unsigned long size, unsigned long offset) { if (small_const_nbits(size)) { unsigned long val; if (unlikely(offset >= size)) return size; val = *addr | ~GENMASK(size - 1, offset); return val == ~0UL ? size : ffz(val); } return _find_next_zero_bit(addr, size, offset); } #endif #ifndef find_first_bit /** * find_first_bit - find the first set bit in a memory region * @addr: The address to start the search at * @size: The maximum number of bits to search * * Returns the bit number of the first set bit. * If no bits are set, returns @size. */ static __always_inline unsigned long find_first_bit(const unsigned long *addr, unsigned long size) { if (small_const_nbits(size)) { unsigned long val = *addr & GENMASK(size - 1, 0); return val ? __ffs(val) : size; } return _find_first_bit(addr, size); } #endif /** * find_nth_bit - find N'th set bit in a memory region * @addr: The address to start the search at * @size: The maximum number of bits to search * @n: The number of set bit, which position is needed, counting from 0 * * The following is semantically equivalent: * idx = find_nth_bit(addr, size, 0); * idx = find_first_bit(addr, size); * * Returns the bit number of the N'th set bit. * If no such, returns >= @size. */ static __always_inline unsigned long find_nth_bit(const unsigned long *addr, unsigned long size, unsigned long n) { if (n >= size) return size; if (small_const_nbits(size)) { unsigned long val = *addr & GENMASK(size - 1, 0); return val ? fns(val, n) : size; } return __find_nth_bit(addr, size, n); } /** * find_nth_and_bit - find N'th set bit in 2 memory regions * @addr1: The 1st address to start the search at * @addr2: The 2nd address to start the search at * @size: The maximum number of bits to search * @n: The number of set bit, which position is needed, counting from 0 * * Returns the bit number of the N'th set bit. * If no such, returns @size. 
*/ static __always_inline unsigned long find_nth_and_bit(const unsigned long *addr1, const unsigned long *addr2, unsigned long size, unsigned long n) { if (n >= size) return size; if (small_const_nbits(size)) { unsigned long val = *addr1 & *addr2 & GENMASK(size - 1, 0); return val ? fns(val, n) : size; } return __find_nth_and_bit(addr1, addr2, size, n); } /** * find_nth_and_andnot_bit - find N'th set bit in 2 memory regions, * excluding those set in 3rd region * @addr1: The 1st address to start the search at * @addr2: The 2nd address to start the search at * @addr3: The 3rd address to start the search at * @size: The maximum number of bits to search * @n: The number of set bit, which position is needed, counting from 0 * * Returns the bit number of the N'th set bit. * If no such, returns @size. */ static __always_inline unsigned long find_nth_and_andnot_bit(const unsigned long *addr1, const unsigned long *addr2, const unsigned long *addr3, unsigned long size, unsigned long n) { if (n >= size) return size; if (small_const_nbits(size)) { unsigned long val = *addr1 & *addr2 & (~*addr3) & GENMASK(size - 1, 0); return val ? fns(val, n) : size; } return __find_nth_and_andnot_bit(addr1, addr2, addr3, size, n); } #ifndef find_first_and_bit /** * find_first_and_bit - find the first set bit in both memory regions * @addr1: The first address to base the search on * @addr2: The second address to base the search on * @size: The bitmap size in bits * * Returns the bit number for the next set bit * If no bits are set, returns @size. */ static __always_inline unsigned long find_first_and_bit(const unsigned long *addr1, const unsigned long *addr2, unsigned long size) { if (small_const_nbits(size)) { unsigned long val = *addr1 & *addr2 & GENMASK(size - 1, 0); return val ? __ffs(val) : size; } return _find_first_and_bit(addr1, addr2, size); } #endif /** * find_first_andnot_bit - find the first bit set in 1st memory region and unset in 2nd * @addr1: The first address to base the search on * @addr2: The second address to base the search on * @size: The bitmap size in bits * * Returns the bit number for the first set bit * If no bits are set, returns >= @size. */ static __always_inline unsigned long find_first_andnot_bit(const unsigned long *addr1, const unsigned long *addr2, unsigned long size) { if (small_const_nbits(size)) { unsigned long val = *addr1 & (~*addr2) & GENMASK(size - 1, 0); return val ? __ffs(val) : size; } return _find_first_andnot_bit(addr1, addr2, size); } /** * find_first_and_and_bit - find the first set bit in 3 memory regions * @addr1: The first address to base the search on * @addr2: The second address to base the search on * @addr3: The third address to base the search on * @size: The bitmap size in bits * * Returns the bit number for the first set bit * If no bits are set, returns @size. */ static __always_inline unsigned long find_first_and_and_bit(const unsigned long *addr1, const unsigned long *addr2, const unsigned long *addr3, unsigned long size) { if (small_const_nbits(size)) { unsigned long val = *addr1 & *addr2 & *addr3 & GENMASK(size - 1, 0); return val ? __ffs(val) : size; } return _find_first_and_and_bit(addr1, addr2, addr3, size); } #ifndef find_first_zero_bit /** * find_first_zero_bit - find the first cleared bit in a memory region * @addr: The address to start the search at * @size: The maximum number of bits to search * * Returns the bit number of the first cleared bit. * If no bits are zero, returns @size. 
*/ static __always_inline unsigned long find_first_zero_bit(const unsigned long *addr, unsigned long size) { if (small_const_nbits(size)) { unsigned long val = *addr | ~GENMASK(size - 1, 0); return val == ~0UL ? size : ffz(val); } return _find_first_zero_bit(addr, size); } #endif #ifndef find_last_bit /** * find_last_bit - find the last set bit in a memory region * @addr: The address to start the search at * @size: The number of bits to search * * Returns the bit number of the last set bit, or size. */ static __always_inline unsigned long find_last_bit(const unsigned long *addr, unsigned long size) { if (small_const_nbits(size)) { unsigned long val = *addr & GENMASK(size - 1, 0); return val ? __fls(val) : size; } return _find_last_bit(addr, size); } #endif /** * find_next_and_bit_wrap - find the next set bit in both memory regions * @addr1: The first address to base the search on * @addr2: The second address to base the search on * @size: The bitmap size in bits * @offset: The bitnumber to start searching at * * Returns the bit number for the next set bit, or first set bit up to @offset * If no bits are set, returns @size. */ static __always_inline unsigned long find_next_and_bit_wrap(const unsigned long *addr1, const unsigned long *addr2, unsigned long size, unsigned long offset) { unsigned long bit = find_next_and_bit(addr1, addr2, size, offset); if (bit < size || offset == 0) return bit; bit = find_first_and_bit(addr1, addr2, offset); return bit < offset ? bit : size; } /** * find_next_bit_wrap - find the next set bit in a memory region * @addr: The address to base the search on * @size: The bitmap size in bits * @offset: The bitnumber to start searching at * * Returns the bit number for the next set bit, or first set bit up to @offset * If no bits are set, returns @size. */ static __always_inline unsigned long find_next_bit_wrap(const unsigned long *addr, unsigned long size, unsigned long offset) { unsigned long bit = find_next_bit(addr, size, offset); if (bit < size || offset == 0) return bit; bit = find_first_bit(addr, offset); return bit < offset ? bit : size; } /* * Helper for for_each_set_bit_wrap(). Make sure you're doing right thing * before using it alone. */ static __always_inline unsigned long __for_each_wrap(const unsigned long *bitmap, unsigned long size, unsigned long start, unsigned long n) { unsigned long bit; /* If not wrapped around */ if (n > start) { /* and have a bit, just return it. */ bit = find_next_bit(bitmap, size, n); if (bit < size) return bit; /* Otherwise, wrap around and ... */ n = 0; } /* Search the other part. */ bit = find_next_bit(bitmap, start, n); return bit < start ? bit : size; } /** * find_next_clump8 - find next 8-bit clump with set bits in a memory region * @clump: location to store copy of found clump * @addr: address to base the search on * @size: bitmap size in number of bits * @offset: bit offset at which to start searching * * Returns the bit offset for the next set clump; the found clump value is * copied to the location pointed by @clump. If no bits are set, returns @size. 
*/ extern unsigned long find_next_clump8(unsigned long *clump, const unsigned long *addr, unsigned long size, unsigned long offset); #define find_first_clump8(clump, bits, size) \ find_next_clump8((clump), (bits), (size), 0) #if defined(__LITTLE_ENDIAN) static __always_inline unsigned long find_next_zero_bit_le(const void *addr, unsigned long size, unsigned long offset) { return find_next_zero_bit(addr, size, offset); } static __always_inline unsigned long find_next_bit_le(const void *addr, unsigned long size, unsigned long offset) { return find_next_bit(addr, size, offset); } static __always_inline unsigned long find_first_zero_bit_le(const void *addr, unsigned long size) { return find_first_zero_bit(addr, size); } #elif defined(__BIG_ENDIAN) #ifndef find_next_zero_bit_le static __always_inline unsigned long find_next_zero_bit_le(const void *addr, unsigned long size, unsigned long offset) { if (small_const_nbits(size)) { unsigned long val = *(const unsigned long *)addr; if (unlikely(offset >= size)) return size; val = swab(val) | ~GENMASK(size - 1, offset); return val == ~0UL ? size : ffz(val); } return _find_next_zero_bit_le(addr, size, offset); } #endif #ifndef find_first_zero_bit_le static __always_inline unsigned long find_first_zero_bit_le(const void *addr, unsigned long size) { if (small_const_nbits(size)) { unsigned long val = swab(*(const unsigned long *)addr) | ~GENMASK(size - 1, 0); return val == ~0UL ? size : ffz(val); } return _find_first_zero_bit_le(addr, size); } #endif #ifndef find_next_bit_le static __always_inline unsigned long find_next_bit_le(const void *addr, unsigned long size, unsigned long offset) { if (small_const_nbits(size)) { unsigned long val = *(const unsigned long *)addr; if (unlikely(offset >= size)) return size; val = swab(val) & GENMASK(size - 1, offset); return val ? 
__ffs(val) : size; } return _find_next_bit_le(addr, size, offset); } #endif #else #error "Please fix <asm/byteorder.h>" #endif #define for_each_set_bit(bit, addr, size) \ for ((bit) = 0; (bit) = find_next_bit((addr), (size), (bit)), (bit) < (size); (bit)++) #define for_each_and_bit(bit, addr1, addr2, size) \ for ((bit) = 0; \ (bit) = find_next_and_bit((addr1), (addr2), (size), (bit)), (bit) < (size);\ (bit)++) #define for_each_andnot_bit(bit, addr1, addr2, size) \ for ((bit) = 0; \ (bit) = find_next_andnot_bit((addr1), (addr2), (size), (bit)), (bit) < (size);\ (bit)++) #define for_each_or_bit(bit, addr1, addr2, size) \ for ((bit) = 0; \ (bit) = find_next_or_bit((addr1), (addr2), (size), (bit)), (bit) < (size);\ (bit)++) /* same as for_each_set_bit() but use bit as value to start with */ #define for_each_set_bit_from(bit, addr, size) \ for (; (bit) = find_next_bit((addr), (size), (bit)), (bit) < (size); (bit)++) #define for_each_clear_bit(bit, addr, size) \ for ((bit) = 0; \ (bit) = find_next_zero_bit((addr), (size), (bit)), (bit) < (size); \ (bit)++) /* same as for_each_clear_bit() but use bit as value to start with */ #define for_each_clear_bit_from(bit, addr, size) \ for (; (bit) = find_next_zero_bit((addr), (size), (bit)), (bit) < (size); (bit)++) /** * for_each_set_bitrange - iterate over all set bit ranges [b; e) * @b: bit offset of start of current bitrange (first set bit) * @e: bit offset of end of current bitrange (first unset bit) * @addr: bitmap address to base the search on * @size: bitmap size in number of bits */ #define for_each_set_bitrange(b, e, addr, size) \ for ((b) = 0; \ (b) = find_next_bit((addr), (size), b), \ (e) = find_next_zero_bit((addr), (size), (b) + 1), \ (b) < (size); \ (b) = (e) + 1) /** * for_each_set_bitrange_from - iterate over all set bit ranges [b; e) * @b: bit offset of start of current bitrange (first set bit); must be initialized * @e: bit offset of end of current bitrange (first unset bit) * @addr: bitmap address to base the search on * @size: bitmap size in number of bits */ #define for_each_set_bitrange_from(b, e, addr, size) \ for (; \ (b) = find_next_bit((addr), (size), (b)), \ (e) = find_next_zero_bit((addr), (size), (b) + 1), \ (b) < (size); \ (b) = (e) + 1) /** * for_each_clear_bitrange - iterate over all unset bit ranges [b; e) * @b: bit offset of start of current bitrange (first unset bit) * @e: bit offset of end of current bitrange (first set bit) * @addr: bitmap address to base the search on * @size: bitmap size in number of bits */ #define for_each_clear_bitrange(b, e, addr, size) \ for ((b) = 0; \ (b) = find_next_zero_bit((addr), (size), (b)), \ (e) = find_next_bit((addr), (size), (b) + 1), \ (b) < (size); \ (b) = (e) + 1) /** * for_each_clear_bitrange_from - iterate over all unset bit ranges [b; e) * @b: bit offset of start of current bitrange (first set bit); must be initialized * @e: bit offset of end of current bitrange (first unset bit) * @addr: bitmap address to base the search on * @size: bitmap size in number of bits */ #define for_each_clear_bitrange_from(b, e, addr, size) \ for (; \ (b) = find_next_zero_bit((addr), (size), (b)), \ (e) = find_next_bit((addr), (size), (b) + 1), \ (b) < (size); \ (b) = (e) + 1) /** * for_each_set_bit_wrap - iterate over all set bits starting from @start, and * wrapping around the end of bitmap. 
* @bit: offset for current iteration * @addr: bitmap address to base the search on * @size: bitmap size in number of bits * @start: Starting bit for bitmap traversing, wrapping around the bitmap end */ #define for_each_set_bit_wrap(bit, addr, size, start) \ for ((bit) = find_next_bit_wrap((addr), (size), (start)); \ (bit) < (size); \ (bit) = __for_each_wrap((addr), (size), (start), (bit) + 1)) /** * for_each_set_clump8 - iterate over bitmap for each 8-bit clump with set bits * @start: bit offset to start search and to store the current iteration offset * @clump: location to store copy of current 8-bit clump * @bits: bitmap address to base the search on * @size: bitmap size in number of bits */ #define for_each_set_clump8(start, clump, bits, size) \ for ((start) = find_first_clump8(&(clump), (bits), (size)); \ (start) < (size); \ (start) = find_next_clump8(&(clump), (bits), (size), (start) + 8)) #endif /*__LINUX_FIND_H_ */
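As a quick reference for the declarations above, the following self-contained sketch shows the typical call patterns. It assumes the usual kernel environment (<linux/bitmap.h> is the header allowed to pull this file in), and example_walk_bitmap() is a hypothetical function added for illustration.

#include <linux/bitmap.h>
#include <linux/printk.h>

/* Illustrative sketch: basic find_*() and for_each_*() usage. */
static void example_walk_bitmap(void)
{
	DECLARE_BITMAP(mask, 64);
	unsigned long bit;

	bitmap_zero(mask, 64);
	__set_bit(3, mask);
	__set_bit(17, mask);

	bit = find_first_bit(mask, 64);			/* -> 3 */
	bit = find_next_bit(mask, 64, bit + 1);		/* -> 17 */
	bit = find_first_zero_bit(mask, 64);		/* -> 0, first free slot */

	for_each_set_bit(bit, mask, 64)
		pr_info("bit %lu is set\n", bit);
}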
// SPDX-License-Identifier: GPL-2.0 /* * Author: Andrei Vagin <avagin@openvz.org> * Author: Dmitry Safonov <dima@arista.com> */ #include <linux/time_namespace.h> #include <linux/user_namespace.h> #include <linux/sched/signal.h> #include <linux/sched/task.h> #include <linux/clocksource.h> #include <linux/seq_file.h> #include <linux/proc_ns.h> #include <linux/export.h> #include <linux/nstree.h> #include <linux/time.h> #include <linux/slab.h> #include <linux/cred.h> #include <linux/err.h> #include <linux/mm.h> #include <vdso/datapage.h> ktime_t do_timens_ktime_to_host(clockid_t clockid, ktime_t tim, struct timens_offsets *ns_offsets) { ktime_t offset; switch (clockid) { case CLOCK_MONOTONIC: offset = timespec64_to_ktime(ns_offsets->monotonic); break; case CLOCK_BOOTTIME: case CLOCK_BOOTTIME_ALARM: offset = timespec64_to_ktime(ns_offsets->boottime); break; default: return tim; } /* * Check that @tim value is in [offset, KTIME_MAX + offset] * and subtract offset. */ if (tim < offset) { /* * User can specify @tim *absolute* value - if it's lesser than * the time namespace's offset - it's already expired. */ tim = 0; } else { tim = ktime_sub(tim, offset); if (unlikely(tim > KTIME_MAX)) tim = KTIME_MAX; } return tim; } static struct ucounts *inc_time_namespaces(struct user_namespace *ns) { return inc_ucount(ns, current_euid(), UCOUNT_TIME_NAMESPACES); } static void dec_time_namespaces(struct ucounts *ucounts) { dec_ucount(ucounts, UCOUNT_TIME_NAMESPACES); } /** * clone_time_ns - Clone a time namespace * @user_ns: User namespace which owns a new namespace.
* @old_ns: Namespace to clone * * Clone @old_ns and set the clone refcount to 1 * * Return: The new namespace or ERR_PTR. */ static struct time_namespace *clone_time_ns(struct user_namespace *user_ns, struct time_namespace *old_ns) { struct time_namespace *ns; struct ucounts *ucounts; int err; err = -ENOSPC; ucounts = inc_time_namespaces(user_ns); if (!ucounts) goto fail; err = -ENOMEM; ns = kzalloc(sizeof(*ns), GFP_KERNEL_ACCOUNT); if (!ns) goto fail_dec; ns->vvar_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO); if (!ns->vvar_page) goto fail_free; err = ns_common_init(ns); if (err) goto fail_free_page; ns->ucounts = ucounts; ns->user_ns = get_user_ns(user_ns); ns->offsets = old_ns->offsets; ns->frozen_offsets = false; ns_tree_add(ns); return ns; fail_free_page: __free_page(ns->vvar_page); fail_free: kfree(ns); fail_dec: dec_time_namespaces(ucounts); fail: return ERR_PTR(err); } /** * copy_time_ns - Create timens_for_children from @old_ns * @flags: Cloning flags * @user_ns: User namespace which owns a new namespace. * @old_ns: Namespace to clone * * If CLONE_NEWTIME specified in @flags, creates a new timens_for_children; * adds a refcounter to @old_ns otherwise. * * Return: timens_for_children namespace or ERR_PTR. */ struct time_namespace *copy_time_ns(u64 flags, struct user_namespace *user_ns, struct time_namespace *old_ns) { if (!(flags & CLONE_NEWTIME)) return get_time_ns(old_ns); return clone_time_ns(user_ns, old_ns); } static struct timens_offset offset_from_ts(struct timespec64 off) { struct timens_offset ret; ret.sec = off.tv_sec; ret.nsec = off.tv_nsec; return ret; } /* * A time namespace VVAR page has the same layout as the VVAR page which * contains the system wide VDSO data. * * For a normal task the VVAR pages are installed in the normal ordering: * VVAR * PVCLOCK * HVCLOCK * TIMENS <- Not really required * * Now for a timens task the pages are installed in the following order: * TIMENS * PVCLOCK * HVCLOCK * VVAR * * The check for vdso_clock->clock_mode is in the unlikely path of * the seq begin magic. So for the non-timens case most of the time * 'seq' is even, so the branch is not taken. * * If 'seq' is odd, i.e. a concurrent update is in progress, the extra check * for vdso_clock->clock_mode is a non-issue. The task is spin waiting for the * update to finish and for 'seq' to become even anyway. * * Timens page has vdso_clock->clock_mode set to VDSO_CLOCKMODE_TIMENS which * enforces the time namespace handling path. */ static void timens_setup_vdso_clock_data(struct vdso_clock *vc, struct time_namespace *ns) { struct timens_offset *offset = vc->offset; struct timens_offset monotonic = offset_from_ts(ns->offsets.monotonic); struct timens_offset boottime = offset_from_ts(ns->offsets.boottime); vc->seq = 1; vc->clock_mode = VDSO_CLOCKMODE_TIMENS; offset[CLOCK_MONOTONIC] = monotonic; offset[CLOCK_MONOTONIC_RAW] = monotonic; offset[CLOCK_MONOTONIC_COARSE] = monotonic; offset[CLOCK_BOOTTIME] = boottime; offset[CLOCK_BOOTTIME_ALARM] = boottime; } struct page *find_timens_vvar_page(struct vm_area_struct *vma) { if (likely(vma->vm_mm == current->mm)) return current->nsproxy->time_ns->vvar_page; /* * VM_PFNMAP | VM_IO protect .fault() handler from being called * through interfaces like /proc/$pid/mem or * process_vm_{readv,writev}() as long as there's no .access() * in special_mapping_vmops(). 
* For more details check_vma_flags() and __access_remote_vm() */ WARN(1, "vvar_page accessed remotely"); return NULL; } /* * Protects possibly multiple offsets writers racing each other * and tasks entering the namespace. */ static DEFINE_MUTEX(offset_lock); static void timens_set_vvar_page(struct task_struct *task, struct time_namespace *ns) { struct vdso_time_data *vdata; struct vdso_clock *vc; unsigned int i; if (ns == &init_time_ns) return; /* Fast-path, taken by every task in namespace except the first. */ if (likely(ns->frozen_offsets)) return; mutex_lock(&offset_lock); /* Nothing to-do: vvar_page has been already initialized. */ if (ns->frozen_offsets) goto out; ns->frozen_offsets = true; vdata = page_address(ns->vvar_page); vc = vdata->clock_data; for (i = 0; i < CS_BASES; i++) timens_setup_vdso_clock_data(&vc[i], ns); if (IS_ENABLED(CONFIG_POSIX_AUX_CLOCKS)) { for (i = 0; i < ARRAY_SIZE(vdata->aux_clock_data); i++) timens_setup_vdso_clock_data(&vdata->aux_clock_data[i], ns); } out: mutex_unlock(&offset_lock); } void free_time_ns(struct time_namespace *ns) { ns_tree_remove(ns); dec_time_namespaces(ns->ucounts); put_user_ns(ns->user_ns); ns_common_free(ns); __free_page(ns->vvar_page); /* Concurrent nstree traversal depends on a grace period. */ kfree_rcu(ns, ns.ns_rcu); } static struct ns_common *timens_get(struct task_struct *task) { struct time_namespace *ns = NULL; struct nsproxy *nsproxy; task_lock(task); nsproxy = task->nsproxy; if (nsproxy) { ns = nsproxy->time_ns; get_time_ns(ns); } task_unlock(task); return ns ? &ns->ns : NULL; } static struct ns_common *timens_for_children_get(struct task_struct *task) { struct time_namespace *ns = NULL; struct nsproxy *nsproxy; task_lock(task); nsproxy = task->nsproxy; if (nsproxy) { ns = nsproxy->time_ns_for_children; get_time_ns(ns); } task_unlock(task); return ns ? 
&ns->ns : NULL; } static void timens_put(struct ns_common *ns) { put_time_ns(to_time_ns(ns)); } void timens_commit(struct task_struct *tsk, struct time_namespace *ns) { timens_set_vvar_page(tsk, ns); vdso_join_timens(tsk, ns); } static int timens_install(struct nsset *nsset, struct ns_common *new) { struct nsproxy *nsproxy = nsset->nsproxy; struct time_namespace *ns = to_time_ns(new); if (!current_is_single_threaded()) return -EUSERS; if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN) || !ns_capable(nsset->cred->user_ns, CAP_SYS_ADMIN)) return -EPERM; get_time_ns(ns); put_time_ns(nsproxy->time_ns); nsproxy->time_ns = ns; get_time_ns(ns); put_time_ns(nsproxy->time_ns_for_children); nsproxy->time_ns_for_children = ns; return 0; } void timens_on_fork(struct nsproxy *nsproxy, struct task_struct *tsk) { struct ns_common *nsc = &nsproxy->time_ns_for_children->ns; struct time_namespace *ns = to_time_ns(nsc); /* create_new_namespaces() already incremented the ref counter */ if (nsproxy->time_ns == nsproxy->time_ns_for_children) return; get_time_ns(ns); put_time_ns(nsproxy->time_ns); nsproxy->time_ns = ns; timens_commit(tsk, ns); } static struct user_namespace *timens_owner(struct ns_common *ns) { return to_time_ns(ns)->user_ns; } static void show_offset(struct seq_file *m, int clockid, struct timespec64 *ts) { char *clock; switch (clockid) { case CLOCK_BOOTTIME: clock = "boottime"; break; case CLOCK_MONOTONIC: clock = "monotonic"; break; default: clock = "unknown"; break; } seq_printf(m, "%-10s %10lld %9ld\n", clock, ts->tv_sec, ts->tv_nsec); } void proc_timens_show_offsets(struct task_struct *p, struct seq_file *m) { struct ns_common *ns; struct time_namespace *time_ns; ns = timens_for_children_get(p); if (!ns) return; time_ns = to_time_ns(ns); show_offset(m, CLOCK_MONOTONIC, &time_ns->offsets.monotonic); show_offset(m, CLOCK_BOOTTIME, &time_ns->offsets.boottime); put_time_ns(time_ns); } int proc_timens_set_offset(struct file *file, struct task_struct *p, struct proc_timens_offset *offsets, int noffsets) { struct ns_common *ns; struct time_namespace *time_ns; struct timespec64 tp; int i, err; ns = timens_for_children_get(p); if (!ns) return -ESRCH; time_ns = to_time_ns(ns); if (!file_ns_capable(file, time_ns->user_ns, CAP_SYS_TIME)) { put_time_ns(time_ns); return -EPERM; } for (i = 0; i < noffsets; i++) { struct proc_timens_offset *off = &offsets[i]; switch (off->clockid) { case CLOCK_MONOTONIC: ktime_get_ts64(&tp); break; case CLOCK_BOOTTIME: ktime_get_boottime_ts64(&tp); break; default: err = -EINVAL; goto out; } err = -ERANGE; if (off->val.tv_sec > KTIME_SEC_MAX || off->val.tv_sec < -KTIME_SEC_MAX) goto out; tp = timespec64_add(tp, off->val); /* * KTIME_SEC_MAX is divided by 2 to be sure that KTIME_MAX is * still unreachable. 
*/ if (tp.tv_sec < 0 || tp.tv_sec > KTIME_SEC_MAX / 2) goto out; } mutex_lock(&offset_lock); if (time_ns->frozen_offsets) { err = -EACCES; goto out_unlock; } err = 0; /* Don't report errors after this line */ for (i = 0; i < noffsets; i++) { struct proc_timens_offset *off = &offsets[i]; struct timespec64 *offset = NULL; switch (off->clockid) { case CLOCK_MONOTONIC: offset = &time_ns->offsets.monotonic; break; case CLOCK_BOOTTIME: offset = &time_ns->offsets.boottime; break; } *offset = off->val; } out_unlock: mutex_unlock(&offset_lock); out: put_time_ns(time_ns); return err; } const struct proc_ns_operations timens_operations = { .name = "time", .get = timens_get, .put = timens_put, .install = timens_install, .owner = timens_owner, }; const struct proc_ns_operations timens_for_children_operations = { .name = "time_for_children", .real_ns_name = "time", .get = timens_for_children_get, .put = timens_put, .install = timens_install, .owner = timens_owner, }; struct time_namespace init_time_ns = { .ns = NS_COMMON_INIT(init_time_ns), .user_ns = &init_user_ns, .frozen_offsets = true, }; void __init time_ns_init(void) { ns_tree_add(&init_time_ns); }
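For context, the offsets handled by proc_timens_set_offset() above are normally written from userspace to /proc/<pid>/timens_offsets after unshare(CLONE_NEWTIME) and before the first child enters the namespace (once a task enters, frozen_offsets makes further writes fail). Below is a minimal userspace sketch with error handling reduced to the essentials; the CLONE_NEWTIME fallback define is only there in case the libc headers predate the flag.

#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <sys/wait.h>

#ifndef CLONE_NEWTIME
#define CLONE_NEWTIME 0x00000080	/* value from <linux/sched.h> */
#endif

int main(void)
{
	FILE *f;

	/* New time namespace for children; setting offsets needs CAP_SYS_TIME. */
	if (unshare(CLONE_NEWTIME))
		exit(1);

	f = fopen("/proc/self/timens_offsets", "w");
	if (!f)
		exit(1);
	/* Format: "<clock> <seconds> <nanoseconds>"; shift CLOCK_BOOTTIME ~10 years. */
	fprintf(f, "boottime 315360000 0\n");
	fclose(f);

	if (fork() == 0) {
		/* The child runs inside the new namespace: its uptime is shifted. */
		execlp("cat", "cat", "/proc/uptime", (char *)NULL);
		_exit(1);
	}
	wait(NULL);
	return 0;
}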
/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_TIMEKEEPING_H #define _LINUX_TIMEKEEPING_H #include <linux/errno.h> #include <linux/clocksource_ids.h> #include <linux/ktime.h> /* Included from linux/ktime.h */ void timekeeping_init(void); extern int timekeeping_suspended; /* Architecture timer tick functions: */ extern void legacy_timer_tick(unsigned long ticks); /* * Get and set timeofday */ extern int do_settimeofday64(const struct timespec64 *ts); extern int do_sys_settimeofday64(const struct timespec64 *tv, const struct timezone *tz); /* * ktime_get() family - read the current time in a multitude of ways. * * The default time reference is CLOCK_MONOTONIC, starting at * boot time but not counting the time spent in suspend. * For other references, use the functions with "real", "clocktai", * "boottime" and "raw" suffixes. * * To get the time in a different format, use the ones with * "ns", "ts64" and "seconds" suffix. * * See Documentation/core-api/timekeeping.rst for more details.
*/ /* * timespec64 based interfaces */ extern void ktime_get_raw_ts64(struct timespec64 *ts); extern void ktime_get_ts64(struct timespec64 *ts); extern void ktime_get_real_ts64(struct timespec64 *tv); extern void ktime_get_coarse_ts64(struct timespec64 *ts); extern void ktime_get_coarse_real_ts64(struct timespec64 *ts); extern void ktime_get_clock_ts64(clockid_t id, struct timespec64 *ts); /* Multigrain timestamp interfaces */ extern void ktime_get_coarse_real_ts64_mg(struct timespec64 *ts); extern void ktime_get_real_ts64_mg(struct timespec64 *ts); extern unsigned long timekeeping_get_mg_floor_swaps(void); void getboottime64(struct timespec64 *ts); /* * time64_t base interfaces */ extern time64_t ktime_get_seconds(void); extern time64_t __ktime_get_real_seconds(void); extern time64_t ktime_get_real_seconds(void); /* * ktime_t based interfaces */ enum tk_offsets { TK_OFFS_REAL, TK_OFFS_BOOT, TK_OFFS_TAI, TK_OFFS_MAX, }; extern ktime_t ktime_get(void); extern ktime_t ktime_get_with_offset(enum tk_offsets offs); extern ktime_t ktime_get_coarse_with_offset(enum tk_offsets offs); extern ktime_t ktime_mono_to_any(ktime_t tmono, enum tk_offsets offs); extern ktime_t ktime_get_raw(void); extern u32 ktime_get_resolution_ns(void); /** * ktime_get_real - get the real (wall-) time in ktime_t format * * Returns: real (wall) time in ktime_t format */ static inline ktime_t ktime_get_real(void) { return ktime_get_with_offset(TK_OFFS_REAL); } static inline ktime_t ktime_get_coarse_real(void) { return ktime_get_coarse_with_offset(TK_OFFS_REAL); } /** * ktime_get_boottime - Get monotonic time since boot in ktime_t format * * This is similar to CLOCK_MONTONIC/ktime_get, but also includes the * time spent in suspend. * * Returns: monotonic time since boot in ktime_t format */ static inline ktime_t ktime_get_boottime(void) { return ktime_get_with_offset(TK_OFFS_BOOT); } static inline ktime_t ktime_get_coarse_boottime(void) { return ktime_get_coarse_with_offset(TK_OFFS_BOOT); } /** * ktime_get_clocktai - Get the TAI time of day in ktime_t format * * Returns: the TAI time of day in ktime_t format */ static inline ktime_t ktime_get_clocktai(void) { return ktime_get_with_offset(TK_OFFS_TAI); } static inline ktime_t ktime_get_coarse_clocktai(void) { return ktime_get_coarse_with_offset(TK_OFFS_TAI); } static inline ktime_t ktime_get_coarse(void) { struct timespec64 ts; ktime_get_coarse_ts64(&ts); return timespec64_to_ktime(ts); } static inline u64 ktime_get_coarse_ns(void) { return ktime_to_ns(ktime_get_coarse()); } static inline u64 ktime_get_coarse_real_ns(void) { return ktime_to_ns(ktime_get_coarse_real()); } static inline u64 ktime_get_coarse_boottime_ns(void) { return ktime_to_ns(ktime_get_coarse_boottime()); } static inline u64 ktime_get_coarse_clocktai_ns(void) { return ktime_to_ns(ktime_get_coarse_clocktai()); } /** * ktime_mono_to_real - Convert monotonic time to clock realtime * @mono: monotonic time to convert * * Returns: time converted to realtime clock */ static inline ktime_t ktime_mono_to_real(ktime_t mono) { return ktime_mono_to_any(mono, TK_OFFS_REAL); } /** * ktime_get_ns - Get the current time in nanoseconds * * Returns: current time converted to nanoseconds */ static inline u64 ktime_get_ns(void) { return ktime_to_ns(ktime_get()); } /** * ktime_get_real_ns - Get the current real/wall time in nanoseconds * * Returns: current real time converted to nanoseconds */ static inline u64 ktime_get_real_ns(void) { return ktime_to_ns(ktime_get_real()); } /** * ktime_get_boottime_ns - Get the monotonic time 
since boot in nanoseconds * * Returns: current boottime converted to nanoseconds */ static inline u64 ktime_get_boottime_ns(void) { return ktime_to_ns(ktime_get_boottime()); } /** * ktime_get_clocktai_ns - Get the current TAI time of day in nanoseconds * * Returns: current TAI time converted to nanoseconds */ static inline u64 ktime_get_clocktai_ns(void) { return ktime_to_ns(ktime_get_clocktai()); } /** * ktime_get_raw_ns - Get the raw monotonic time in nanoseconds * * Returns: current raw monotonic time converted to nanoseconds */ static inline u64 ktime_get_raw_ns(void) { return ktime_to_ns(ktime_get_raw()); } extern u64 ktime_get_mono_fast_ns(void); extern u64 ktime_get_raw_fast_ns(void); extern u64 ktime_get_boot_fast_ns(void); extern u64 ktime_get_tai_fast_ns(void); extern u64 ktime_get_real_fast_ns(void); /* * timespec64/time64_t interfaces utilizing the ktime based ones * for API completeness, these could be implemented more efficiently * if needed. */ static inline void ktime_get_boottime_ts64(struct timespec64 *ts) { *ts = ktime_to_timespec64(ktime_get_boottime()); } static inline void ktime_get_coarse_boottime_ts64(struct timespec64 *ts) { *ts = ktime_to_timespec64(ktime_get_coarse_boottime()); } static inline time64_t ktime_get_boottime_seconds(void) { return ktime_divns(ktime_get_coarse_boottime(), NSEC_PER_SEC); } static inline void ktime_get_clocktai_ts64(struct timespec64 *ts) { *ts = ktime_to_timespec64(ktime_get_clocktai()); } static inline void ktime_get_coarse_clocktai_ts64(struct timespec64 *ts) { *ts = ktime_to_timespec64(ktime_get_coarse_clocktai()); } static inline time64_t ktime_get_clocktai_seconds(void) { return ktime_divns(ktime_get_coarse_clocktai(), NSEC_PER_SEC); } /* * RTC specific */ extern bool timekeeping_rtc_skipsuspend(void); extern bool timekeeping_rtc_skipresume(void); extern void timekeeping_inject_sleeptime64(const struct timespec64 *delta); /* * Auxiliary clock interfaces */ #ifdef CONFIG_POSIX_AUX_CLOCKS extern bool ktime_get_aux(clockid_t id, ktime_t *kt); extern bool ktime_get_aux_ts64(clockid_t id, struct timespec64 *kt); #else static inline bool ktime_get_aux(clockid_t id, ktime_t *kt) { return false; } static inline bool ktime_get_aux_ts64(clockid_t id, struct timespec64 *kt) { return false; } #endif /** * struct system_time_snapshot - simultaneous raw/real time capture with * counter value * @cycles: Clocksource counter value to produce the system times * @real: Realtime system time * @boot: Boot time * @raw: Monotonic raw system time * @cs_id: Clocksource ID * @clock_was_set_seq: The sequence number of clock-was-set events * @cs_was_changed_seq: The sequence number of clocksource change events */ struct system_time_snapshot { u64 cycles; ktime_t real; ktime_t boot; ktime_t raw; enum clocksource_ids cs_id; unsigned int clock_was_set_seq; u8 cs_was_changed_seq; }; /** * struct system_device_crosststamp - system/device cross-timestamp * (synchronized capture) * @device: Device time * @sys_realtime: Realtime simultaneous with device time * @sys_monoraw: Monotonic raw simultaneous with device time */ struct system_device_crosststamp { ktime_t device; ktime_t sys_realtime; ktime_t sys_monoraw; }; /** * struct system_counterval_t - system counter value with the ID of the * corresponding clocksource * @cycles: System counter value * @cs_id: Clocksource ID corresponding to system counter value. Used by * timekeeping code to verify comparability of two cycle values. * The default ID, CSID_GENERIC, does not identify a specific * clocksource. 
* @use_nsecs: @cycles is in nanoseconds. */ struct system_counterval_t { u64 cycles; enum clocksource_ids cs_id; bool use_nsecs; }; extern bool ktime_real_to_base_clock(ktime_t treal, enum clocksource_ids base_id, u64 *cycles); extern bool timekeeping_clocksource_has_base(enum clocksource_ids id); /* * Get cross timestamp between system clock and device clock */ extern int get_device_system_crosststamp( int (*get_time_fn)(ktime_t *device_time, struct system_counterval_t *system_counterval, void *ctx), void *ctx, struct system_time_snapshot *history, struct system_device_crosststamp *xtstamp); /* * Simultaneously snapshot realtime and monotonic raw clocks */ extern void ktime_get_snapshot(struct system_time_snapshot *systime_snapshot); /* * Persistent clock related interfaces */ extern int persistent_clock_is_local; extern void read_persistent_clock64(struct timespec64 *ts); void read_persistent_wall_and_boot_offset(struct timespec64 *wall_clock, struct timespec64 *boot_offset); #ifdef CONFIG_GENERIC_CMOS_UPDATE extern int update_persistent_clock64(struct timespec64 now); #endif #endif
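The kerneldoc above maps out the ktime_get() family; the short sketch below shows the most common call patterns (monotonic duration measurement, wall clock as timespec64, boot time including suspend). example_measure() is a hypothetical function, not part of this header.

#include <linux/printk.h>

/* Illustrative sketch: common ktime_get() family usage. */
static void example_measure(void)
{
	struct timespec64 wall;
	ktime_t start, delta;

	start = ktime_get();			/* CLOCK_MONOTONIC, excludes suspend */
	/* ... the work being timed ... */
	delta = ktime_sub(ktime_get(), start);
	pr_info("took %lld ns\n", ktime_to_ns(delta));

	ktime_get_real_ts64(&wall);		/* CLOCK_REALTIME as timespec64 */
	pr_info("wall: %lld.%09ld\n", (long long)wall.tv_sec, wall.tv_nsec);

	pr_info("since boot incl. suspend: %lld ns\n", ktime_get_boottime_ns());
}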
// SPDX-License-Identifier: GPL-2.0-only
/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
 * Copyright (c) 2016,2017 Facebook
 */
#include <linux/bpf.h>
#include <linux/btf.h>
#include <linux/err.h>
#include <linux/slab.h>
#include <linux/mm.h>
#include <linux/filter.h>
#include <linux/perf_event.h>
#include <uapi/linux/btf.h>
#include <linux/rcupdate_trace.h>
#include <linux/btf_ids.h>
#include <crypto/sha2.h>

#include "map_in_map.h"

#define ARRAY_CREATE_FLAG_MASK \
	(BPF_F_NUMA_NODE | BPF_F_MMAPABLE | BPF_F_ACCESS_MASK | \
	 BPF_F_PRESERVE_ELEMS |
BPF_F_INNER_MAP) static void bpf_array_free_percpu(struct bpf_array *array) { int i; for (i = 0; i < array->map.max_entries; i++) { free_percpu(array->pptrs[i]); cond_resched(); } } static int bpf_array_alloc_percpu(struct bpf_array *array) { void __percpu *ptr; int i; for (i = 0; i < array->map.max_entries; i++) { ptr = bpf_map_alloc_percpu(&array->map, array->elem_size, 8, GFP_USER | __GFP_NOWARN); if (!ptr) { bpf_array_free_percpu(array); return -ENOMEM; } array->pptrs[i] = ptr; cond_resched(); } return 0; } /* Called from syscall */ int array_map_alloc_check(union bpf_attr *attr) { bool percpu = attr->map_type == BPF_MAP_TYPE_PERCPU_ARRAY; int numa_node = bpf_map_attr_numa_node(attr); /* check sanity of attributes */ if (attr->max_entries == 0 || attr->key_size != 4 || attr->value_size == 0 || attr->map_flags & ~ARRAY_CREATE_FLAG_MASK || !bpf_map_flags_access_ok(attr->map_flags) || (percpu && numa_node != NUMA_NO_NODE)) return -EINVAL; if (attr->map_type != BPF_MAP_TYPE_ARRAY && attr->map_flags & (BPF_F_MMAPABLE | BPF_F_INNER_MAP)) return -EINVAL; if (attr->map_type != BPF_MAP_TYPE_PERF_EVENT_ARRAY && attr->map_flags & BPF_F_PRESERVE_ELEMS) return -EINVAL; /* avoid overflow on round_up(map->value_size) */ if (attr->value_size > INT_MAX) return -E2BIG; /* percpu map value size is bound by PCPU_MIN_UNIT_SIZE */ if (percpu && round_up(attr->value_size, 8) > PCPU_MIN_UNIT_SIZE) return -E2BIG; return 0; } static struct bpf_map *array_map_alloc(union bpf_attr *attr) { bool percpu = attr->map_type == BPF_MAP_TYPE_PERCPU_ARRAY; int numa_node = bpf_map_attr_numa_node(attr); u32 elem_size, index_mask, max_entries; bool bypass_spec_v1 = bpf_bypass_spec_v1(NULL); u64 array_size, mask64; struct bpf_array *array; elem_size = round_up(attr->value_size, 8); max_entries = attr->max_entries; /* On 32 bit archs roundup_pow_of_two() with max_entries that has * upper most bit set in u32 space is undefined behavior due to * resulting 1U << 32, so do it manually here in u64 space. */ mask64 = fls_long(max_entries - 1); mask64 = 1ULL << mask64; mask64 -= 1; index_mask = mask64; if (!bypass_spec_v1) { /* round up array size to nearest power of 2, * since cpu will speculate within index_mask limits */ max_entries = index_mask + 1; /* Check for overflows. 
*/ if (max_entries < attr->max_entries) return ERR_PTR(-E2BIG); } array_size = sizeof(*array); if (percpu) { array_size += (u64) max_entries * sizeof(void *); } else { /* rely on vmalloc() to return page-aligned memory and * ensure array->value is exactly page-aligned */ if (attr->map_flags & BPF_F_MMAPABLE) { array_size = PAGE_ALIGN(array_size); array_size += PAGE_ALIGN((u64) max_entries * elem_size); } else { array_size += (u64) max_entries * elem_size; } } /* allocate all map elements and zero-initialize them */ if (attr->map_flags & BPF_F_MMAPABLE) { void *data; /* kmalloc'ed memory can't be mmap'ed, use explicit vmalloc */ data = bpf_map_area_mmapable_alloc(array_size, numa_node); if (!data) return ERR_PTR(-ENOMEM); array = data + PAGE_ALIGN(sizeof(struct bpf_array)) - offsetof(struct bpf_array, value); } else { array = bpf_map_area_alloc(array_size, numa_node); } if (!array) return ERR_PTR(-ENOMEM); array->index_mask = index_mask; array->map.bypass_spec_v1 = bypass_spec_v1; /* copy mandatory map attributes */ bpf_map_init_from_attr(&array->map, attr); array->elem_size = elem_size; if (percpu && bpf_array_alloc_percpu(array)) { bpf_map_area_free(array); return ERR_PTR(-ENOMEM); } return &array->map; } static void *array_map_elem_ptr(struct bpf_array* array, u32 index) { return array->value + (u64)array->elem_size * index; } /* Called from syscall or from eBPF program */ static void *array_map_lookup_elem(struct bpf_map *map, void *key) { struct bpf_array *array = container_of(map, struct bpf_array, map); u32 index = *(u32 *)key; if (unlikely(index >= array->map.max_entries)) return NULL; return array->value + (u64)array->elem_size * (index & array->index_mask); } static int array_map_get_hash(struct bpf_map *map, u32 hash_buf_size, void *hash_buf) { struct bpf_array *array = container_of(map, struct bpf_array, map); sha256(array->value, (u64)array->elem_size * array->map.max_entries, hash_buf); memcpy(array->map.sha, hash_buf, sizeof(array->map.sha)); return 0; } static int array_map_direct_value_addr(const struct bpf_map *map, u64 *imm, u32 off) { struct bpf_array *array = container_of(map, struct bpf_array, map); if (map->max_entries != 1) return -ENOTSUPP; if (off >= map->value_size) return -EINVAL; *imm = (unsigned long)array->value; return 0; } static int array_map_direct_value_meta(const struct bpf_map *map, u64 imm, u32 *off) { struct bpf_array *array = container_of(map, struct bpf_array, map); u64 base = (unsigned long)array->value; u64 range = array->elem_size; if (map->max_entries != 1) return -ENOTSUPP; if (imm < base || imm >= base + range) return -ENOENT; *off = imm - base; return 0; } /* emit BPF instructions equivalent to C code of array_map_lookup_elem() */ static int array_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf) { struct bpf_array *array = container_of(map, struct bpf_array, map); struct bpf_insn *insn = insn_buf; u32 elem_size = array->elem_size; const int ret = BPF_REG_0; const int map_ptr = BPF_REG_1; const int index = BPF_REG_2; if (map->map_flags & BPF_F_INNER_MAP) return -EOPNOTSUPP; *insn++ = BPF_ALU64_IMM(BPF_ADD, map_ptr, offsetof(struct bpf_array, value)); *insn++ = BPF_LDX_MEM(BPF_W, ret, index, 0); if (!map->bypass_spec_v1) { *insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 4); *insn++ = BPF_ALU32_IMM(BPF_AND, ret, array->index_mask); } else { *insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 3); } if (is_power_of_2(elem_size)) { *insn++ = BPF_ALU64_IMM(BPF_LSH, ret, ilog2(elem_size)); } else { *insn++ = 
BPF_ALU64_IMM(BPF_MUL, ret, elem_size); } *insn++ = BPF_ALU64_REG(BPF_ADD, ret, map_ptr); *insn++ = BPF_JMP_IMM(BPF_JA, 0, 0, 1); *insn++ = BPF_MOV64_IMM(ret, 0); return insn - insn_buf; } /* Called from eBPF program */ static void *percpu_array_map_lookup_elem(struct bpf_map *map, void *key) { struct bpf_array *array = container_of(map, struct bpf_array, map); u32 index = *(u32 *)key; if (unlikely(index >= array->map.max_entries)) return NULL; return this_cpu_ptr(array->pptrs[index & array->index_mask]); } /* emit BPF instructions equivalent to C code of percpu_array_map_lookup_elem() */ static int percpu_array_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf) { struct bpf_array *array = container_of(map, struct bpf_array, map); struct bpf_insn *insn = insn_buf; if (!bpf_jit_supports_percpu_insn()) return -EOPNOTSUPP; if (map->map_flags & BPF_F_INNER_MAP) return -EOPNOTSUPP; BUILD_BUG_ON(offsetof(struct bpf_array, map) != 0); *insn++ = BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, offsetof(struct bpf_array, pptrs)); *insn++ = BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_2, 0); if (!map->bypass_spec_v1) { *insn++ = BPF_JMP_IMM(BPF_JGE, BPF_REG_0, map->max_entries, 6); *insn++ = BPF_ALU32_IMM(BPF_AND, BPF_REG_0, array->index_mask); } else { *insn++ = BPF_JMP_IMM(BPF_JGE, BPF_REG_0, map->max_entries, 5); } *insn++ = BPF_ALU64_IMM(BPF_LSH, BPF_REG_0, 3); *insn++ = BPF_ALU64_REG(BPF_ADD, BPF_REG_0, BPF_REG_1); *insn++ = BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_0, 0); *insn++ = BPF_MOV64_PERCPU_REG(BPF_REG_0, BPF_REG_0); *insn++ = BPF_JMP_IMM(BPF_JA, 0, 0, 1); *insn++ = BPF_MOV64_IMM(BPF_REG_0, 0); return insn - insn_buf; } static void *percpu_array_map_lookup_percpu_elem(struct bpf_map *map, void *key, u32 cpu) { struct bpf_array *array = container_of(map, struct bpf_array, map); u32 index = *(u32 *)key; if (cpu >= nr_cpu_ids) return NULL; if (unlikely(index >= array->map.max_entries)) return NULL; return per_cpu_ptr(array->pptrs[index & array->index_mask], cpu); } int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value) { struct bpf_array *array = container_of(map, struct bpf_array, map); u32 index = *(u32 *)key; void __percpu *pptr; int cpu, off = 0; u32 size; if (unlikely(index >= array->map.max_entries)) return -ENOENT; /* per_cpu areas are zero-filled and bpf programs can only * access 'value_size' of them, so copying rounded areas * will not leak any kernel data */ size = array->elem_size; rcu_read_lock(); pptr = array->pptrs[index & array->index_mask]; for_each_possible_cpu(cpu) { copy_map_value_long(map, value + off, per_cpu_ptr(pptr, cpu)); check_and_init_map_value(map, value + off); off += size; } rcu_read_unlock(); return 0; } /* Called from syscall */ int bpf_array_get_next_key(struct bpf_map *map, void *key, void *next_key) { u32 index = key ? 
*(u32 *)key : U32_MAX; u32 *next = (u32 *)next_key; if (index >= map->max_entries) { *next = 0; return 0; } if (index == map->max_entries - 1) return -ENOENT; *next = index + 1; return 0; } /* Called from syscall or from eBPF program */ static long array_map_update_elem(struct bpf_map *map, void *key, void *value, u64 map_flags) { struct bpf_array *array = container_of(map, struct bpf_array, map); u32 index = *(u32 *)key; char *val; if (unlikely((map_flags & ~BPF_F_LOCK) > BPF_EXIST)) /* unknown flags */ return -EINVAL; if (unlikely(index >= array->map.max_entries)) /* all elements were pre-allocated, cannot insert a new one */ return -E2BIG; if (unlikely(map_flags & BPF_NOEXIST)) /* all elements already exist */ return -EEXIST; if (unlikely((map_flags & BPF_F_LOCK) && !btf_record_has_field(map->record, BPF_SPIN_LOCK))) return -EINVAL; if (array->map.map_type == BPF_MAP_TYPE_PERCPU_ARRAY) { val = this_cpu_ptr(array->pptrs[index & array->index_mask]); copy_map_value(map, val, value); bpf_obj_free_fields(array->map.record, val); } else { val = array->value + (u64)array->elem_size * (index & array->index_mask); if (map_flags & BPF_F_LOCK) copy_map_value_locked(map, val, value, false); else copy_map_value(map, val, value); bpf_obj_free_fields(array->map.record, val); } return 0; } int bpf_percpu_array_update(struct bpf_map *map, void *key, void *value, u64 map_flags) { struct bpf_array *array = container_of(map, struct bpf_array, map); u32 index = *(u32 *)key; void __percpu *pptr; int cpu, off = 0; u32 size; if (unlikely(map_flags > BPF_EXIST)) /* unknown flags */ return -EINVAL; if (unlikely(index >= array->map.max_entries)) /* all elements were pre-allocated, cannot insert a new one */ return -E2BIG; if (unlikely(map_flags == BPF_NOEXIST)) /* all elements already exist */ return -EEXIST; /* the user space will provide round_up(value_size, 8) bytes that * will be copied into per-cpu area. bpf programs can only access * value_size of it. 
During lookup the same extra bytes will be * returned or zeros which were zero-filled by percpu_alloc, * so no kernel data leaks possible */ size = array->elem_size; rcu_read_lock(); pptr = array->pptrs[index & array->index_mask]; for_each_possible_cpu(cpu) { copy_map_value_long(map, per_cpu_ptr(pptr, cpu), value + off); bpf_obj_free_fields(array->map.record, per_cpu_ptr(pptr, cpu)); off += size; } rcu_read_unlock(); return 0; } /* Called from syscall or from eBPF program */ static long array_map_delete_elem(struct bpf_map *map, void *key) { return -EINVAL; } static void *array_map_vmalloc_addr(struct bpf_array *array) { return (void *)round_down((unsigned long)array, PAGE_SIZE); } static void array_map_free_internal_structs(struct bpf_map *map) { struct bpf_array *array = container_of(map, struct bpf_array, map); int i; /* We only free internal structs on uref dropping to zero */ if (!bpf_map_has_internal_structs(map)) return; for (i = 0; i < array->map.max_entries; i++) bpf_map_free_internal_structs(map, array_map_elem_ptr(array, i)); } /* Called when map->refcnt goes to zero, either from workqueue or from syscall */ static void array_map_free(struct bpf_map *map) { struct bpf_array *array = container_of(map, struct bpf_array, map); int i; if (!IS_ERR_OR_NULL(map->record)) { if (array->map.map_type == BPF_MAP_TYPE_PERCPU_ARRAY) { for (i = 0; i < array->map.max_entries; i++) { void __percpu *pptr = array->pptrs[i & array->index_mask]; int cpu; for_each_possible_cpu(cpu) { bpf_obj_free_fields(map->record, per_cpu_ptr(pptr, cpu)); cond_resched(); } } } else { for (i = 0; i < array->map.max_entries; i++) bpf_obj_free_fields(map->record, array_map_elem_ptr(array, i)); } } if (array->map.map_type == BPF_MAP_TYPE_PERCPU_ARRAY) bpf_array_free_percpu(array); if (array->map.map_flags & BPF_F_MMAPABLE) bpf_map_area_free(array_map_vmalloc_addr(array)); else bpf_map_area_free(array); } static void array_map_seq_show_elem(struct bpf_map *map, void *key, struct seq_file *m) { void *value; rcu_read_lock(); value = array_map_lookup_elem(map, key); if (!value) { rcu_read_unlock(); return; } if (map->btf_key_type_id) seq_printf(m, "%u: ", *(u32 *)key); btf_type_seq_show(map->btf, map->btf_value_type_id, value, m); seq_putc(m, '\n'); rcu_read_unlock(); } static void percpu_array_map_seq_show_elem(struct bpf_map *map, void *key, struct seq_file *m) { struct bpf_array *array = container_of(map, struct bpf_array, map); u32 index = *(u32 *)key; void __percpu *pptr; int cpu; rcu_read_lock(); seq_printf(m, "%u: {\n", *(u32 *)key); pptr = array->pptrs[index & array->index_mask]; for_each_possible_cpu(cpu) { seq_printf(m, "\tcpu%d: ", cpu); btf_type_seq_show(map->btf, map->btf_value_type_id, per_cpu_ptr(pptr, cpu), m); seq_putc(m, '\n'); } seq_puts(m, "}\n"); rcu_read_unlock(); } static int array_map_check_btf(const struct bpf_map *map, const struct btf *btf, const struct btf_type *key_type, const struct btf_type *value_type) { /* One exception for keyless BTF: .bss/.data/.rodata map */ if (btf_type_is_void(key_type)) { if (map->map_type != BPF_MAP_TYPE_ARRAY || map->max_entries != 1) return -EINVAL; if (BTF_INFO_KIND(value_type->info) != BTF_KIND_DATASEC) return -EINVAL; return 0; } /* * Bpf array can only take a u32 key. This check makes sure * that the btf matches the attr used during map_create. 
*/ if (!btf_type_is_i32(key_type)) return -EINVAL; return 0; } static int array_map_mmap(struct bpf_map *map, struct vm_area_struct *vma) { struct bpf_array *array = container_of(map, struct bpf_array, map); pgoff_t pgoff = PAGE_ALIGN(sizeof(*array)) >> PAGE_SHIFT; if (!(map->map_flags & BPF_F_MMAPABLE)) return -EINVAL; if (vma->vm_pgoff * PAGE_SIZE + (vma->vm_end - vma->vm_start) > PAGE_ALIGN((u64)array->map.max_entries * array->elem_size)) return -EINVAL; return remap_vmalloc_range(vma, array_map_vmalloc_addr(array), vma->vm_pgoff + pgoff); } static bool array_map_meta_equal(const struct bpf_map *meta0, const struct bpf_map *meta1) { if (!bpf_map_meta_equal(meta0, meta1)) return false; return meta0->map_flags & BPF_F_INNER_MAP ? true : meta0->max_entries == meta1->max_entries; } struct bpf_iter_seq_array_map_info { struct bpf_map *map; void *percpu_value_buf; u32 index; }; static void *bpf_array_map_seq_start(struct seq_file *seq, loff_t *pos) { struct bpf_iter_seq_array_map_info *info = seq->private; struct bpf_map *map = info->map; struct bpf_array *array; u32 index; if (info->index >= map->max_entries) return NULL; if (*pos == 0) ++*pos; array = container_of(map, struct bpf_array, map); index = info->index & array->index_mask; if (info->percpu_value_buf) return (void *)(uintptr_t)array->pptrs[index]; return array_map_elem_ptr(array, index); } static void *bpf_array_map_seq_next(struct seq_file *seq, void *v, loff_t *pos) { struct bpf_iter_seq_array_map_info *info = seq->private; struct bpf_map *map = info->map; struct bpf_array *array; u32 index; ++*pos; ++info->index; if (info->index >= map->max_entries) return NULL; array = container_of(map, struct bpf_array, map); index = info->index & array->index_mask; if (info->percpu_value_buf) return (void *)(uintptr_t)array->pptrs[index]; return array_map_elem_ptr(array, index); } static int __bpf_array_map_seq_show(struct seq_file *seq, void *v) { struct bpf_iter_seq_array_map_info *info = seq->private; struct bpf_iter__bpf_map_elem ctx = {}; struct bpf_map *map = info->map; struct bpf_array *array = container_of(map, struct bpf_array, map); struct bpf_iter_meta meta; struct bpf_prog *prog; int off = 0, cpu = 0; void __percpu *pptr; u32 size; meta.seq = seq; prog = bpf_iter_get_info(&meta, v == NULL); if (!prog) return 0; ctx.meta = &meta; ctx.map = info->map; if (v) { ctx.key = &info->index; if (!info->percpu_value_buf) { ctx.value = v; } else { pptr = (void __percpu *)(uintptr_t)v; size = array->elem_size; for_each_possible_cpu(cpu) { copy_map_value_long(map, info->percpu_value_buf + off, per_cpu_ptr(pptr, cpu)); check_and_init_map_value(map, info->percpu_value_buf + off); off += size; } ctx.value = info->percpu_value_buf; } } return bpf_iter_run_prog(prog, &ctx); } static int bpf_array_map_seq_show(struct seq_file *seq, void *v) { return __bpf_array_map_seq_show(seq, v); } static void bpf_array_map_seq_stop(struct seq_file *seq, void *v) { if (!v) (void)__bpf_array_map_seq_show(seq, NULL); } static int bpf_iter_init_array_map(void *priv_data, struct bpf_iter_aux_info *aux) { struct bpf_iter_seq_array_map_info *seq_info = priv_data; struct bpf_map *map = aux->map; struct bpf_array *array = container_of(map, struct bpf_array, map); void *value_buf; u32 buf_size; if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) { buf_size = array->elem_size * num_possible_cpus(); value_buf = kmalloc(buf_size, GFP_USER | __GFP_NOWARN); if (!value_buf) return -ENOMEM; seq_info->percpu_value_buf = value_buf; } /* bpf_iter_attach_map() acquires a map uref, and 
the uref may be * released before or in the middle of iterating map elements, so * acquire an extra map uref for iterator. */ bpf_map_inc_with_uref(map); seq_info->map = map; return 0; } static void bpf_iter_fini_array_map(void *priv_data) { struct bpf_iter_seq_array_map_info *seq_info = priv_data; bpf_map_put_with_uref(seq_info->map); kfree(seq_info->percpu_value_buf); } static const struct seq_operations bpf_array_map_seq_ops = { .start = bpf_array_map_seq_start, .next = bpf_array_map_seq_next, .stop = bpf_array_map_seq_stop, .show = bpf_array_map_seq_show, }; static const struct bpf_iter_seq_info iter_seq_info = { .seq_ops = &bpf_array_map_seq_ops, .init_seq_private = bpf_iter_init_array_map, .fini_seq_private = bpf_iter_fini_array_map, .seq_priv_size = sizeof(struct bpf_iter_seq_array_map_info), }; static long bpf_for_each_array_elem(struct bpf_map *map, bpf_callback_t callback_fn, void *callback_ctx, u64 flags) { u32 i, key, num_elems = 0; struct bpf_array *array; bool is_percpu; u64 ret = 0; void *val; cant_migrate(); if (flags != 0) return -EINVAL; is_percpu = map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY; array = container_of(map, struct bpf_array, map); for (i = 0; i < map->max_entries; i++) { if (is_percpu) val = this_cpu_ptr(array->pptrs[i]); else val = array_map_elem_ptr(array, i); num_elems++; key = i; ret = callback_fn((u64)(long)map, (u64)(long)&key, (u64)(long)val, (u64)(long)callback_ctx, 0); /* return value: 0 - continue, 1 - stop and return */ if (ret) break; } return num_elems; } static u64 array_map_mem_usage(const struct bpf_map *map) { struct bpf_array *array = container_of(map, struct bpf_array, map); bool percpu = map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY; u32 elem_size = array->elem_size; u64 entries = map->max_entries; u64 usage = sizeof(*array); if (percpu) { usage += entries * sizeof(void *); usage += entries * elem_size * num_possible_cpus(); } else { if (map->map_flags & BPF_F_MMAPABLE) { usage = PAGE_ALIGN(usage); usage += PAGE_ALIGN(entries * elem_size); } else { usage += entries * elem_size; } } return usage; } BTF_ID_LIST_SINGLE(array_map_btf_ids, struct, bpf_array) const struct bpf_map_ops array_map_ops = { .map_meta_equal = array_map_meta_equal, .map_alloc_check = array_map_alloc_check, .map_alloc = array_map_alloc, .map_free = array_map_free, .map_get_next_key = bpf_array_get_next_key, .map_release_uref = array_map_free_internal_structs, .map_lookup_elem = array_map_lookup_elem, .map_update_elem = array_map_update_elem, .map_delete_elem = array_map_delete_elem, .map_gen_lookup = array_map_gen_lookup, .map_direct_value_addr = array_map_direct_value_addr, .map_direct_value_meta = array_map_direct_value_meta, .map_mmap = array_map_mmap, .map_seq_show_elem = array_map_seq_show_elem, .map_check_btf = array_map_check_btf, .map_lookup_batch = generic_map_lookup_batch, .map_update_batch = generic_map_update_batch, .map_set_for_each_callback_args = map_set_for_each_callback_args, .map_for_each_callback = bpf_for_each_array_elem, .map_mem_usage = array_map_mem_usage, .map_btf_id = &array_map_btf_ids[0], .iter_seq_info = &iter_seq_info, .map_get_hash = &array_map_get_hash, }; const struct bpf_map_ops percpu_array_map_ops = { .map_meta_equal = bpf_map_meta_equal, .map_alloc_check = array_map_alloc_check, .map_alloc = array_map_alloc, .map_free = array_map_free, .map_get_next_key = bpf_array_get_next_key, .map_lookup_elem = percpu_array_map_lookup_elem, .map_gen_lookup = percpu_array_map_gen_lookup, .map_update_elem = array_map_update_elem, .map_delete_elem = 
array_map_delete_elem, .map_lookup_percpu_elem = percpu_array_map_lookup_percpu_elem, .map_seq_show_elem = percpu_array_map_seq_show_elem, .map_check_btf = array_map_check_btf, .map_lookup_batch = generic_map_lookup_batch, .map_update_batch = generic_map_update_batch, .map_set_for_each_callback_args = map_set_for_each_callback_args, .map_for_each_callback = bpf_for_each_array_elem, .map_mem_usage = array_map_mem_usage, .map_btf_id = &array_map_btf_ids[0], .iter_seq_info = &iter_seq_info, }; static int fd_array_map_alloc_check(union bpf_attr *attr) { /* only file descriptors can be stored in this type of map */ if (attr->value_size != sizeof(u32)) return -EINVAL; /* Program read-only/write-only not supported for special maps yet. */ if (attr->map_flags & (BPF_F_RDONLY_PROG | BPF_F_WRONLY_PROG)) return -EINVAL; return array_map_alloc_check(attr); } static void fd_array_map_free(struct bpf_map *map) { struct bpf_array *array = container_of(map, struct bpf_array, map); int i; /* make sure it's empty */ for (i = 0; i < array->map.max_entries; i++) BUG_ON(array->ptrs[i] != NULL); bpf_map_area_free(array); } static void *fd_array_map_lookup_elem(struct bpf_map *map, void *key) { return ERR_PTR(-EOPNOTSUPP); } /* only called from syscall */ int bpf_fd_array_map_lookup_elem(struct bpf_map *map, void *key, u32 *value) { void **elem, *ptr; int ret = 0; if (!map->ops->map_fd_sys_lookup_elem) return -ENOTSUPP; rcu_read_lock(); elem = array_map_lookup_elem(map, key); if (elem && (ptr = READ_ONCE(*elem))) *value = map->ops->map_fd_sys_lookup_elem(ptr); else ret = -ENOENT; rcu_read_unlock(); return ret; } /* only called from syscall */ int bpf_fd_array_map_update_elem(struct bpf_map *map, struct file *map_file, void *key, void *value, u64 map_flags) { struct bpf_array *array = container_of(map, struct bpf_array, map); void *new_ptr, *old_ptr; u32 index = *(u32 *)key, ufd; if (map_flags != BPF_ANY) return -EINVAL; if (index >= array->map.max_entries) return -E2BIG; ufd = *(u32 *)value; new_ptr = map->ops->map_fd_get_ptr(map, map_file, ufd); if (IS_ERR(new_ptr)) return PTR_ERR(new_ptr); if (map->ops->map_poke_run) { mutex_lock(&array->aux->poke_mutex); old_ptr = xchg(array->ptrs + index, new_ptr); map->ops->map_poke_run(map, index, old_ptr, new_ptr); mutex_unlock(&array->aux->poke_mutex); } else { old_ptr = xchg(array->ptrs + index, new_ptr); } if (old_ptr) map->ops->map_fd_put_ptr(map, old_ptr, true); return 0; } static long __fd_array_map_delete_elem(struct bpf_map *map, void *key, bool need_defer) { struct bpf_array *array = container_of(map, struct bpf_array, map); void *old_ptr; u32 index = *(u32 *)key; if (index >= array->map.max_entries) return -E2BIG; if (map->ops->map_poke_run) { mutex_lock(&array->aux->poke_mutex); old_ptr = xchg(array->ptrs + index, NULL); map->ops->map_poke_run(map, index, old_ptr, NULL); mutex_unlock(&array->aux->poke_mutex); } else { old_ptr = xchg(array->ptrs + index, NULL); } if (old_ptr) { map->ops->map_fd_put_ptr(map, old_ptr, need_defer); return 0; } else { return -ENOENT; } } static long fd_array_map_delete_elem(struct bpf_map *map, void *key) { return __fd_array_map_delete_elem(map, key, true); } static void *prog_fd_array_get_ptr(struct bpf_map *map, struct file *map_file, int fd) { struct bpf_prog *prog = bpf_prog_get(fd); bool is_extended; if (IS_ERR(prog)) return prog; if (prog->type == BPF_PROG_TYPE_EXT || !bpf_prog_map_compatible(map, prog)) { bpf_prog_put(prog); return ERR_PTR(-EINVAL); } mutex_lock(&prog->aux->ext_mutex); is_extended = prog->aux->is_extended; if 
(!is_extended) prog->aux->prog_array_member_cnt++; mutex_unlock(&prog->aux->ext_mutex); if (is_extended) { /* Extended prog can not be tail callee. It's to prevent a * potential infinite loop like: * tail callee prog entry -> tail callee prog subprog -> * freplace prog entry --tailcall-> tail callee prog entry. */ bpf_prog_put(prog); return ERR_PTR(-EBUSY); } return prog; } static void prog_fd_array_put_ptr(struct bpf_map *map, void *ptr, bool need_defer) { struct bpf_prog *prog = ptr; mutex_lock(&prog->aux->ext_mutex); prog->aux->prog_array_member_cnt--; mutex_unlock(&prog->aux->ext_mutex); /* bpf_prog is freed after one RCU or tasks trace grace period */ bpf_prog_put(prog); } static u32 prog_fd_array_sys_lookup_elem(void *ptr) { return ((struct bpf_prog *)ptr)->aux->id; } /* decrement refcnt of all bpf_progs that are stored in this map */ static void bpf_fd_array_map_clear(struct bpf_map *map, bool need_defer) { struct bpf_array *array = container_of(map, struct bpf_array, map); int i; for (i = 0; i < array->map.max_entries; i++) __fd_array_map_delete_elem(map, &i, need_defer); } static void prog_array_map_seq_show_elem(struct bpf_map *map, void *key, struct seq_file *m) { void **elem, *ptr; u32 prog_id; rcu_read_lock(); elem = array_map_lookup_elem(map, key); if (elem) { ptr = READ_ONCE(*elem); if (ptr) { seq_printf(m, "%u: ", *(u32 *)key); prog_id = prog_fd_array_sys_lookup_elem(ptr); btf_type_seq_show(map->btf, map->btf_value_type_id, &prog_id, m); seq_putc(m, '\n'); } } rcu_read_unlock(); } struct prog_poke_elem { struct list_head list; struct bpf_prog_aux *aux; }; static int prog_array_map_poke_track(struct bpf_map *map, struct bpf_prog_aux *prog_aux) { struct prog_poke_elem *elem; struct bpf_array_aux *aux; int ret = 0; aux = container_of(map, struct bpf_array, map)->aux; mutex_lock(&aux->poke_mutex); list_for_each_entry(elem, &aux->poke_progs, list) { if (elem->aux == prog_aux) goto out; } elem = kmalloc(sizeof(*elem), GFP_KERNEL); if (!elem) { ret = -ENOMEM; goto out; } INIT_LIST_HEAD(&elem->list); /* We must track the program's aux info at this point in time * since the program pointer itself may not be stable yet, see * also comment in prog_array_map_poke_run(). */ elem->aux = prog_aux; list_add_tail(&elem->list, &aux->poke_progs); out: mutex_unlock(&aux->poke_mutex); return ret; } static void prog_array_map_poke_untrack(struct bpf_map *map, struct bpf_prog_aux *prog_aux) { struct prog_poke_elem *elem, *tmp; struct bpf_array_aux *aux; aux = container_of(map, struct bpf_array, map)->aux; mutex_lock(&aux->poke_mutex); list_for_each_entry_safe(elem, tmp, &aux->poke_progs, list) { if (elem->aux == prog_aux) { list_del_init(&elem->list); kfree(elem); break; } } mutex_unlock(&aux->poke_mutex); } void __weak bpf_arch_poke_desc_update(struct bpf_jit_poke_descriptor *poke, struct bpf_prog *new, struct bpf_prog *old) { WARN_ON_ONCE(1); } static void prog_array_map_poke_run(struct bpf_map *map, u32 key, struct bpf_prog *old, struct bpf_prog *new) { struct prog_poke_elem *elem; struct bpf_array_aux *aux; aux = container_of(map, struct bpf_array, map)->aux; WARN_ON_ONCE(!mutex_is_locked(&aux->poke_mutex)); list_for_each_entry(elem, &aux->poke_progs, list) { struct bpf_jit_poke_descriptor *poke; int i; for (i = 0; i < elem->aux->size_poke_tab; i++) { poke = &elem->aux->poke_tab[i]; /* Few things to be aware of: * * 1) We can only ever access aux in this context, but * not aux->prog since it might not be stable yet and * there could be danger of use after free otherwise. 
* 2) Initially when we start tracking aux, the program * is not JITed yet and also does not have a kallsyms * entry. We skip these as poke->tailcall_target_stable * is not active yet. The JIT will do the final fixup * before setting it stable. The various * poke->tailcall_target_stable are successively * activated, so tail call updates can arrive from here * while JIT is still finishing its final fixup for * non-activated poke entries. * 3) Also programs reaching refcount of zero while patching * is in progress is okay since we're protected under * poke_mutex and untrack the programs before the JIT * buffer is freed. */ if (!READ_ONCE(poke->tailcall_target_stable)) continue; if (poke->reason != BPF_POKE_REASON_TAIL_CALL) continue; if (poke->tail_call.map != map || poke->tail_call.key != key) continue; bpf_arch_poke_desc_update(poke, new, old); } } } static void prog_array_map_clear_deferred(struct work_struct *work) { struct bpf_map *map = container_of(work, struct bpf_array_aux, work)->map; bpf_fd_array_map_clear(map, true); bpf_map_put(map); } static void prog_array_map_clear(struct bpf_map *map) { struct bpf_array_aux *aux = container_of(map, struct bpf_array, map)->aux; bpf_map_inc(map); schedule_work(&aux->work); } static struct bpf_map *prog_array_map_alloc(union bpf_attr *attr) { struct bpf_array_aux *aux; struct bpf_map *map; aux = kzalloc(sizeof(*aux), GFP_KERNEL_ACCOUNT); if (!aux) return ERR_PTR(-ENOMEM); INIT_WORK(&aux->work, prog_array_map_clear_deferred); INIT_LIST_HEAD(&aux->poke_progs); mutex_init(&aux->poke_mutex); map = array_map_alloc(attr); if (IS_ERR(map)) { kfree(aux); return map; } container_of(map, struct bpf_array, map)->aux = aux; aux->map = map; return map; } static void prog_array_map_free(struct bpf_map *map) { struct prog_poke_elem *elem, *tmp; struct bpf_array_aux *aux; aux = container_of(map, struct bpf_array, map)->aux; list_for_each_entry_safe(elem, tmp, &aux->poke_progs, list) { list_del_init(&elem->list); kfree(elem); } kfree(aux); fd_array_map_free(map); } /* prog_array->aux->{type,jited} is a runtime binding. * Doing static check alone in the verifier is not enough. * Thus, prog_array_map cannot be used as an inner_map * and map_meta_equal is not implemented. 
*/ const struct bpf_map_ops prog_array_map_ops = { .map_alloc_check = fd_array_map_alloc_check, .map_alloc = prog_array_map_alloc, .map_free = prog_array_map_free, .map_poke_track = prog_array_map_poke_track, .map_poke_untrack = prog_array_map_poke_untrack, .map_poke_run = prog_array_map_poke_run, .map_get_next_key = bpf_array_get_next_key, .map_lookup_elem = fd_array_map_lookup_elem, .map_delete_elem = fd_array_map_delete_elem, .map_fd_get_ptr = prog_fd_array_get_ptr, .map_fd_put_ptr = prog_fd_array_put_ptr, .map_fd_sys_lookup_elem = prog_fd_array_sys_lookup_elem, .map_release_uref = prog_array_map_clear, .map_seq_show_elem = prog_array_map_seq_show_elem, .map_mem_usage = array_map_mem_usage, .map_btf_id = &array_map_btf_ids[0], }; static struct bpf_event_entry *bpf_event_entry_gen(struct file *perf_file, struct file *map_file) { struct bpf_event_entry *ee; ee = kzalloc(sizeof(*ee), GFP_KERNEL); if (ee) { ee->event = perf_file->private_data; ee->perf_file = perf_file; ee->map_file = map_file; } return ee; } static void __bpf_event_entry_free(struct rcu_head *rcu) { struct bpf_event_entry *ee; ee = container_of(rcu, struct bpf_event_entry, rcu); fput(ee->perf_file); kfree(ee); } static void bpf_event_entry_free_rcu(struct bpf_event_entry *ee) { call_rcu(&ee->rcu, __bpf_event_entry_free); } static void *perf_event_fd_array_get_ptr(struct bpf_map *map, struct file *map_file, int fd) { struct bpf_event_entry *ee; struct perf_event *event; struct file *perf_file; u64 value; perf_file = perf_event_get(fd); if (IS_ERR(perf_file)) return perf_file; ee = ERR_PTR(-EOPNOTSUPP); event = perf_file->private_data; if (perf_event_read_local(event, &value, NULL, NULL) == -EOPNOTSUPP) goto err_out; ee = bpf_event_entry_gen(perf_file, map_file); if (ee) return ee; ee = ERR_PTR(-ENOMEM); err_out: fput(perf_file); return ee; } static void perf_event_fd_array_put_ptr(struct bpf_map *map, void *ptr, bool need_defer) { /* bpf_perf_event is freed after one RCU grace period */ bpf_event_entry_free_rcu(ptr); } static void perf_event_fd_array_release(struct bpf_map *map, struct file *map_file) { struct bpf_array *array = container_of(map, struct bpf_array, map); struct bpf_event_entry *ee; int i; if (map->map_flags & BPF_F_PRESERVE_ELEMS) return; rcu_read_lock(); for (i = 0; i < array->map.max_entries; i++) { ee = READ_ONCE(array->ptrs[i]); if (ee && ee->map_file == map_file) __fd_array_map_delete_elem(map, &i, true); } rcu_read_unlock(); } static void perf_event_fd_array_map_free(struct bpf_map *map) { if (map->map_flags & BPF_F_PRESERVE_ELEMS) bpf_fd_array_map_clear(map, false); fd_array_map_free(map); } const struct bpf_map_ops perf_event_array_map_ops = { .map_meta_equal = bpf_map_meta_equal, .map_alloc_check = fd_array_map_alloc_check, .map_alloc = array_map_alloc, .map_free = perf_event_fd_array_map_free, .map_get_next_key = bpf_array_get_next_key, .map_lookup_elem = fd_array_map_lookup_elem, .map_delete_elem = fd_array_map_delete_elem, .map_fd_get_ptr = perf_event_fd_array_get_ptr, .map_fd_put_ptr = perf_event_fd_array_put_ptr, .map_release = perf_event_fd_array_release, .map_check_btf = map_check_no_btf, .map_mem_usage = array_map_mem_usage, .map_btf_id = &array_map_btf_ids[0], }; #ifdef CONFIG_CGROUPS static void *cgroup_fd_array_get_ptr(struct bpf_map *map, struct file *map_file /* not used */, int fd) { return cgroup_get_from_fd(fd); } static void cgroup_fd_array_put_ptr(struct bpf_map *map, void *ptr, bool need_defer) { /* cgroup_put free cgrp after a rcu grace period */ cgroup_put(ptr); } static void 
cgroup_fd_array_free(struct bpf_map *map) { bpf_fd_array_map_clear(map, false); fd_array_map_free(map); } const struct bpf_map_ops cgroup_array_map_ops = { .map_meta_equal = bpf_map_meta_equal, .map_alloc_check = fd_array_map_alloc_check, .map_alloc = array_map_alloc, .map_free = cgroup_fd_array_free, .map_get_next_key = bpf_array_get_next_key, .map_lookup_elem = fd_array_map_lookup_elem, .map_delete_elem = fd_array_map_delete_elem, .map_fd_get_ptr = cgroup_fd_array_get_ptr, .map_fd_put_ptr = cgroup_fd_array_put_ptr, .map_check_btf = map_check_no_btf, .map_mem_usage = array_map_mem_usage, .map_btf_id = &array_map_btf_ids[0], }; #endif static struct bpf_map *array_of_map_alloc(union bpf_attr *attr) { struct bpf_map *map, *inner_map_meta; inner_map_meta = bpf_map_meta_alloc(attr->inner_map_fd); if (IS_ERR(inner_map_meta)) return inner_map_meta; map = array_map_alloc(attr); if (IS_ERR(map)) { bpf_map_meta_free(inner_map_meta); return map; } map->inner_map_meta = inner_map_meta; return map; } static void array_of_map_free(struct bpf_map *map) { /* map->inner_map_meta is only accessed by syscall which * is protected by fdget/fdput. */ bpf_map_meta_free(map->inner_map_meta); bpf_fd_array_map_clear(map, false); fd_array_map_free(map); } static void *array_of_map_lookup_elem(struct bpf_map *map, void *key) { struct bpf_map **inner_map = array_map_lookup_elem(map, key); if (!inner_map) return NULL; return READ_ONCE(*inner_map); } static int array_of_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf) { struct bpf_array *array = container_of(map, struct bpf_array, map); u32 elem_size = array->elem_size; struct bpf_insn *insn = insn_buf; const int ret = BPF_REG_0; const int map_ptr = BPF_REG_1; const int index = BPF_REG_2; *insn++ = BPF_ALU64_IMM(BPF_ADD, map_ptr, offsetof(struct bpf_array, value)); *insn++ = BPF_LDX_MEM(BPF_W, ret, index, 0); if (!map->bypass_spec_v1) { *insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 6); *insn++ = BPF_ALU32_IMM(BPF_AND, ret, array->index_mask); } else { *insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 5); } if (is_power_of_2(elem_size)) *insn++ = BPF_ALU64_IMM(BPF_LSH, ret, ilog2(elem_size)); else *insn++ = BPF_ALU64_IMM(BPF_MUL, ret, elem_size); *insn++ = BPF_ALU64_REG(BPF_ADD, ret, map_ptr); *insn++ = BPF_LDX_MEM(BPF_DW, ret, ret, 0); *insn++ = BPF_JMP_IMM(BPF_JEQ, ret, 0, 1); *insn++ = BPF_JMP_IMM(BPF_JA, 0, 0, 1); *insn++ = BPF_MOV64_IMM(ret, 0); return insn - insn_buf; } const struct bpf_map_ops array_of_maps_map_ops = { .map_alloc_check = fd_array_map_alloc_check, .map_alloc = array_of_map_alloc, .map_free = array_of_map_free, .map_get_next_key = bpf_array_get_next_key, .map_lookup_elem = array_of_map_lookup_elem, .map_delete_elem = fd_array_map_delete_elem, .map_fd_get_ptr = bpf_map_fd_get_ptr, .map_fd_put_ptr = bpf_map_fd_put_ptr, .map_fd_sys_lookup_elem = bpf_map_fd_sys_lookup_elem, .map_gen_lookup = array_of_map_gen_lookup, .map_lookup_batch = generic_map_lookup_batch, .map_update_batch = generic_map_update_batch, .map_check_btf = map_check_no_btf, .map_mem_usage = array_map_mem_usage, .map_btf_id = &array_map_btf_ids[0], };
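The array map above is driven from user space through the bpf(2) syscall; the ops tables just wire those commands to the functions defined earlier. A minimal, illustrative sketch (map size and values are made up, not taken from the file above) of creating a BPF_MAP_TYPE_ARRAY, updating one slot and reading it back:

#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/bpf.h>

static long sys_bpf(enum bpf_cmd cmd, union bpf_attr *attr)
{
	return syscall(__NR_bpf, cmd, attr, sizeof(*attr));
}

int main(void)
{
	union bpf_attr attr;
	uint32_t key = 3;
	uint64_t val = 42, out = 0;
	int map_fd;

	/* key_size must be 4 and value_size non-zero to pass
	 * array_map_alloc_check() in the file above. */
	memset(&attr, 0, sizeof(attr));
	attr.map_type = BPF_MAP_TYPE_ARRAY;
	attr.key_size = sizeof(key);
	attr.value_size = sizeof(val);
	attr.max_entries = 16;
	map_fd = sys_bpf(BPF_MAP_CREATE, &attr);
	if (map_fd < 0)
		return 1;

	/* Slots are preallocated: BPF_ANY/BPF_EXIST succeed, while
	 * BPF_NOEXIST is rejected with -EEXIST by array_map_update_elem(). */
	memset(&attr, 0, sizeof(attr));
	attr.map_fd = map_fd;
	attr.key = (uintptr_t)&key;
	attr.value = (uintptr_t)&val;
	attr.flags = BPF_ANY;
	if (sys_bpf(BPF_MAP_UPDATE_ELEM, &attr))
		return 1;

	memset(&attr, 0, sizeof(attr));
	attr.map_fd = map_fd;
	attr.key = (uintptr_t)&key;
	attr.value = (uintptr_t)&out;
	if (sys_bpf(BPF_MAP_LOOKUP_ELEM, &attr))
		return 1;

	printf("slot %u = %llu\n", key, (unsigned long long)out);
	close(map_fd);
	return 0;
}

Because every element is preallocated by array_map_alloc(), updates never allocate memory; a map created with BPF_F_MMAPABLE can additionally be mapped into user space with mmap() on the map fd, which is what array_map_mmap() above serves.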
/* SPDX-License-Identifier: GPL-2.0 */

#undef TRACE_SYSTEM
#define TRACE_SYSTEM mctp

#if !defined(_TRACE_MCTP_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_MCTP_H

#include <linux/tracepoint.h>

#ifndef __TRACE_MCTP_ENUMS
#define __TRACE_MCTP_ENUMS
enum {
	MCTP_TRACE_KEY_TIMEOUT,
	MCTP_TRACE_KEY_REPLIED,
	MCTP_TRACE_KEY_INVALIDATED,
	MCTP_TRACE_KEY_CLOSED,
	MCTP_TRACE_KEY_DROPPED,
};
#endif /* __TRACE_MCTP_ENUMS */

TRACE_DEFINE_ENUM(MCTP_TRACE_KEY_TIMEOUT);
TRACE_DEFINE_ENUM(MCTP_TRACE_KEY_REPLIED);
TRACE_DEFINE_ENUM(MCTP_TRACE_KEY_INVALIDATED);
TRACE_DEFINE_ENUM(MCTP_TRACE_KEY_CLOSED);
TRACE_DEFINE_ENUM(MCTP_TRACE_KEY_DROPPED);

TRACE_EVENT(mctp_key_acquire,
	TP_PROTO(const struct mctp_sk_key *key),
	TP_ARGS(key),
	TP_STRUCT__entry(
		__field(__u8, paddr)
		__field(__u8, laddr)
		__field(__u8, tag)
	),
	TP_fast_assign(
		__entry->paddr = key->peer_addr;
		__entry->laddr = key->local_addr;
		__entry->tag = key->tag;
	),
	TP_printk("local %d, peer %d, tag %1x",
		__entry->laddr, __entry->paddr, __entry->tag
	)
);

TRACE_EVENT(mctp_key_release,
	TP_PROTO(const struct mctp_sk_key *key, int reason),
	TP_ARGS(key, reason),
	TP_STRUCT__entry(
		__field(__u8, paddr)
		__field(__u8, laddr)
		__field(__u8, tag)
		__field(int, reason)
	),
	TP_fast_assign(
		__entry->paddr = key->peer_addr;
		__entry->laddr = key->local_addr;
		__entry->tag = key->tag;
		__entry->reason = reason;
	),
	TP_printk("local %d, peer %d, tag %1x %s",
		__entry->laddr, __entry->paddr, __entry->tag,
		__print_symbolic(__entry->reason,
			{ MCTP_TRACE_KEY_TIMEOUT, "timeout" },
			{ MCTP_TRACE_KEY_REPLIED, "replied" },
			{ MCTP_TRACE_KEY_INVALIDATED, "invalidated" },
			{ MCTP_TRACE_KEY_CLOSED, "closed" },
			{ MCTP_TRACE_KEY_DROPPED, "dropped" })
	)
);

#endif

#include <trace/define_trace.h>
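The two tracepoints above become visible under tracefs once this header is built in. A small sketch, assuming tracefs is mounted at /sys/kernel/tracing (older setups use /sys/kernel/debug/tracing), that enables the mctp events and streams the formatted records:

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	char buf[4096];
	ssize_t n;
	int fd;

	/* Turn on both mctp_key_acquire and mctp_key_release */
	fd = open("/sys/kernel/tracing/events/mctp/enable", O_WRONLY);
	if (fd < 0 || write(fd, "1", 1) != 1)
		return 1;
	close(fd);

	/* trace_pipe blocks until one of the tracepoints fires */
	fd = open("/sys/kernel/tracing/trace_pipe", O_RDONLY);
	if (fd < 0)
		return 1;
	while ((n = read(fd, buf, sizeof(buf))) > 0)
		fwrite(buf, 1, n, stdout);
	close(fd);
	return 0;
}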
2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 // SPDX-License-Identifier: GPL-2.0-or-later /* * X.25 Packet Layer release 002 * * This is ALPHA test software. This code may break your machine, randomly fail to work with new * releases, misbehave and/or generally screw up. It might even work. * * This code REQUIRES 2.1.15 or higher * * History * X.25 001 Jonathan Naylor Started coding. * 2000-09-04 Henner Eisen Prevent freeing a dangling skb. */ #define pr_fmt(fmt) "X25: " fmt #include <linux/kernel.h> #include <linux/netdevice.h> #include <linux/skbuff.h> #include <linux/slab.h> #include <net/sock.h> #include <linux/if_arp.h> #include <net/x25.h> #include <net/x25device.h> static int x25_receive_data(struct sk_buff *skb, struct x25_neigh *nb) { struct sock *sk; unsigned short frametype; unsigned int lci; if (!pskb_may_pull(skb, X25_STD_MIN_LEN)) return 0; frametype = skb->data[2]; lci = ((skb->data[0] << 8) & 0xF00) + ((skb->data[1] << 0) & 0x0FF); /* * LCI of zero is always for us, and its always a link control * frame. */ if (lci == 0) { x25_link_control(skb, nb, frametype); return 0; } /* * Find an existing socket. */ if ((sk = x25_find_socket(lci, nb)) != NULL) { int queued = 1; skb_reset_transport_header(skb); bh_lock_sock(sk); if (!sock_owned_by_user(sk)) { queued = x25_process_rx_frame(sk, skb); } else { queued = !sk_add_backlog(sk, skb, READ_ONCE(sk->sk_rcvbuf)); } bh_unlock_sock(sk); sock_put(sk); return queued; } /* * Is is a Call Request ? if so process it. */ if (frametype == X25_CALL_REQUEST) return x25_rx_call_request(skb, nb, lci); /* * Its not a Call Request, nor is it a control frame. * Can we forward it? */ if (x25_forward_data(lci, nb, skb)) { if (frametype == X25_CLEAR_CONFIRMATION) { x25_clear_forward_by_lci(lci); } kfree_skb(skb); return 1; } /* x25_transmit_clear_request(nb, lci, 0x0D); */ if (frametype != X25_CLEAR_CONFIRMATION) pr_debug("x25_receive_data(): unknown frame type %2x\n",frametype); return 0; } int x25_lapb_receive_frame(struct sk_buff *skb, struct net_device *dev, struct packet_type *ptype, struct net_device *orig_dev) { struct sk_buff *nskb; struct x25_neigh *nb; if (!net_eq(dev_net(dev), &init_net)) goto drop; nskb = skb_copy(skb, GFP_ATOMIC); if (!nskb) goto drop; kfree_skb(skb); skb = nskb; /* * Packet received from unrecognised device, throw it away. 
*/ nb = x25_get_neigh(dev); if (!nb) { pr_debug("unknown neighbour - %s\n", dev->name); goto drop; } if (!pskb_may_pull(skb, 1)) { x25_neigh_put(nb); goto drop; } switch (skb->data[0]) { case X25_IFACE_DATA: skb_pull(skb, 1); if (x25_receive_data(skb, nb)) { x25_neigh_put(nb); goto out; } break; case X25_IFACE_CONNECT: x25_link_established(nb); break; case X25_IFACE_DISCONNECT: x25_link_terminated(nb); break; } x25_neigh_put(nb); drop: kfree_skb(skb); out: return 0; } void x25_establish_link(struct x25_neigh *nb) { struct sk_buff *skb; unsigned char *ptr; switch (nb->dev->type) { case ARPHRD_X25: if ((skb = alloc_skb(1, GFP_ATOMIC)) == NULL) { pr_err("x25_dev: out of memory\n"); return; } ptr = skb_put(skb, 1); *ptr = X25_IFACE_CONNECT; break; default: return; } skb->protocol = htons(ETH_P_X25); skb->dev = nb->dev; dev_queue_xmit(skb); } void x25_send_frame(struct sk_buff *skb, struct x25_neigh *nb) { unsigned char *dptr; skb_reset_network_header(skb); switch (nb->dev->type) { case ARPHRD_X25: dptr = skb_push(skb, 1); *dptr = X25_IFACE_DATA; break; default: kfree_skb(skb); return; } skb->protocol = htons(ETH_P_X25); skb->dev = nb->dev; dev_queue_xmit(skb); }
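The LCI decoding in x25_receive_data() above packs a 4-bit logical channel group and an 8-bit channel number into one 12-bit value; the GFI bits in the high nibble of the first octet are masked off. A standalone sketch of the same arithmetic:

#include <stdio.h>

static unsigned int x25_lci(const unsigned char *hdr)
{
	/* low nibble of octet 0 is the group, octet 1 is the channel */
	return ((hdr[0] << 8) & 0xF00) | (hdr[1] & 0x0FF);
}

int main(void)
{
	/* GFI in the high nibble is ignored; 0x4A 0x2F -> LCI 0xA2F */
	unsigned char hdr[2] = { 0x4A, 0x2F };

	printf("LCI = 0x%03x\n", x25_lci(hdr));
	return 0;
}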
// SPDX-License-Identifier: GPL-2.0
/* Lock down the kernel
 *
 * Copyright (C) 2016 Red Hat, Inc. All Rights Reserved.
 * Written by David Howells (dhowells@redhat.com)
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public Licence
 * as published by the Free Software Foundation; either version
 * 2 of the Licence, or (at your option) any later version.
 */

#include <linux/security.h>
#include <linux/export.h>
#include <linux/lsm_hooks.h>
#include <uapi/linux/lsm.h>

static enum lockdown_reason kernel_locked_down;

static const enum lockdown_reason lockdown_levels[] = {LOCKDOWN_NONE,
						       LOCKDOWN_INTEGRITY_MAX,
						       LOCKDOWN_CONFIDENTIALITY_MAX};

/*
 * Put the kernel into lock-down mode.
 */
static int lock_kernel_down(const char *where, enum lockdown_reason level)
{
	if (kernel_locked_down >= level)
		return -EPERM;

	kernel_locked_down = level;
	pr_notice("Kernel is locked down from %s; see man kernel_lockdown.7\n",
		  where);
	return 0;
}

static int __init lockdown_param(char *level)
{
	if (!level)
		return -EINVAL;

	if (strcmp(level, "integrity") == 0)
		lock_kernel_down("command line", LOCKDOWN_INTEGRITY_MAX);
	else if (strcmp(level, "confidentiality") == 0)
		lock_kernel_down("command line", LOCKDOWN_CONFIDENTIALITY_MAX);
	else
		return -EINVAL;

	return 0;
}
early_param("lockdown", lockdown_param);

/**
 * lockdown_is_locked_down - Find out if the kernel is locked down
 * @what: Tag to use in notice generated if lockdown is in effect
 */
static int lockdown_is_locked_down(enum lockdown_reason what)
{
	if (WARN(what >= LOCKDOWN_CONFIDENTIALITY_MAX,
		 "Invalid lockdown reason"))
		return -EPERM;

	if (kernel_locked_down >= what) {
		if (lockdown_reasons[what])
			pr_notice_ratelimited("Lockdown: %s: %s is restricted; see man kernel_lockdown.7\n",
				  current->comm, lockdown_reasons[what]);
		return -EPERM;
	}

	return 0;
}

static struct security_hook_list lockdown_hooks[] __ro_after_init = {
	LSM_HOOK_INIT(locked_down, lockdown_is_locked_down),
};

static const struct lsm_id lockdown_lsmid = {
	.name = "lockdown",
	.id = LSM_ID_LOCKDOWN,
};

static int __init lockdown_lsm_init(void)
{
#if defined(CONFIG_LOCK_DOWN_KERNEL_FORCE_INTEGRITY)
	lock_kernel_down("Kernel configuration", LOCKDOWN_INTEGRITY_MAX);
#elif defined(CONFIG_LOCK_DOWN_KERNEL_FORCE_CONFIDENTIALITY)
	lock_kernel_down("Kernel configuration", LOCKDOWN_CONFIDENTIALITY_MAX);
#endif
	security_add_hooks(lockdown_hooks, ARRAY_SIZE(lockdown_hooks),
			   &lockdown_lsmid);
	return 0;
}

static ssize_t lockdown_read(struct file *filp, char __user *buf, size_t count,
			     loff_t *ppos)
{
	char temp[80] = "";
	int i, offset = 0;

	for (i = 0; i < ARRAY_SIZE(lockdown_levels); i++) {
		enum lockdown_reason level = lockdown_levels[i];

		if (lockdown_reasons[level]) {
			const char *label = lockdown_reasons[level];

			if (kernel_locked_down == level)
				offset += sprintf(temp+offset, "[%s] ", label);
			else
				offset += sprintf(temp+offset, "%s ", label);
		}
	}

	/* Convert the last space to a newline if
needed. */ if (offset > 0) temp[offset-1] = '\n'; return simple_read_from_buffer(buf, count, ppos, temp, strlen(temp)); } static ssize_t lockdown_write(struct file *file, const char __user *buf, size_t n, loff_t *ppos) { char *state; int i, len, err = -EINVAL; state = memdup_user_nul(buf, n); if (IS_ERR(state)) return PTR_ERR(state); len = strlen(state); if (len && state[len-1] == '\n') { state[len-1] = '\0'; len--; } for (i = 0; i < ARRAY_SIZE(lockdown_levels); i++) { enum lockdown_reason level = lockdown_levels[i]; const char *label = lockdown_reasons[level]; if (label && !strcmp(state, label)) err = lock_kernel_down("securityfs", level); } kfree(state); return err ? err : n; } static const struct file_operations lockdown_ops = { .read = lockdown_read, .write = lockdown_write, }; static int __init lockdown_secfs_init(void) { struct dentry *dentry; dentry = securityfs_create_file("lockdown", 0644, NULL, NULL, &lockdown_ops); return PTR_ERR_OR_ZERO(dentry); } #ifdef CONFIG_SECURITY_LOCKDOWN_LSM_EARLY DEFINE_EARLY_LSM(lockdown) = { #else DEFINE_LSM(lockdown) = { #endif .id = &lockdown_lsmid, .init = lockdown_lsm_init, .initcall_core = lockdown_secfs_init, };
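User space reaches this LSM through the securityfs file created by lockdown_secfs_init(); assuming securityfs is mounted at /sys/kernel/security, a small sketch that prints the current level and optionally raises it (lowering, or re-setting the same level, is rejected by lock_kernel_down() with -EPERM):

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(int argc, char **argv)
{
	const char *path = "/sys/kernel/security/lockdown";
	char buf[128];
	ssize_t n;
	int fd;

	fd = open(path, O_RDONLY);
	if (fd < 0)
		return 1;
	n = read(fd, buf, sizeof(buf) - 1);
	close(fd);
	if (n < 0)
		return 1;
	buf[n] = '\0';
	printf("%s", buf);	/* e.g. "[none] integrity confidentiality" */

	if (argc > 1) {		/* e.g. pass "integrity" to raise the level */
		fd = open(path, O_WRONLY);
		if (fd < 0 || write(fd, argv[1], strlen(argv[1])) < 0)
			return 1;
		close(fd);
	}
	return 0;
}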
1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 // SPDX-License-Identifier: GPL-2.0 /* Multipath TCP * * Copyright (c) 2017 - 2019, Intel Corporation. */ #define pr_fmt(fmt) "MPTCP: " fmt #include <linux/kernel.h> #include <crypto/sha2.h> #include <net/tcp.h> #include <net/mptcp.h> #include "protocol.h" #include "mib.h" #include <trace/events/mptcp.h> static bool mptcp_cap_flag_sha256(u8 flags) { return (flags & MPTCP_CAP_FLAG_MASK) == MPTCP_CAP_HMAC_SHA256; } static void mptcp_parse_option(const struct sk_buff *skb, const unsigned char *ptr, int opsize, struct mptcp_options_received *mp_opt) { u8 subtype = *ptr >> 4; int expected_opsize; u16 subopt; u8 version; u8 flags; u8 i; switch (subtype) { case MPTCPOPT_MP_CAPABLE: /* strict size checking */ if (!(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)) { if (skb->len > tcp_hdr(skb)->doff << 2) expected_opsize = TCPOLEN_MPTCP_MPC_ACK_DATA; else expected_opsize = TCPOLEN_MPTCP_MPC_ACK; subopt = OPTION_MPTCP_MPC_ACK; } else { if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_ACK) { expected_opsize = TCPOLEN_MPTCP_MPC_SYNACK; subopt = OPTION_MPTCP_MPC_SYNACK; } else { expected_opsize = TCPOLEN_MPTCP_MPC_SYN; subopt = OPTION_MPTCP_MPC_SYN; } } /* Cfr RFC 8684 Section 3.3.0: * If a checksum is present but its use had * not been negotiated in the MP_CAPABLE handshake, the receiver MUST * close the subflow with a RST, as it is not behaving as negotiated. * If a checksum is not present when its use has been negotiated, the * receiver MUST close the subflow with a RST, as it is considered * broken * We parse even option with mismatching csum presence, so that * later in subflow_data_ready we can trigger the reset. */ if (opsize != expected_opsize && (expected_opsize != TCPOLEN_MPTCP_MPC_ACK_DATA || opsize != TCPOLEN_MPTCP_MPC_ACK_DATA_CSUM)) break; /* try to be gentle vs future versions on the initial syn */ version = *ptr++ & MPTCP_VERSION_MASK; if (opsize != TCPOLEN_MPTCP_MPC_SYN) { if (version != MPTCP_SUPPORTED_VERSION) break; } else if (version < MPTCP_SUPPORTED_VERSION) { break; } flags = *ptr++; if (!mptcp_cap_flag_sha256(flags) || (flags & MPTCP_CAP_EXTENSIBILITY)) break; /* RFC 6824, Section 3.1: * "For the Checksum Required bit (labeled "A"), if either * host requires the use of checksums, checksums MUST be used. * In other words, the only way for checksums not to be used * is if both hosts in their SYNs set A=0." */ if (flags & MPTCP_CAP_CHECKSUM_REQD) mp_opt->suboptions |= OPTION_MPTCP_CSUMREQD; mp_opt->deny_join_id0 = !!(flags & MPTCP_CAP_DENY_JOIN_ID0); mp_opt->suboptions |= subopt; if (opsize >= TCPOLEN_MPTCP_MPC_SYNACK) { mp_opt->sndr_key = get_unaligned_be64(ptr); ptr += 8; } if (opsize >= TCPOLEN_MPTCP_MPC_ACK) { mp_opt->rcvr_key = get_unaligned_be64(ptr); ptr += 8; } if (opsize >= TCPOLEN_MPTCP_MPC_ACK_DATA) { /* Section 3.1.: * "the data parameters in a MP_CAPABLE are semantically * equivalent to those in a DSS option and can be used * interchangeably." 
*/ mp_opt->suboptions |= OPTION_MPTCP_DSS; mp_opt->use_map = 1; mp_opt->mpc_map = 1; mp_opt->data_len = get_unaligned_be16(ptr); ptr += 2; } if (opsize == TCPOLEN_MPTCP_MPC_ACK_DATA_CSUM) { mp_opt->csum = get_unaligned((__force __sum16 *)ptr); mp_opt->suboptions |= OPTION_MPTCP_CSUMREQD; ptr += 2; } pr_debug("MP_CAPABLE version=%x, flags=%x, optlen=%d sndr=%llu, rcvr=%llu len=%d csum=%u\n", version, flags, opsize, mp_opt->sndr_key, mp_opt->rcvr_key, mp_opt->data_len, mp_opt->csum); break; case MPTCPOPT_MP_JOIN: if (opsize == TCPOLEN_MPTCP_MPJ_SYN) { mp_opt->suboptions |= OPTION_MPTCP_MPJ_SYN; mp_opt->backup = *ptr++ & MPTCPOPT_BACKUP; mp_opt->join_id = *ptr++; mp_opt->token = get_unaligned_be32(ptr); ptr += 4; mp_opt->nonce = get_unaligned_be32(ptr); ptr += 4; pr_debug("MP_JOIN bkup=%u, id=%u, token=%u, nonce=%u\n", mp_opt->backup, mp_opt->join_id, mp_opt->token, mp_opt->nonce); } else if (opsize == TCPOLEN_MPTCP_MPJ_SYNACK) { mp_opt->suboptions |= OPTION_MPTCP_MPJ_SYNACK; mp_opt->backup = *ptr++ & MPTCPOPT_BACKUP; mp_opt->join_id = *ptr++; mp_opt->thmac = get_unaligned_be64(ptr); ptr += 8; mp_opt->nonce = get_unaligned_be32(ptr); ptr += 4; pr_debug("MP_JOIN bkup=%u, id=%u, thmac=%llu, nonce=%u\n", mp_opt->backup, mp_opt->join_id, mp_opt->thmac, mp_opt->nonce); } else if (opsize == TCPOLEN_MPTCP_MPJ_ACK) { mp_opt->suboptions |= OPTION_MPTCP_MPJ_ACK; ptr += 2; memcpy(mp_opt->hmac, ptr, MPTCPOPT_HMAC_LEN); pr_debug("MP_JOIN hmac\n"); } break; case MPTCPOPT_DSS: pr_debug("DSS\n"); ptr++; flags = (*ptr++) & MPTCP_DSS_FLAG_MASK; mp_opt->data_fin = (flags & MPTCP_DSS_DATA_FIN) != 0; mp_opt->dsn64 = (flags & MPTCP_DSS_DSN64) != 0; mp_opt->use_map = (flags & MPTCP_DSS_HAS_MAP) != 0; mp_opt->ack64 = (flags & MPTCP_DSS_ACK64) != 0; mp_opt->use_ack = (flags & MPTCP_DSS_HAS_ACK); pr_debug("data_fin=%d dsn64=%d use_map=%d ack64=%d use_ack=%d\n", mp_opt->data_fin, mp_opt->dsn64, mp_opt->use_map, mp_opt->ack64, mp_opt->use_ack); expected_opsize = TCPOLEN_MPTCP_DSS_BASE; if (mp_opt->use_ack) { if (mp_opt->ack64) expected_opsize += TCPOLEN_MPTCP_DSS_ACK64; else expected_opsize += TCPOLEN_MPTCP_DSS_ACK32; } if (mp_opt->use_map) { if (mp_opt->dsn64) expected_opsize += TCPOLEN_MPTCP_DSS_MAP64; else expected_opsize += TCPOLEN_MPTCP_DSS_MAP32; } /* Always parse any csum presence combination, we will enforce * RFC 8684 Section 3.3.0 checks later in subflow_data_ready */ if (opsize != expected_opsize && opsize != expected_opsize + TCPOLEN_MPTCP_DSS_CHECKSUM) break; mp_opt->suboptions |= OPTION_MPTCP_DSS; if (mp_opt->use_ack) { if (mp_opt->ack64) { mp_opt->data_ack = get_unaligned_be64(ptr); ptr += 8; } else { mp_opt->data_ack = get_unaligned_be32(ptr); ptr += 4; } pr_debug("data_ack=%llu\n", mp_opt->data_ack); } if (mp_opt->use_map) { if (mp_opt->dsn64) { mp_opt->data_seq = get_unaligned_be64(ptr); ptr += 8; } else { mp_opt->data_seq = get_unaligned_be32(ptr); ptr += 4; } mp_opt->subflow_seq = get_unaligned_be32(ptr); ptr += 4; mp_opt->data_len = get_unaligned_be16(ptr); ptr += 2; if (opsize == expected_opsize + TCPOLEN_MPTCP_DSS_CHECKSUM) { mp_opt->suboptions |= OPTION_MPTCP_CSUMREQD; mp_opt->csum = get_unaligned((__force __sum16 *)ptr); ptr += 2; } pr_debug("data_seq=%llu subflow_seq=%u data_len=%u csum=%d:%u\n", mp_opt->data_seq, mp_opt->subflow_seq, mp_opt->data_len, !!(mp_opt->suboptions & OPTION_MPTCP_CSUMREQD), mp_opt->csum); } break; case MPTCPOPT_ADD_ADDR: mp_opt->echo = (*ptr++) & MPTCP_ADDR_ECHO; if (!mp_opt->echo) { if (opsize == TCPOLEN_MPTCP_ADD_ADDR || opsize == TCPOLEN_MPTCP_ADD_ADDR_PORT) 
mp_opt->addr.family = AF_INET; #if IS_ENABLED(CONFIG_MPTCP_IPV6) else if (opsize == TCPOLEN_MPTCP_ADD_ADDR6 || opsize == TCPOLEN_MPTCP_ADD_ADDR6_PORT) mp_opt->addr.family = AF_INET6; #endif else break; } else { if (opsize == TCPOLEN_MPTCP_ADD_ADDR_BASE || opsize == TCPOLEN_MPTCP_ADD_ADDR_BASE_PORT) mp_opt->addr.family = AF_INET; #if IS_ENABLED(CONFIG_MPTCP_IPV6) else if (opsize == TCPOLEN_MPTCP_ADD_ADDR6_BASE || opsize == TCPOLEN_MPTCP_ADD_ADDR6_BASE_PORT) mp_opt->addr.family = AF_INET6; #endif else break; } mp_opt->suboptions |= OPTION_MPTCP_ADD_ADDR; mp_opt->addr.id = *ptr++; mp_opt->addr.port = 0; mp_opt->ahmac = 0; if (mp_opt->addr.family == AF_INET) { memcpy((u8 *)&mp_opt->addr.addr.s_addr, (u8 *)ptr, 4); ptr += 4; if (opsize == TCPOLEN_MPTCP_ADD_ADDR_PORT || opsize == TCPOLEN_MPTCP_ADD_ADDR_BASE_PORT) { mp_opt->addr.port = htons(get_unaligned_be16(ptr)); ptr += 2; } } #if IS_ENABLED(CONFIG_MPTCP_IPV6) else { memcpy(mp_opt->addr.addr6.s6_addr, (u8 *)ptr, 16); ptr += 16; if (opsize == TCPOLEN_MPTCP_ADD_ADDR6_PORT || opsize == TCPOLEN_MPTCP_ADD_ADDR6_BASE_PORT) { mp_opt->addr.port = htons(get_unaligned_be16(ptr)); ptr += 2; } } #endif if (!mp_opt->echo) { mp_opt->ahmac = get_unaligned_be64(ptr); ptr += 8; } pr_debug("ADD_ADDR%s: id=%d, ahmac=%llu, echo=%d, port=%d\n", (mp_opt->addr.family == AF_INET6) ? "6" : "", mp_opt->addr.id, mp_opt->ahmac, mp_opt->echo, ntohs(mp_opt->addr.port)); break; case MPTCPOPT_RM_ADDR: if (opsize < TCPOLEN_MPTCP_RM_ADDR_BASE + 1 || opsize > TCPOLEN_MPTCP_RM_ADDR_BASE + MPTCP_RM_IDS_MAX) break; ptr++; mp_opt->suboptions |= OPTION_MPTCP_RM_ADDR; mp_opt->rm_list.nr = opsize - TCPOLEN_MPTCP_RM_ADDR_BASE; for (i = 0; i < mp_opt->rm_list.nr; i++) mp_opt->rm_list.ids[i] = *ptr++; pr_debug("RM_ADDR: rm_list_nr=%d\n", mp_opt->rm_list.nr); break; case MPTCPOPT_MP_PRIO: if (opsize != TCPOLEN_MPTCP_PRIO) break; mp_opt->suboptions |= OPTION_MPTCP_PRIO; mp_opt->backup = *ptr++ & MPTCP_PRIO_BKUP; pr_debug("MP_PRIO: prio=%d\n", mp_opt->backup); break; case MPTCPOPT_MP_FASTCLOSE: if (opsize != TCPOLEN_MPTCP_FASTCLOSE) break; ptr += 2; mp_opt->rcvr_key = get_unaligned_be64(ptr); ptr += 8; mp_opt->suboptions |= OPTION_MPTCP_FASTCLOSE; pr_debug("MP_FASTCLOSE: recv_key=%llu\n", mp_opt->rcvr_key); break; case MPTCPOPT_RST: if (opsize != TCPOLEN_MPTCP_RST) break; if (!(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_RST)) break; mp_opt->suboptions |= OPTION_MPTCP_RST; flags = *ptr++; mp_opt->reset_transient = flags & MPTCP_RST_TRANSIENT; mp_opt->reset_reason = *ptr; pr_debug("MP_RST: transient=%u reason=%u\n", mp_opt->reset_transient, mp_opt->reset_reason); break; case MPTCPOPT_MP_FAIL: if (opsize != TCPOLEN_MPTCP_FAIL) break; ptr += 2; mp_opt->suboptions |= OPTION_MPTCP_FAIL; mp_opt->fail_seq = get_unaligned_be64(ptr); pr_debug("MP_FAIL: data_seq=%llu\n", mp_opt->fail_seq); break; default: break; } } void mptcp_get_options(const struct sk_buff *skb, struct mptcp_options_received *mp_opt) { const struct tcphdr *th = tcp_hdr(skb); const unsigned char *ptr; int length; /* Ensure that casting the whole status to u32 is efficient and safe */ BUILD_BUG_ON(sizeof_field(struct mptcp_options_received, status) != sizeof(u32)); BUILD_BUG_ON(!IS_ALIGNED(offsetof(struct mptcp_options_received, status), sizeof(u32))); *(u32 *)&mp_opt->status = 0; length = (th->doff * 4) - sizeof(struct tcphdr); ptr = (const unsigned char *)(th + 1); while (length > 0) { int opcode = *ptr++; int opsize; switch (opcode) { case TCPOPT_EOL: return; case TCPOPT_NOP: /* Ref: RFC 793 section 3.1 */ length--; continue; default: 
if (length < 2) return; opsize = *ptr++; if (opsize < 2) /* "silly options" */ return; if (opsize > length) return; /* don't parse partial options */ if (opcode == TCPOPT_MPTCP) mptcp_parse_option(skb, ptr, opsize, mp_opt); ptr += opsize - 2; length -= opsize; } } } bool mptcp_syn_options(struct sock *sk, const struct sk_buff *skb, unsigned int *size, struct mptcp_out_options *opts) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); /* we will use snd_isn to detect first pkt [re]transmission * in mptcp_established_options_mp() */ subflow->snd_isn = TCP_SKB_CB(skb)->end_seq; if (subflow->request_mptcp) { if (unlikely(subflow_simultaneous_connect(sk))) { WARN_ON_ONCE(!mptcp_try_fallback(sk, MPTCP_MIB_SIMULTCONNFALLBACK)); /* Ensure mptcp_finish_connect() will not process the * MPC handshake. */ subflow->request_mptcp = 0; return false; } opts->suboptions = OPTION_MPTCP_MPC_SYN; opts->csum_reqd = mptcp_is_checksum_enabled(sock_net(sk)); opts->allow_join_id0 = mptcp_allow_join_id0(sock_net(sk)); *size = TCPOLEN_MPTCP_MPC_SYN; return true; } else if (subflow->request_join) { pr_debug("remote_token=%u, nonce=%u\n", subflow->remote_token, subflow->local_nonce); opts->suboptions = OPTION_MPTCP_MPJ_SYN; opts->join_id = subflow->local_id; opts->token = subflow->remote_token; opts->nonce = subflow->local_nonce; opts->backup = subflow->request_bkup; *size = TCPOLEN_MPTCP_MPJ_SYN; return true; } return false; } static void clear_3rdack_retransmission(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); sk_stop_timer(sk, &icsk->icsk_delack_timer); icsk->icsk_ack.ato = 0; icsk->icsk_ack.pending &= ~(ICSK_ACK_SCHED | ICSK_ACK_TIMER); } static bool mptcp_established_options_mp(struct sock *sk, struct sk_buff *skb, bool snd_data_fin_enable, unsigned int *size, struct mptcp_out_options *opts) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); struct mptcp_sock *msk = mptcp_sk(subflow->conn); struct mptcp_ext *mpext; unsigned int data_len; u8 len; /* When skb is not available, we better over-estimate the emitted * options len. A full DSS option (28 bytes) is longer than * TCPOLEN_MPTCP_MPC_ACK_DATA(22) or TCPOLEN_MPTCP_MPJ_ACK(24), so * tell the caller to defer the estimate to * mptcp_established_options_dss(), which will reserve enough space. */ if (!skb) return false; /* MPC/MPJ needed only on 3rd ack packet, DATA_FIN and TCP shutdown take precedence */ if (READ_ONCE(subflow->fully_established) || snd_data_fin_enable || subflow->snd_isn != TCP_SKB_CB(skb)->seq || sk->sk_state != TCP_ESTABLISHED) return false; if (subflow->mp_capable) { mpext = mptcp_get_ext(skb); data_len = mpext ? mpext->data_len : 0; /* we will check ops->data_len in mptcp_write_options() to * discriminate between TCPOLEN_MPTCP_MPC_ACK_DATA and * TCPOLEN_MPTCP_MPC_ACK */ opts->data_len = data_len; opts->suboptions = OPTION_MPTCP_MPC_ACK; opts->sndr_key = subflow->local_key; opts->rcvr_key = subflow->remote_key; opts->csum_reqd = READ_ONCE(msk->csum_enabled); opts->allow_join_id0 = mptcp_allow_join_id0(sock_net(sk)); /* Section 3.1. 
* The MP_CAPABLE option is carried on the SYN, SYN/ACK, and ACK * packets that start the first subflow of an MPTCP connection, * as well as the first packet that carries data */ if (data_len > 0) { len = TCPOLEN_MPTCP_MPC_ACK_DATA; if (opts->csum_reqd) { /* we need to propagate more info to csum the pseudo hdr */ opts->data_seq = mpext->data_seq; opts->subflow_seq = mpext->subflow_seq; opts->csum = mpext->csum; len += TCPOLEN_MPTCP_DSS_CHECKSUM; } *size = ALIGN(len, 4); } else { *size = TCPOLEN_MPTCP_MPC_ACK; } pr_debug("subflow=%p, local_key=%llu, remote_key=%llu map_len=%d\n", subflow, subflow->local_key, subflow->remote_key, data_len); return true; } else if (subflow->mp_join) { opts->suboptions = OPTION_MPTCP_MPJ_ACK; memcpy(opts->hmac, subflow->hmac, MPTCPOPT_HMAC_LEN); *size = TCPOLEN_MPTCP_MPJ_ACK; pr_debug("subflow=%p\n", subflow); /* we can use the full delegate action helper only from BH context * If we are in process context - sk is flushing the backlog at * socket lock release time - just set the appropriate flag, will * be handled by the release callback */ if (sock_owned_by_user(sk)) set_bit(MPTCP_DELEGATE_ACK, &subflow->delegated_status); else mptcp_subflow_delegate(subflow, MPTCP_DELEGATE_ACK); return true; } return false; } static void mptcp_write_data_fin(struct mptcp_subflow_context *subflow, struct sk_buff *skb, struct mptcp_ext *ext) { /* The write_seq value has already been incremented, so the actual * sequence number for the DATA_FIN is one less. */ u64 data_fin_tx_seq = READ_ONCE(mptcp_sk(subflow->conn)->write_seq) - 1; if (!ext->use_map || !skb->len) { /* RFC6824 requires a DSS mapping with specific values * if DATA_FIN is set but no data payload is mapped */ ext->data_fin = 1; ext->use_map = 1; ext->dsn64 = 1; ext->data_seq = data_fin_tx_seq; ext->subflow_seq = 0; ext->data_len = 1; } else if (ext->data_seq + ext->data_len == data_fin_tx_seq) { /* If there's an existing DSS mapping and it is the * final mapping, DATA_FIN consumes 1 additional byte of * mapping space. */ ext->data_fin = 1; ext->data_len++; } } static bool mptcp_established_options_dss(struct sock *sk, struct sk_buff *skb, bool snd_data_fin_enable, unsigned int *size, struct mptcp_out_options *opts) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); struct mptcp_sock *msk = mptcp_sk(subflow->conn); unsigned int dss_size = 0; struct mptcp_ext *mpext; unsigned int ack_size; bool ret = false; u64 ack_seq; opts->csum_reqd = READ_ONCE(msk->csum_enabled); mpext = skb ? 
mptcp_get_ext(skb) : NULL; if (!skb || (mpext && mpext->use_map) || snd_data_fin_enable) { unsigned int map_size = TCPOLEN_MPTCP_DSS_BASE + TCPOLEN_MPTCP_DSS_MAP64; if (mpext) { if (opts->csum_reqd) map_size += TCPOLEN_MPTCP_DSS_CHECKSUM; opts->ext_copy = *mpext; } dss_size = map_size; if (skb && snd_data_fin_enable) mptcp_write_data_fin(subflow, skb, &opts->ext_copy); opts->suboptions = OPTION_MPTCP_DSS; ret = true; } /* passive sockets msk will set the 'can_ack' after accept(), even * if the first subflow may have the already the remote key handy */ opts->ext_copy.use_ack = 0; if (!READ_ONCE(msk->can_ack)) { *size = ALIGN(dss_size, 4); return ret; } ack_seq = READ_ONCE(msk->ack_seq); if (READ_ONCE(msk->use_64bit_ack)) { ack_size = TCPOLEN_MPTCP_DSS_ACK64; opts->ext_copy.data_ack = ack_seq; opts->ext_copy.ack64 = 1; } else { ack_size = TCPOLEN_MPTCP_DSS_ACK32; opts->ext_copy.data_ack32 = (uint32_t)ack_seq; opts->ext_copy.ack64 = 0; } opts->ext_copy.use_ack = 1; opts->suboptions = OPTION_MPTCP_DSS; /* Add kind/length/subtype/flag overhead if mapping is not populated */ if (dss_size == 0) ack_size += TCPOLEN_MPTCP_DSS_BASE; dss_size += ack_size; *size = ALIGN(dss_size, 4); return true; } static u64 add_addr_generate_hmac(u64 key1, u64 key2, struct mptcp_addr_info *addr) { u16 port = ntohs(addr->port); u8 hmac[SHA256_DIGEST_SIZE]; u8 msg[19]; int i = 0; msg[i++] = addr->id; if (addr->family == AF_INET) { memcpy(&msg[i], &addr->addr.s_addr, 4); i += 4; } #if IS_ENABLED(CONFIG_MPTCP_IPV6) else if (addr->family == AF_INET6) { memcpy(&msg[i], &addr->addr6.s6_addr, 16); i += 16; } #endif msg[i++] = port >> 8; msg[i++] = port & 0xFF; mptcp_crypto_hmac_sha(key1, key2, msg, i, hmac); return get_unaligned_be64(&hmac[SHA256_DIGEST_SIZE - sizeof(u64)]); } static bool mptcp_established_options_add_addr(struct sock *sk, struct sk_buff *skb, unsigned int *size, unsigned int remaining, struct mptcp_out_options *opts) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); struct mptcp_sock *msk = mptcp_sk(subflow->conn); bool drop_other_suboptions = false; unsigned int opt_size = *size; struct mptcp_addr_info addr; bool echo; int len; /* add addr will strip the existing options, be sure to avoid breaking * MPC/MPJ handshakes */ if (!mptcp_pm_should_add_signal(msk) || (opts->suboptions & (OPTION_MPTCP_MPJ_ACK | OPTION_MPTCP_MPC_ACK)) || !mptcp_pm_add_addr_signal(msk, skb, opt_size, remaining, &addr, &echo, &drop_other_suboptions)) return false; /* * Later on, mptcp_write_options() will enforce mutually exclusion with * DSS, bail out if such option is set and we can't drop it. */ if (drop_other_suboptions) remaining += opt_size; else if (opts->suboptions & OPTION_MPTCP_DSS) return false; len = mptcp_add_addr_len(addr.family, echo, !!addr.port); if (remaining < len) return false; *size = len; if (drop_other_suboptions) { pr_debug("drop other suboptions\n"); opts->suboptions = 0; /* note that e.g. 
DSS could have written into the memory * aliased by ahmac, we must reset the field here * to avoid appending the hmac even for ADD_ADDR echo * options */ opts->ahmac = 0; *size -= opt_size; } opts->addr = addr; opts->suboptions |= OPTION_MPTCP_ADD_ADDR; if (!echo) { MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_ADDADDRTX); opts->ahmac = add_addr_generate_hmac(READ_ONCE(msk->local_key), READ_ONCE(msk->remote_key), &opts->addr); } else { MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_ECHOADDTX); } pr_debug("addr_id=%d, ahmac=%llu, echo=%d, port=%d\n", opts->addr.id, opts->ahmac, echo, ntohs(opts->addr.port)); return true; } static bool mptcp_established_options_rm_addr(struct sock *sk, unsigned int *size, unsigned int remaining, struct mptcp_out_options *opts) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); struct mptcp_sock *msk = mptcp_sk(subflow->conn); struct mptcp_rm_list rm_list; int i, len; if (!mptcp_pm_should_rm_signal(msk) || !(mptcp_pm_rm_addr_signal(msk, remaining, &rm_list))) return false; len = mptcp_rm_addr_len(&rm_list); if (len < 0) return false; if (remaining < len) return false; *size = len; opts->suboptions |= OPTION_MPTCP_RM_ADDR; opts->rm_list = rm_list; for (i = 0; i < opts->rm_list.nr; i++) pr_debug("rm_list_ids[%d]=%d\n", i, opts->rm_list.ids[i]); MPTCP_ADD_STATS(sock_net(sk), MPTCP_MIB_RMADDRTX, opts->rm_list.nr); return true; } static bool mptcp_established_options_mp_prio(struct sock *sk, unsigned int *size, unsigned int remaining, struct mptcp_out_options *opts) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); /* can't send MP_PRIO with MPC, as they share the same option space: * 'backup'. Also it makes no sense at all */ if (!subflow->send_mp_prio || (opts->suboptions & OPTIONS_MPTCP_MPC)) return false; /* account for the trailing 'nop' option */ if (remaining < TCPOLEN_MPTCP_PRIO_ALIGN) return false; *size = TCPOLEN_MPTCP_PRIO_ALIGN; opts->suboptions |= OPTION_MPTCP_PRIO; opts->backup = subflow->request_bkup; pr_debug("prio=%d\n", opts->backup); return true; } static noinline bool mptcp_established_options_rst(struct sock *sk, struct sk_buff *skb, unsigned int *size, unsigned int remaining, struct mptcp_out_options *opts) { const struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); if (remaining < TCPOLEN_MPTCP_RST) return false; *size = TCPOLEN_MPTCP_RST; opts->suboptions |= OPTION_MPTCP_RST; opts->reset_transient = subflow->reset_transient; opts->reset_reason = subflow->reset_reason; MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPRSTTX); return true; } static bool mptcp_established_options_fastclose(struct sock *sk, unsigned int *size, unsigned int remaining, struct mptcp_out_options *opts) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); struct mptcp_sock *msk = mptcp_sk(subflow->conn); if (likely(!subflow->send_fastclose)) return false; if (remaining < TCPOLEN_MPTCP_FASTCLOSE) return false; *size = TCPOLEN_MPTCP_FASTCLOSE; opts->suboptions |= OPTION_MPTCP_FASTCLOSE; opts->rcvr_key = READ_ONCE(msk->remote_key); pr_debug("FASTCLOSE key=%llu\n", opts->rcvr_key); MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPFASTCLOSETX); return true; } static bool mptcp_established_options_mp_fail(struct sock *sk, unsigned int *size, unsigned int remaining, struct mptcp_out_options *opts) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); if (likely(!subflow->send_mp_fail)) return false; if (remaining < TCPOLEN_MPTCP_FAIL) return false; *size = TCPOLEN_MPTCP_FAIL; opts->suboptions |= OPTION_MPTCP_FAIL; opts->fail_seq = 
subflow->map_seq; pr_debug("MP_FAIL fail_seq=%llu\n", opts->fail_seq); MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPFAILTX); return true; } bool mptcp_established_options(struct sock *sk, struct sk_buff *skb, unsigned int *size, unsigned int remaining, struct mptcp_out_options *opts) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); struct mptcp_sock *msk = mptcp_sk(subflow->conn); unsigned int opt_size = 0; bool snd_data_fin; bool ret = false; opts->suboptions = 0; /* Force later mptcp_write_options(), but do not use any actual * option space. */ if (unlikely(__mptcp_check_fallback(msk) && !mptcp_check_infinite_map(skb))) return true; if (unlikely(skb && TCP_SKB_CB(skb)->tcp_flags & TCPHDR_RST)) { if (mptcp_established_options_fastclose(sk, &opt_size, remaining, opts) || mptcp_established_options_mp_fail(sk, &opt_size, remaining, opts)) { *size += opt_size; remaining -= opt_size; } /* MP_RST can be used with MP_FASTCLOSE and MP_FAIL if there is room */ if (mptcp_established_options_rst(sk, skb, &opt_size, remaining, opts)) { *size += opt_size; remaining -= opt_size; } return true; } snd_data_fin = mptcp_data_fin_enabled(msk); if (mptcp_established_options_mp(sk, skb, snd_data_fin, &opt_size, opts)) ret = true; else if (mptcp_established_options_dss(sk, skb, snd_data_fin, &opt_size, opts)) { unsigned int mp_fail_size; ret = true; if (mptcp_established_options_mp_fail(sk, &mp_fail_size, remaining - opt_size, opts)) { *size += opt_size + mp_fail_size; remaining -= opt_size - mp_fail_size; return true; } } /* we reserved enough space for the above options, and exceeding the * TCP option space would be fatal */ if (WARN_ON_ONCE(opt_size > remaining)) return false; *size += opt_size; remaining -= opt_size; if (mptcp_established_options_add_addr(sk, skb, &opt_size, remaining, opts)) { *size += opt_size; remaining -= opt_size; ret = true; } else if (mptcp_established_options_rm_addr(sk, &opt_size, remaining, opts)) { *size += opt_size; remaining -= opt_size; ret = true; } if (mptcp_established_options_mp_prio(sk, &opt_size, remaining, opts)) { *size += opt_size; remaining -= opt_size; ret = true; } return ret; } bool mptcp_synack_options(const struct request_sock *req, unsigned int *size, struct mptcp_out_options *opts) { struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req); if (subflow_req->mp_capable) { opts->suboptions = OPTION_MPTCP_MPC_SYNACK; opts->sndr_key = subflow_req->local_key; opts->csum_reqd = subflow_req->csum_reqd; opts->allow_join_id0 = subflow_req->allow_join_id0; *size = TCPOLEN_MPTCP_MPC_SYNACK; pr_debug("subflow_req=%p, local_key=%llu\n", subflow_req, subflow_req->local_key); return true; } else if (subflow_req->mp_join) { opts->suboptions = OPTION_MPTCP_MPJ_SYNACK; opts->backup = subflow_req->request_bkup; opts->join_id = subflow_req->local_id; opts->thmac = subflow_req->thmac; opts->nonce = subflow_req->local_nonce; pr_debug("req=%p, bkup=%u, id=%u, thmac=%llu, nonce=%u\n", subflow_req, opts->backup, opts->join_id, opts->thmac, opts->nonce); *size = TCPOLEN_MPTCP_MPJ_SYNACK; return true; } return false; } static bool check_fully_established(struct mptcp_sock *msk, struct sock *ssk, struct mptcp_subflow_context *subflow, struct sk_buff *skb, struct mptcp_options_received *mp_opt) { /* here we can process OoO, in-window pkts, only in-sequence 4th ack * will make the subflow fully established */ if (likely(READ_ONCE(subflow->fully_established))) { /* on passive sockets, check for 3rd ack retransmission * note that msk is always set by 
subflow_syn_recv_sock() * for mp_join subflows */ if (TCP_SKB_CB(skb)->seq == subflow->ssn_offset + 1 && TCP_SKB_CB(skb)->end_seq == TCP_SKB_CB(skb)->seq && subflow->mp_join && (mp_opt->suboptions & OPTIONS_MPTCP_MPJ) && !subflow->request_join) tcp_send_ack(ssk); goto check_notify; } /* we must process OoO packets before the first subflow is fully * established. OoO packets are instead a protocol violation * for MP_JOIN subflows as the peer must not send any data * before receiving the forth ack - cfr. RFC 8684 section 3.2. */ if (TCP_SKB_CB(skb)->seq != subflow->ssn_offset + 1) { if (subflow->mp_join) goto reset; if (subflow->is_mptfo && mp_opt->suboptions & OPTION_MPTCP_MPC_ACK) goto set_fully_established; return subflow->mp_capable; } if (subflow->remote_key_valid && (((mp_opt->suboptions & OPTION_MPTCP_DSS) && mp_opt->use_ack) || ((mp_opt->suboptions & OPTION_MPTCP_ADD_ADDR) && (!mp_opt->echo || subflow->mp_join)))) { /* subflows are fully established as soon as we get any * additional ack, including ADD_ADDR. */ goto set_fully_established; } /* If the first established packet does not contain MP_CAPABLE + data * then fallback to TCP. Fallback scenarios requires a reset for * MP_JOIN subflows. */ if (!(mp_opt->suboptions & OPTIONS_MPTCP_MPC)) { if (subflow->mp_join) goto reset; subflow->mp_capable = 0; if (!mptcp_try_fallback(ssk, MPTCP_MIB_MPCAPABLEDATAFALLBACK)) { MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_FALLBACKFAILED); goto reset; } return false; } if (unlikely(!READ_ONCE(msk->pm.server_side))) pr_warn_once("bogus mpc option on established client sk"); set_fully_established: if (mp_opt->deny_join_id0) WRITE_ONCE(msk->pm.remote_deny_join_id0, true); mptcp_data_lock((struct sock *)msk); __mptcp_subflow_fully_established(msk, subflow, mp_opt); mptcp_data_unlock((struct sock *)msk); check_notify: /* if the subflow is not already linked into the conn_list, we can't * notify the PM: this subflow is still on the listener queue * and the PM possibly acquiring the subflow lock could race with * the listener close */ if (likely(subflow->pm_notified) || list_empty(&subflow->node)) return true; subflow->pm_notified = 1; if (subflow->mp_join) { clear_3rdack_retransmission(ssk); mptcp_pm_subflow_established(msk); } else { mptcp_pm_fully_established(msk, ssk); } return true; reset: mptcp_subflow_reset(ssk); return false; } u64 __mptcp_expand_seq(u64 old_seq, u64 cur_seq) { u32 old_seq32, cur_seq32; old_seq32 = (u32)old_seq; cur_seq32 = (u32)cur_seq; cur_seq = (old_seq & GENMASK_ULL(63, 32)) + cur_seq32; if (unlikely(cur_seq32 < old_seq32 && before(old_seq32, cur_seq32))) return cur_seq + (1LL << 32); /* reverse wrap could happen, too */ if (unlikely(cur_seq32 > old_seq32 && after(old_seq32, cur_seq32))) return cur_seq - (1LL << 32); return cur_seq; } static void __mptcp_snd_una_update(struct mptcp_sock *msk, u64 new_snd_una) { msk->bytes_acked += new_snd_una - msk->snd_una; WRITE_ONCE(msk->snd_una, new_snd_una); } static void rwin_update(struct mptcp_sock *msk, struct sock *ssk, struct sk_buff *skb) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); struct tcp_sock *tp = tcp_sk(ssk); u64 mptcp_rcv_wnd; /* Avoid touching extra cachelines if TCP is going to accept this * skb without filling the TCP-level window even with a possibly * outdated mptcp-level rwin. 
*/ if (!skb->len || skb->len < tcp_receive_window(tp)) return; mptcp_rcv_wnd = atomic64_read(&msk->rcv_wnd_sent); if (!after64(mptcp_rcv_wnd, subflow->rcv_wnd_sent)) return; /* Some other subflow grew the mptcp-level rwin since rcv_wup, * resync. */ tp->rcv_wnd += mptcp_rcv_wnd - subflow->rcv_wnd_sent; subflow->rcv_wnd_sent = mptcp_rcv_wnd; } static void ack_update_msk(struct mptcp_sock *msk, struct sock *ssk, struct mptcp_options_received *mp_opt) { u64 new_wnd_end, new_snd_una, snd_nxt = READ_ONCE(msk->snd_nxt); struct sock *sk = (struct sock *)msk; u64 old_snd_una; mptcp_data_lock(sk); /* avoid ack expansion on update conflict, to reduce the risk of * wrongly expanding to a future ack sequence number, which is way * more dangerous than missing an ack */ old_snd_una = msk->snd_una; new_snd_una = mptcp_expand_seq(old_snd_una, mp_opt->data_ack, mp_opt->ack64); /* ACK for data not even sent yet? Ignore.*/ if (unlikely(after64(new_snd_una, snd_nxt))) new_snd_una = old_snd_una; new_wnd_end = new_snd_una + tcp_sk(ssk)->snd_wnd; if (after64(new_wnd_end, msk->wnd_end)) WRITE_ONCE(msk->wnd_end, new_wnd_end); /* this assumes mptcp_incoming_options() is invoked after tcp_ack() */ if (after64(msk->wnd_end, snd_nxt)) __mptcp_check_push(sk, ssk); if (after64(new_snd_una, old_snd_una)) { __mptcp_snd_una_update(msk, new_snd_una); __mptcp_data_acked(sk); } msk->last_ack_recv = tcp_jiffies32; mptcp_data_unlock(sk); trace_ack_update_msk(mp_opt->data_ack, old_snd_una, new_snd_una, new_wnd_end, READ_ONCE(msk->wnd_end)); } bool mptcp_update_rcv_data_fin(struct mptcp_sock *msk, u64 data_fin_seq, bool use_64bit) { /* Skip if DATA_FIN was already received. * If updating simultaneously with the recvmsg loop, values * should match. If they mismatch, the peer is misbehaving and * we will prefer the most recent information. */ if (READ_ONCE(msk->rcv_data_fin)) return false; WRITE_ONCE(msk->rcv_data_fin_seq, mptcp_expand_seq(READ_ONCE(msk->ack_seq), data_fin_seq, use_64bit)); WRITE_ONCE(msk->rcv_data_fin, 1); return true; } static bool add_addr_hmac_valid(struct mptcp_sock *msk, struct mptcp_options_received *mp_opt) { u64 hmac = 0; if (mp_opt->echo) return true; hmac = add_addr_generate_hmac(READ_ONCE(msk->remote_key), READ_ONCE(msk->local_key), &mp_opt->addr); pr_debug("msk=%p, ahmac=%llu, mp_opt->ahmac=%llu\n", msk, hmac, mp_opt->ahmac); return hmac == mp_opt->ahmac; } /* Return false in case of error (or subflow has been reset), * else return true. */ bool mptcp_incoming_options(struct sock *sk, struct sk_buff *skb) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); struct mptcp_sock *msk = mptcp_sk(subflow->conn); struct mptcp_options_received mp_opt; struct mptcp_ext *mpext; if (__mptcp_check_fallback(msk)) { /* Keep it simple and unconditionally trigger send data cleanup and * pending queue spooling. We will need to acquire the data lock * for more accurate checks, and once the lock is acquired, such * helpers are cheap. */ mptcp_data_lock(subflow->conn); if (sk_stream_memory_free(sk)) __mptcp_check_push(subflow->conn, sk); /* on fallback we just need to ignore the msk-level snd_una, as * this is really plain TCP */ __mptcp_snd_una_update(msk, READ_ONCE(msk->snd_nxt)); __mptcp_data_acked(subflow->conn); mptcp_data_unlock(subflow->conn); return true; } mptcp_get_options(skb, &mp_opt); /* The subflow can be in close state only if check_fully_established() * just sent a reset. If so, tell the caller to ignore the current packet. 
*/ if (!check_fully_established(msk, sk, subflow, skb, &mp_opt)) return sk->sk_state != TCP_CLOSE; if (unlikely(mp_opt.suboptions != OPTION_MPTCP_DSS)) { if ((mp_opt.suboptions & OPTION_MPTCP_FASTCLOSE) && READ_ONCE(msk->local_key) == mp_opt.rcvr_key) { WRITE_ONCE(msk->rcv_fastclose, true); mptcp_schedule_work((struct sock *)msk); MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPFASTCLOSERX); } if ((mp_opt.suboptions & OPTION_MPTCP_ADD_ADDR) && add_addr_hmac_valid(msk, &mp_opt)) { if (!mp_opt.echo) { mptcp_pm_add_addr_received(sk, &mp_opt.addr); MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_ADDADDR); } else { mptcp_pm_add_addr_echoed(msk, &mp_opt.addr); mptcp_pm_del_add_timer(msk, &mp_opt.addr, true); MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_ECHOADD); } if (mp_opt.addr.port) MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_PORTADD); } if (mp_opt.suboptions & OPTION_MPTCP_RM_ADDR) mptcp_pm_rm_addr_received(msk, &mp_opt.rm_list); if (mp_opt.suboptions & OPTION_MPTCP_PRIO) { mptcp_pm_mp_prio_received(sk, mp_opt.backup); MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPPRIORX); } if (mp_opt.suboptions & OPTION_MPTCP_FAIL) { mptcp_pm_mp_fail_received(sk, mp_opt.fail_seq); MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPFAILRX); } if (mp_opt.suboptions & OPTION_MPTCP_RST) { subflow->reset_seen = 1; subflow->reset_reason = mp_opt.reset_reason; subflow->reset_transient = mp_opt.reset_transient; MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPRSTRX); } if (!(mp_opt.suboptions & OPTION_MPTCP_DSS)) return true; } /* we can't wait for recvmsg() to update the ack_seq, otherwise * monodirectional flows will stuck */ if (mp_opt.use_ack) ack_update_msk(msk, sk, &mp_opt); rwin_update(msk, sk, skb); /* Zero-data-length packets are dropped by the caller and not * propagated to the MPTCP layer, so the skb extension does not * need to be allocated or populated. DATA_FIN information, if * present, needs to be updated here before the skb is freed. 
*/ if (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq) { if (mp_opt.data_fin && mp_opt.data_len == 1 && mptcp_update_rcv_data_fin(msk, mp_opt.data_seq, mp_opt.dsn64)) mptcp_schedule_work((struct sock *)msk); return true; } mpext = skb_ext_add(skb, SKB_EXT_MPTCP); if (!mpext) return false; memset(mpext, 0, sizeof(*mpext)); if (likely(mp_opt.use_map)) { if (mp_opt.mpc_map) { /* this is an MP_CAPABLE carrying MPTCP data * we know this map the first chunk of data */ mptcp_crypto_key_sha(subflow->remote_key, NULL, &mpext->data_seq); mpext->data_seq++; mpext->subflow_seq = 1; mpext->dsn64 = 1; mpext->mpc_map = 1; mpext->data_fin = 0; } else { mpext->data_seq = mp_opt.data_seq; mpext->subflow_seq = mp_opt.subflow_seq; mpext->dsn64 = mp_opt.dsn64; mpext->data_fin = mp_opt.data_fin; } mpext->data_len = mp_opt.data_len; mpext->use_map = 1; mpext->csum_reqd = !!(mp_opt.suboptions & OPTION_MPTCP_CSUMREQD); if (mpext->csum_reqd) mpext->csum = mp_opt.csum; } return true; } static void mptcp_set_rwin(struct tcp_sock *tp, struct tcphdr *th) { const struct sock *ssk = (const struct sock *)tp; struct mptcp_subflow_context *subflow; u64 ack_seq, rcv_wnd_old, rcv_wnd_new; struct mptcp_sock *msk; u32 new_win; u64 win; subflow = mptcp_subflow_ctx(ssk); msk = mptcp_sk(subflow->conn); ack_seq = READ_ONCE(msk->ack_seq); rcv_wnd_new = ack_seq + tp->rcv_wnd; rcv_wnd_old = atomic64_read(&msk->rcv_wnd_sent); if (after64(rcv_wnd_new, rcv_wnd_old)) { u64 rcv_wnd; for (;;) { rcv_wnd = atomic64_cmpxchg(&msk->rcv_wnd_sent, rcv_wnd_old, rcv_wnd_new); if (rcv_wnd == rcv_wnd_old) break; rcv_wnd_old = rcv_wnd; if (before64(rcv_wnd_new, rcv_wnd_old)) { MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_RCVWNDCONFLICTUPDATE); goto raise_win; } MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_RCVWNDCONFLICT); } goto update_wspace; } if (rcv_wnd_new != rcv_wnd_old) { raise_win: /* The msk-level rcv wnd is after the tcp level one, * sync the latter. */ rcv_wnd_new = rcv_wnd_old; win = rcv_wnd_old - ack_seq; tp->rcv_wnd = min_t(u64, win, U32_MAX); new_win = tp->rcv_wnd; /* Make sure we do not exceed the maximum possible * scaled window. */ if (unlikely(th->syn)) new_win = min(new_win, 65535U) << tp->rx_opt.rcv_wscale; if (!tp->rx_opt.rcv_wscale && READ_ONCE(sock_net(ssk)->ipv4.sysctl_tcp_workaround_signed_windows)) new_win = min(new_win, MAX_TCP_WINDOW); else new_win = min(new_win, (65535U << tp->rx_opt.rcv_wscale)); /* RFC1323 scaling applied */ new_win >>= tp->rx_opt.rcv_wscale; th->window = htons(new_win); MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_RCVWNDSHARED); } update_wspace: WRITE_ONCE(msk->old_wspace, tp->rcv_wnd); subflow->rcv_wnd_sent = rcv_wnd_new; } static void mptcp_track_rwin(struct tcp_sock *tp) { const struct sock *ssk = (const struct sock *)tp; struct mptcp_subflow_context *subflow; struct mptcp_sock *msk; if (!ssk) return; subflow = mptcp_subflow_ctx(ssk); msk = mptcp_sk(subflow->conn); WRITE_ONCE(msk->old_wspace, tp->rcv_wnd); } __sum16 __mptcp_make_csum(u64 data_seq, u32 subflow_seq, u16 data_len, __wsum sum) { struct csum_pseudo_header header; __wsum csum; /* cfr RFC 8684 3.3.1.: * the data sequence number used in the pseudo-header is * always the 64-bit value, irrespective of what length is used in the * DSS option itself. 
*/ header.data_seq = cpu_to_be64(data_seq); header.subflow_seq = htonl(subflow_seq); header.data_len = htons(data_len); header.csum = 0; csum = csum_partial(&header, sizeof(header), sum); return csum_fold(csum); } static __sum16 mptcp_make_csum(const struct mptcp_ext *mpext) { return __mptcp_make_csum(mpext->data_seq, mpext->subflow_seq, mpext->data_len, ~csum_unfold(mpext->csum)); } static void put_len_csum(u16 len, __sum16 csum, void *data) { __sum16 *sumptr = data + 2; __be16 *ptr = data; put_unaligned_be16(len, ptr); put_unaligned(csum, sumptr); } void mptcp_write_options(struct tcphdr *th, __be32 *ptr, struct tcp_sock *tp, struct mptcp_out_options *opts) { const struct sock *ssk = (const struct sock *)tp; struct mptcp_subflow_context *subflow; /* Which options can be used together? * * X: mutually exclusive * O: often used together * C: can be used together in some cases * P: could be used together but we prefer not to (optimisations) * * Opt: | MPC | MPJ | DSS | ADD | RM | PRIO | FAIL | FC | * ------|------|------|------|------|------|------|------|------| * MPC |------|------|------|------|------|------|------|------| * MPJ | X |------|------|------|------|------|------|------| * DSS | X | X |------|------|------|------|------|------| * ADD | X | X | P |------|------|------|------|------| * RM | C | C | C | P |------|------|------|------| * PRIO | X | C | C | C | C |------|------|------| * FAIL | X | X | C | X | X | X |------|------| * FC | X | X | X | X | X | X | X |------| * RST | X | X | X | X | X | X | O | O | * ------|------|------|------|------|------|------|------|------| * * The same applies in mptcp_established_options() function. */ if (likely(OPTION_MPTCP_DSS & opts->suboptions)) { struct mptcp_ext *mpext = &opts->ext_copy; u8 len = TCPOLEN_MPTCP_DSS_BASE; u8 flags = 0; if (mpext->use_ack) { flags = MPTCP_DSS_HAS_ACK; if (mpext->ack64) { len += TCPOLEN_MPTCP_DSS_ACK64; flags |= MPTCP_DSS_ACK64; } else { len += TCPOLEN_MPTCP_DSS_ACK32; } } if (mpext->use_map) { len += TCPOLEN_MPTCP_DSS_MAP64; /* Use only 64-bit mapping flags for now, add * support for optional 32-bit mappings later. */ flags |= MPTCP_DSS_HAS_MAP | MPTCP_DSS_DSN64; if (mpext->data_fin) flags |= MPTCP_DSS_DATA_FIN; if (opts->csum_reqd) len += TCPOLEN_MPTCP_DSS_CHECKSUM; } *ptr++ = mptcp_option(MPTCPOPT_DSS, len, 0, flags); if (mpext->use_ack) { if (mpext->ack64) { put_unaligned_be64(mpext->data_ack, ptr); ptr += 2; } else { put_unaligned_be32(mpext->data_ack32, ptr); ptr += 1; } } if (mpext->use_map) { put_unaligned_be64(mpext->data_seq, ptr); ptr += 2; put_unaligned_be32(mpext->subflow_seq, ptr); ptr += 1; if (opts->csum_reqd) { /* data_len == 0 is reserved for the infinite mapping, * the checksum will also be set to 0. */ put_len_csum(mpext->data_len, (mpext->data_len ? 
mptcp_make_csum(mpext) : 0), ptr); } else { put_unaligned_be32(mpext->data_len << 16 | TCPOPT_NOP << 8 | TCPOPT_NOP, ptr); } ptr += 1; } /* We might need to add MP_FAIL options in rare cases */ if (unlikely(OPTION_MPTCP_FAIL & opts->suboptions)) goto mp_fail; } else if (OPTIONS_MPTCP_MPC & opts->suboptions) { u8 len, flag = MPTCP_CAP_HMAC_SHA256; if (OPTION_MPTCP_MPC_SYN & opts->suboptions) { len = TCPOLEN_MPTCP_MPC_SYN; } else if (OPTION_MPTCP_MPC_SYNACK & opts->suboptions) { len = TCPOLEN_MPTCP_MPC_SYNACK; } else if (opts->data_len) { len = TCPOLEN_MPTCP_MPC_ACK_DATA; if (opts->csum_reqd) len += TCPOLEN_MPTCP_DSS_CHECKSUM; } else { len = TCPOLEN_MPTCP_MPC_ACK; } if (opts->csum_reqd) flag |= MPTCP_CAP_CHECKSUM_REQD; if (!opts->allow_join_id0) flag |= MPTCP_CAP_DENY_JOIN_ID0; *ptr++ = mptcp_option(MPTCPOPT_MP_CAPABLE, len, MPTCP_SUPPORTED_VERSION, flag); if (!((OPTION_MPTCP_MPC_SYNACK | OPTION_MPTCP_MPC_ACK) & opts->suboptions)) goto mp_capable_done; put_unaligned_be64(opts->sndr_key, ptr); ptr += 2; if (!((OPTION_MPTCP_MPC_ACK) & opts->suboptions)) goto mp_capable_done; put_unaligned_be64(opts->rcvr_key, ptr); ptr += 2; if (!opts->data_len) goto mp_capable_done; if (opts->csum_reqd) { put_len_csum(opts->data_len, __mptcp_make_csum(opts->data_seq, opts->subflow_seq, opts->data_len, ~csum_unfold(opts->csum)), ptr); } else { put_unaligned_be32(opts->data_len << 16 | TCPOPT_NOP << 8 | TCPOPT_NOP, ptr); } ptr += 1; /* MPC is additionally mutually exclusive with MP_PRIO */ goto mp_capable_done; } else if (OPTIONS_MPTCP_MPJ & opts->suboptions) { if (OPTION_MPTCP_MPJ_SYN & opts->suboptions) { *ptr++ = mptcp_option(MPTCPOPT_MP_JOIN, TCPOLEN_MPTCP_MPJ_SYN, opts->backup, opts->join_id); put_unaligned_be32(opts->token, ptr); ptr += 1; put_unaligned_be32(opts->nonce, ptr); ptr += 1; } else if (OPTION_MPTCP_MPJ_SYNACK & opts->suboptions) { *ptr++ = mptcp_option(MPTCPOPT_MP_JOIN, TCPOLEN_MPTCP_MPJ_SYNACK, opts->backup, opts->join_id); put_unaligned_be64(opts->thmac, ptr); ptr += 2; put_unaligned_be32(opts->nonce, ptr); ptr += 1; } else { *ptr++ = mptcp_option(MPTCPOPT_MP_JOIN, TCPOLEN_MPTCP_MPJ_ACK, 0, 0); memcpy(ptr, opts->hmac, MPTCPOPT_HMAC_LEN); ptr += 5; } } else if (OPTION_MPTCP_ADD_ADDR & opts->suboptions) { u8 len = TCPOLEN_MPTCP_ADD_ADDR_BASE; u8 echo = MPTCP_ADDR_ECHO; #if IS_ENABLED(CONFIG_MPTCP_IPV6) if (opts->addr.family == AF_INET6) len = TCPOLEN_MPTCP_ADD_ADDR6_BASE; #endif if (opts->addr.port) len += TCPOLEN_MPTCP_PORT_LEN; if (opts->ahmac) { len += sizeof(opts->ahmac); echo = 0; } *ptr++ = mptcp_option(MPTCPOPT_ADD_ADDR, len, echo, opts->addr.id); if (opts->addr.family == AF_INET) { memcpy((u8 *)ptr, (u8 *)&opts->addr.addr.s_addr, 4); ptr += 1; } #if IS_ENABLED(CONFIG_MPTCP_IPV6) else if (opts->addr.family == AF_INET6) { memcpy((u8 *)ptr, opts->addr.addr6.s6_addr, 16); ptr += 4; } #endif if (!opts->addr.port) { if (opts->ahmac) { put_unaligned_be64(opts->ahmac, ptr); ptr += 2; } } else { u16 port = ntohs(opts->addr.port); if (opts->ahmac) { u8 *bptr = (u8 *)ptr; put_unaligned_be16(port, bptr); bptr += 2; put_unaligned_be64(opts->ahmac, bptr); bptr += 8; put_unaligned_be16(TCPOPT_NOP << 8 | TCPOPT_NOP, bptr); ptr += 3; } else { put_unaligned_be32(port << 16 | TCPOPT_NOP << 8 | TCPOPT_NOP, ptr); ptr += 1; } } } else if (unlikely(OPTION_MPTCP_FASTCLOSE & opts->suboptions)) { /* FASTCLOSE is mutually exclusive with others except RST */ *ptr++ = mptcp_option(MPTCPOPT_MP_FASTCLOSE, TCPOLEN_MPTCP_FASTCLOSE, 0, 0); put_unaligned_be64(opts->rcvr_key, ptr); ptr += 2; if (OPTION_MPTCP_RST & 
opts->suboptions) goto mp_rst; return; } else if (unlikely(OPTION_MPTCP_FAIL & opts->suboptions)) { mp_fail: /* MP_FAIL is mutually exclusive with others except RST */ subflow = mptcp_subflow_ctx(ssk); subflow->send_mp_fail = 0; *ptr++ = mptcp_option(MPTCPOPT_MP_FAIL, TCPOLEN_MPTCP_FAIL, 0, 0); put_unaligned_be64(opts->fail_seq, ptr); ptr += 2; if (OPTION_MPTCP_RST & opts->suboptions) goto mp_rst; return; } else if (unlikely(OPTION_MPTCP_RST & opts->suboptions)) { mp_rst: *ptr++ = mptcp_option(MPTCPOPT_RST, TCPOLEN_MPTCP_RST, opts->reset_transient, opts->reset_reason); return; } else if (unlikely(!opts->suboptions)) { /* Fallback to TCP */ mptcp_track_rwin(tp); return; } if (OPTION_MPTCP_PRIO & opts->suboptions) { subflow = mptcp_subflow_ctx(ssk); subflow->send_mp_prio = 0; *ptr++ = mptcp_option(MPTCPOPT_MP_PRIO, TCPOLEN_MPTCP_PRIO, opts->backup, TCPOPT_NOP); MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_MPPRIOTX); } mp_capable_done: if (OPTION_MPTCP_RM_ADDR & opts->suboptions) { u8 i = 1; *ptr++ = mptcp_option(MPTCPOPT_RM_ADDR, TCPOLEN_MPTCP_RM_ADDR_BASE + opts->rm_list.nr, 0, opts->rm_list.ids[0]); while (i < opts->rm_list.nr) { u8 id1, id2, id3, id4; id1 = opts->rm_list.ids[i]; id2 = i + 1 < opts->rm_list.nr ? opts->rm_list.ids[i + 1] : TCPOPT_NOP; id3 = i + 2 < opts->rm_list.nr ? opts->rm_list.ids[i + 2] : TCPOPT_NOP; id4 = i + 3 < opts->rm_list.nr ? opts->rm_list.ids[i + 3] : TCPOPT_NOP; put_unaligned_be32(id1 << 24 | id2 << 16 | id3 << 8 | id4, ptr); ptr += 1; i += 4; } } if (tp) mptcp_set_rwin(tp, th); } __be32 mptcp_get_reset_option(const struct sk_buff *skb) { const struct mptcp_ext *ext = mptcp_get_ext(skb); u8 flags, reason; if (ext) { flags = ext->reset_transient; reason = ext->reset_reason; return mptcp_option(MPTCPOPT_RST, TCPOLEN_MPTCP_RST, flags, reason); } return htonl(0u); } EXPORT_SYMBOL_GPL(mptcp_get_reset_option);
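The options.c dump above includes the sequence-number helper __mptcp_expand_seq(), which widens a 32-bit value carried in a truncated DSS field back to 64 bits by reusing the upper half of the last known in-sequence value and correcting by one 2^32 step when the lower half wrapped in either direction. The following is a minimal self-contained sketch of that arithmetic, not kernel code: the before()/after() helpers are replaced by signed 32-bit difference checks and the test values are hypothetical.

#include <stdint.h>
#include <stdio.h>

/* Standalone restatement of the __mptcp_expand_seq() expansion:
 * keep the high 32 bits of the last known 64-bit sequence number and
 * add or subtract 2^32 if the truncated value wrapped relative to it.
 */
static uint64_t expand_seq(uint64_t old_seq, uint32_t cur_seq32)
{
	uint32_t old_seq32 = (uint32_t)old_seq;
	uint64_t cur_seq = (old_seq & 0xffffffff00000000ULL) + cur_seq32;

	/* forward wrap: numerically smaller, but logically ahead of old_seq32 */
	if (cur_seq32 < old_seq32 && (int32_t)(cur_seq32 - old_seq32) > 0)
		return cur_seq + (1ULL << 32);

	/* reverse wrap: numerically larger, but logically behind old_seq32 */
	if (cur_seq32 > old_seq32 && (int32_t)(old_seq32 - cur_seq32) > 0)
		return cur_seq - (1ULL << 32);

	return cur_seq;
}

int main(void)
{
	uint64_t old = 0x00000001fffffff0ULL;	/* just below a 2^32 boundary */

	/* wraps forward: expands to 0x0000000200000010 */
	printf("%016llx\n", (unsigned long long)expand_seq(old, 0x00000010));
	/* stays in the same window: expands to 0x00000001ffffff00 */
	printf("%016llx\n", (unsigned long long)expand_seq(old, 0xffffff00));
	return 0;
}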
1 1 1 1 1 1 1 1 4 4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (c) 2015 HGST, a Western Digital Company. 
*/ #include <linux/err.h> #include <linux/slab.h> #include <rdma/ib_verbs.h> #include "core_priv.h" #include <trace/events/rdma_core.h> /* Max size for shared CQ, may require tuning */ #define IB_MAX_SHARED_CQ_SZ 4096U /* # of WCs to poll for with a single call to ib_poll_cq */ #define IB_POLL_BATCH 16 #define IB_POLL_BATCH_DIRECT 8 /* # of WCs to iterate over before yielding */ #define IB_POLL_BUDGET_IRQ 256 #define IB_POLL_BUDGET_WORKQUEUE 65536 #define IB_POLL_FLAGS \ (IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS) static const struct dim_cq_moder rdma_dim_prof[RDMA_DIM_PARAMS_NUM_PROFILES] = { {1, 0, 1, 0}, {1, 0, 4, 0}, {2, 0, 4, 0}, {2, 0, 8, 0}, {4, 0, 8, 0}, {16, 0, 8, 0}, {16, 0, 16, 0}, {32, 0, 16, 0}, {32, 0, 32, 0}, }; static void ib_cq_rdma_dim_work(struct work_struct *w) { struct dim *dim = container_of(w, struct dim, work); struct ib_cq *cq = dim->priv; u16 usec = rdma_dim_prof[dim->profile_ix].usec; u16 comps = rdma_dim_prof[dim->profile_ix].comps; dim->state = DIM_START_MEASURE; trace_cq_modify(cq, comps, usec); cq->device->ops.modify_cq(cq, comps, usec); } static void rdma_dim_init(struct ib_cq *cq) { struct dim *dim; if (!cq->device->ops.modify_cq || !cq->device->use_cq_dim || cq->poll_ctx == IB_POLL_DIRECT) return; dim = kzalloc(sizeof(struct dim), GFP_KERNEL); if (!dim) return; dim->state = DIM_START_MEASURE; dim->tune_state = DIM_GOING_RIGHT; dim->profile_ix = RDMA_DIM_START_PROFILE; dim->priv = cq; cq->dim = dim; INIT_WORK(&dim->work, ib_cq_rdma_dim_work); } static void rdma_dim_destroy(struct ib_cq *cq) { if (!cq->dim) return; cancel_work_sync(&cq->dim->work); kfree(cq->dim); } static int __poll_cq(struct ib_cq *cq, int num_entries, struct ib_wc *wc) { int rc; rc = ib_poll_cq(cq, num_entries, wc); trace_cq_poll(cq, num_entries, rc); return rc; } static int __ib_process_cq(struct ib_cq *cq, int budget, struct ib_wc *wcs, int batch) { int i, n, completed = 0; trace_cq_process(cq); /* * budget might be (-1) if the caller does not * want to bound this call, thus we need unsigned * minimum here. */ while ((n = __poll_cq(cq, min_t(u32, batch, budget - completed), wcs)) > 0) { for (i = 0; i < n; i++) { struct ib_wc *wc = &wcs[i]; if (wc->wr_cqe) wc->wr_cqe->done(cq, wc); else WARN_ON_ONCE(wc->status == IB_WC_SUCCESS); } completed += n; if (n != batch || (budget != -1 && completed >= budget)) break; } return completed; } /** * ib_process_cq_direct - process a CQ in caller context * @cq: CQ to process * @budget: number of CQEs to poll for * * This function is used to process all outstanding CQ entries. * It does not offload CQ processing to a different context and does * not ask for completion interrupts from the HCA. * Using direct processing on CQ with non IB_POLL_DIRECT type may trigger * concurrent processing. * * Note: do not pass -1 as %budget unless it is guaranteed that the number * of completions that will be processed is small. 
*/ int ib_process_cq_direct(struct ib_cq *cq, int budget) { struct ib_wc wcs[IB_POLL_BATCH_DIRECT]; return __ib_process_cq(cq, budget, wcs, IB_POLL_BATCH_DIRECT); } EXPORT_SYMBOL(ib_process_cq_direct); static void ib_cq_completion_direct(struct ib_cq *cq, void *private) { WARN_ONCE(1, "got unsolicited completion for CQ 0x%p\n", cq); } static int ib_poll_handler(struct irq_poll *iop, int budget) { struct ib_cq *cq = container_of(iop, struct ib_cq, iop); struct dim *dim = cq->dim; int completed; completed = __ib_process_cq(cq, budget, cq->wc, IB_POLL_BATCH); if (completed < budget) { irq_poll_complete(&cq->iop); if (ib_req_notify_cq(cq, IB_POLL_FLAGS) > 0) { trace_cq_reschedule(cq); irq_poll_sched(&cq->iop); } } if (dim) rdma_dim(dim, completed); return completed; } static void ib_cq_completion_softirq(struct ib_cq *cq, void *private) { trace_cq_schedule(cq); irq_poll_sched(&cq->iop); } static void ib_cq_poll_work(struct work_struct *work) { struct ib_cq *cq = container_of(work, struct ib_cq, work); int completed; completed = __ib_process_cq(cq, IB_POLL_BUDGET_WORKQUEUE, cq->wc, IB_POLL_BATCH); if (completed >= IB_POLL_BUDGET_WORKQUEUE || ib_req_notify_cq(cq, IB_POLL_FLAGS) > 0) queue_work(cq->comp_wq, &cq->work); else if (cq->dim) rdma_dim(cq->dim, completed); } static void ib_cq_completion_workqueue(struct ib_cq *cq, void *private) { trace_cq_schedule(cq); queue_work(cq->comp_wq, &cq->work); } /** * __ib_alloc_cq - allocate a completion queue * @dev: device to allocate the CQ for * @private: driver private data, accessible from cq->cq_context * @nr_cqe: number of CQEs to allocate * @comp_vector: HCA completion vectors for this CQ * @poll_ctx: context to poll the CQ from. * @caller: module owner name. * * This is the proper interface to allocate a CQ for in-kernel users. A * CQ allocated with this interface will automatically be polled from the * specified context. The ULP must use wr->wr_cqe instead of wr->wr_id * to use this CQ abstraction. */ struct ib_cq *__ib_alloc_cq(struct ib_device *dev, void *private, int nr_cqe, int comp_vector, enum ib_poll_context poll_ctx, const char *caller) { struct ib_cq_init_attr cq_attr = { .cqe = nr_cqe, .comp_vector = comp_vector, }; struct ib_cq *cq; int ret = -ENOMEM; cq = rdma_zalloc_drv_obj(dev, ib_cq); if (!cq) return ERR_PTR(ret); cq->device = dev; cq->cq_context = private; cq->poll_ctx = poll_ctx; atomic_set(&cq->usecnt, 0); cq->comp_vector = comp_vector; cq->wc = kmalloc_array(IB_POLL_BATCH, sizeof(*cq->wc), GFP_KERNEL); if (!cq->wc) goto out_free_cq; rdma_restrack_new(&cq->res, RDMA_RESTRACK_CQ); rdma_restrack_set_name(&cq->res, caller); ret = dev->ops.create_cq(cq, &cq_attr, NULL); if (ret) goto out_free_wc; rdma_dim_init(cq); switch (cq->poll_ctx) { case IB_POLL_DIRECT: cq->comp_handler = ib_cq_completion_direct; break; case IB_POLL_SOFTIRQ: cq->comp_handler = ib_cq_completion_softirq; irq_poll_init(&cq->iop, IB_POLL_BUDGET_IRQ, ib_poll_handler); ib_req_notify_cq(cq, IB_CQ_NEXT_COMP); break; case IB_POLL_WORKQUEUE: case IB_POLL_UNBOUND_WORKQUEUE: cq->comp_handler = ib_cq_completion_workqueue; INIT_WORK(&cq->work, ib_cq_poll_work); ib_req_notify_cq(cq, IB_CQ_NEXT_COMP); cq->comp_wq = (cq->poll_ctx == IB_POLL_WORKQUEUE) ? 
ib_comp_wq : ib_comp_unbound_wq; break; default: ret = -EINVAL; goto out_destroy_cq; } rdma_restrack_add(&cq->res); trace_cq_alloc(cq, nr_cqe, comp_vector, poll_ctx); return cq; out_destroy_cq: rdma_dim_destroy(cq); cq->device->ops.destroy_cq(cq, NULL); out_free_wc: rdma_restrack_put(&cq->res); kfree(cq->wc); out_free_cq: kfree(cq); trace_cq_alloc_error(nr_cqe, comp_vector, poll_ctx, ret); return ERR_PTR(ret); } EXPORT_SYMBOL(__ib_alloc_cq); /** * __ib_alloc_cq_any - allocate a completion queue * @dev: device to allocate the CQ for * @private: driver private data, accessible from cq->cq_context * @nr_cqe: number of CQEs to allocate * @poll_ctx: context to poll the CQ from * @caller: module owner name * * Attempt to spread ULP Completion Queues over each device's interrupt * vectors. A simple best-effort mechanism is used. */ struct ib_cq *__ib_alloc_cq_any(struct ib_device *dev, void *private, int nr_cqe, enum ib_poll_context poll_ctx, const char *caller) { static atomic_t counter; int comp_vector = 0; if (dev->num_comp_vectors > 1) comp_vector = atomic_inc_return(&counter) % min_t(int, dev->num_comp_vectors, num_online_cpus()); return __ib_alloc_cq(dev, private, nr_cqe, comp_vector, poll_ctx, caller); } EXPORT_SYMBOL(__ib_alloc_cq_any); /** * ib_free_cq - free a completion queue * @cq: completion queue to free. */ void ib_free_cq(struct ib_cq *cq) { int ret = 0; if (WARN_ON_ONCE(atomic_read(&cq->usecnt))) return; if (WARN_ON_ONCE(cq->cqe_used)) return; if (cq->device->ops.pre_destroy_cq) { ret = cq->device->ops.pre_destroy_cq(cq); WARN_ONCE(ret, "Disable of kernel CQ shouldn't fail"); } switch (cq->poll_ctx) { case IB_POLL_DIRECT: break; case IB_POLL_SOFTIRQ: irq_poll_disable(&cq->iop); break; case IB_POLL_WORKQUEUE: case IB_POLL_UNBOUND_WORKQUEUE: cancel_work_sync(&cq->work); break; default: WARN_ON_ONCE(1); } rdma_dim_destroy(cq); trace_cq_free(cq); if (cq->device->ops.post_destroy_cq) cq->device->ops.post_destroy_cq(cq); else ret = cq->device->ops.destroy_cq(cq, NULL); WARN_ONCE(ret, "Destroy of kernel CQ shouldn't fail"); rdma_restrack_del(&cq->res); kfree(cq->wc); kfree(cq); } EXPORT_SYMBOL(ib_free_cq); void ib_cq_pool_cleanup(struct ib_device *dev) { struct ib_cq *cq, *n; unsigned int i; for (i = 0; i < ARRAY_SIZE(dev->cq_pools); i++) { list_for_each_entry_safe(cq, n, &dev->cq_pools[i], pool_entry) { WARN_ON(cq->cqe_used); list_del(&cq->pool_entry); cq->shared = false; ib_free_cq(cq); } } } static int ib_alloc_cqs(struct ib_device *dev, unsigned int nr_cqes, enum ib_poll_context poll_ctx) { LIST_HEAD(tmp_list); unsigned int nr_cqs, i; struct ib_cq *cq, *n; int ret; if (poll_ctx > IB_POLL_LAST_POOL_TYPE) { WARN_ON_ONCE(poll_ctx > IB_POLL_LAST_POOL_TYPE); return -EINVAL; } /* * Allocate at least as many CQEs as requested, and otherwise * a reasonable batch size so that we can share CQs between * multiple users instead of allocating a larger number of CQs. 
*/ nr_cqes = min_t(unsigned int, dev->attrs.max_cqe, max(nr_cqes, IB_MAX_SHARED_CQ_SZ)); nr_cqs = min_t(unsigned int, dev->num_comp_vectors, num_online_cpus()); for (i = 0; i < nr_cqs; i++) { cq = ib_alloc_cq(dev, NULL, nr_cqes, i, poll_ctx); if (IS_ERR(cq)) { ret = PTR_ERR(cq); goto out_free_cqs; } cq->shared = true; list_add_tail(&cq->pool_entry, &tmp_list); } spin_lock_irq(&dev->cq_pools_lock); list_splice(&tmp_list, &dev->cq_pools[poll_ctx]); spin_unlock_irq(&dev->cq_pools_lock); return 0; out_free_cqs: list_for_each_entry_safe(cq, n, &tmp_list, pool_entry) { cq->shared = false; ib_free_cq(cq); } return ret; } /** * ib_cq_pool_get() - Find the least used completion queue that matches * a given cpu hint (or least used for wild card affinity) and fits * nr_cqe. * @dev: rdma device * @nr_cqe: number of needed cqe entries * @comp_vector_hint: completion vector hint (-1) for the driver to assign * a comp vector based on internal counter * @poll_ctx: cq polling context * * Finds a cq that satisfies @comp_vector_hint and @nr_cqe requirements and * claim entries in it for us. In case there is no available cq, allocate * a new cq with the requirements and add it to the device pool. * IB_POLL_DIRECT cannot be used for shared cqs so it is not a valid value * for @poll_ctx. */ struct ib_cq *ib_cq_pool_get(struct ib_device *dev, unsigned int nr_cqe, int comp_vector_hint, enum ib_poll_context poll_ctx) { static unsigned int default_comp_vector; unsigned int vector, num_comp_vectors; struct ib_cq *cq, *found = NULL; int ret; if (poll_ctx > IB_POLL_LAST_POOL_TYPE) { WARN_ON_ONCE(poll_ctx > IB_POLL_LAST_POOL_TYPE); return ERR_PTR(-EINVAL); } num_comp_vectors = min_t(unsigned int, dev->num_comp_vectors, num_online_cpus()); /* Project the affinty to the device completion vector range */ if (comp_vector_hint < 0) { comp_vector_hint = (READ_ONCE(default_comp_vector) + 1) % num_comp_vectors; WRITE_ONCE(default_comp_vector, comp_vector_hint); } vector = comp_vector_hint % num_comp_vectors; /* * Find the least used CQ with correct affinity and * enough free CQ entries */ while (!found) { spin_lock_irq(&dev->cq_pools_lock); list_for_each_entry(cq, &dev->cq_pools[poll_ctx], pool_entry) { /* * Check to see if we have found a CQ with the * correct completion vector */ if (vector != cq->comp_vector) continue; if (cq->cqe_used + nr_cqe > cq->cqe) continue; found = cq; break; } if (found) { found->cqe_used += nr_cqe; spin_unlock_irq(&dev->cq_pools_lock); return found; } spin_unlock_irq(&dev->cq_pools_lock); /* * Didn't find a match or ran out of CQs in the device * pool, allocate a new array of CQs. */ ret = ib_alloc_cqs(dev, nr_cqe, poll_ctx); if (ret) return ERR_PTR(ret); } return found; } EXPORT_SYMBOL(ib_cq_pool_get); /** * ib_cq_pool_put - Return a CQ taken from a shared pool. * @cq: The CQ to return. * @nr_cqe: The max number of cqes that the user had requested. */ void ib_cq_pool_put(struct ib_cq *cq, unsigned int nr_cqe) { if (WARN_ON_ONCE(nr_cqe > cq->cqe_used)) return; spin_lock_irq(&cq->device->cq_pools_lock); cq->cqe_used -= nr_cqe; spin_unlock_irq(&cq->device->cq_pools_lock); } EXPORT_SYMBOL(ib_cq_pool_put);
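/*
 * Illustrative sketch, not part of the file above: how an in-kernel ULP
 * might consume the CQ API defined here.  The "my_ulp_*" names and the
 * surrounding structure are hypothetical; only ib_alloc_cq_any(),
 * ib_free_cq() and the wr_cqe completion convention come from the API.
 */
#include <rdma/ib_verbs.h>

struct my_ulp_queue {
	struct ib_cq	*cq;
	struct ib_cqe	done_cqe;	/* posted WRs set wr->wr_cqe = &done_cqe */
};

static void my_ulp_done(struct ib_cq *cq, struct ib_wc *wc)
{
	struct my_ulp_queue *q = container_of(wc->wr_cqe,
					      struct my_ulp_queue, done_cqe);

	if (wc->status != IB_WC_SUCCESS)
		pr_debug("completion failed: %d\n", wc->status);
	/* complete the I/O tracked by q here */
}

static int my_ulp_setup_cq(struct ib_device *dev, struct my_ulp_queue *q,
			   int queue_depth)
{
	/* Let the core spread CQs across the device's completion vectors. */
	q->cq = ib_alloc_cq_any(dev, q, queue_depth, IB_POLL_SOFTIRQ);
	if (IS_ERR(q->cq))
		return PTR_ERR(q->cq);

	q->done_cqe.done = my_ulp_done;
	return 0;
}

static void my_ulp_teardown_cq(struct my_ulp_queue *q)
{
	ib_free_cq(q->cq);
}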
8 13 13 8 8 8 16 3 13 8 5 8 34 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 // SPDX-License-Identifier: GPL-2.0-only /* * (C) 2007 Patrick McHardy <kaber@trash.net> */ #include <linux/module.h> #include <linux/skbuff.h> #include <linux/gen_stats.h> #include <linux/jhash.h> #include <linux/rtnetlink.h> #include <linux/random.h> #include <linux/slab.h> #include <net/gen_stats.h> #include <net/netlink.h> #include <net/netns/generic.h> #include <linux/netfilter/x_tables.h> #include <linux/netfilter/xt_RATEEST.h> #include <net/netfilter/xt_rateest.h> #define RATEEST_HSIZE 16 struct xt_rateest_net { struct mutex hash_lock; struct hlist_head hash[RATEEST_HSIZE]; }; static unsigned int xt_rateest_id; static unsigned int jhash_rnd __read_mostly; static unsigned int xt_rateest_hash(const char *name) { return jhash(name, sizeof_field(struct xt_rateest, name), jhash_rnd) & (RATEEST_HSIZE - 1); } static void xt_rateest_hash_insert(struct xt_rateest_net *xn, struct xt_rateest *est) { unsigned int h; h = xt_rateest_hash(est->name); hlist_add_head(&est->list, &xn->hash[h]); } static struct xt_rateest *__xt_rateest_lookup(struct xt_rateest_net *xn, const char *name) { struct xt_rateest *est; unsigned int h; h = xt_rateest_hash(name); hlist_for_each_entry(est, &xn->hash[h], list) { if (strcmp(est->name, name) == 0) { est->refcnt++; return est; } } return NULL; } struct xt_rateest *xt_rateest_lookup(struct net *net, const char *name) { struct xt_rateest_net *xn = net_generic(net, xt_rateest_id); struct xt_rateest *est; mutex_lock(&xn->hash_lock); est = __xt_rateest_lookup(xn, name); mutex_unlock(&xn->hash_lock); return est; } EXPORT_SYMBOL_GPL(xt_rateest_lookup); void xt_rateest_put(struct net *net, struct xt_rateest *est) { struct xt_rateest_net *xn = net_generic(net, xt_rateest_id); mutex_lock(&xn->hash_lock); if (--est->refcnt == 0) { hlist_del(&est->list); gen_kill_estimator(&est->rate_est); /* * gen_estimator est_timer() might access est->lock or bstats, * wait a RCU grace period before freeing 'est' */ kfree_rcu(est, rcu); } mutex_unlock(&xn->hash_lock); } EXPORT_SYMBOL_GPL(xt_rateest_put); static unsigned int xt_rateest_tg(struct sk_buff *skb, const struct xt_action_param *par) { const struct xt_rateest_target_info *info = par->targinfo; struct gnet_stats_basic_sync *stats = &info->est->bstats; spin_lock_bh(&info->est->lock); u64_stats_add(&stats->bytes, skb->len); u64_stats_inc(&stats->packets); spin_unlock_bh(&info->est->lock); return XT_CONTINUE; } static int xt_rateest_tg_checkentry(const struct xt_tgchk_param *par) { struct xt_rateest_net *xn = net_generic(par->net, xt_rateest_id); struct xt_rateest_target_info *info = par->targinfo; struct xt_rateest *est; struct { struct 
nlattr opt; struct gnet_estimator est; } cfg; int ret; if (strnlen(info->name, sizeof(est->name)) >= sizeof(est->name)) return -ENAMETOOLONG; net_get_random_once(&jhash_rnd, sizeof(jhash_rnd)); mutex_lock(&xn->hash_lock); est = __xt_rateest_lookup(xn, info->name); if (est) { mutex_unlock(&xn->hash_lock); /* * If estimator parameters are specified, they must match the * existing estimator. */ if ((!info->interval && !info->ewma_log) || (info->interval != est->params.interval || info->ewma_log != est->params.ewma_log)) { xt_rateest_put(par->net, est); return -EINVAL; } info->est = est; return 0; } ret = -ENOMEM; est = kzalloc(sizeof(*est), GFP_KERNEL); if (!est) goto err1; gnet_stats_basic_sync_init(&est->bstats); strscpy(est->name, info->name, sizeof(est->name)); spin_lock_init(&est->lock); est->refcnt = 1; est->params.interval = info->interval; est->params.ewma_log = info->ewma_log; cfg.opt.nla_len = nla_attr_size(sizeof(cfg.est)); cfg.opt.nla_type = TCA_STATS_RATE_EST; cfg.est.interval = info->interval; cfg.est.ewma_log = info->ewma_log; ret = gen_new_estimator(&est->bstats, NULL, &est->rate_est, &est->lock, NULL, &cfg.opt); if (ret < 0) goto err2; info->est = est; xt_rateest_hash_insert(xn, est); mutex_unlock(&xn->hash_lock); return 0; err2: kfree(est); err1: mutex_unlock(&xn->hash_lock); return ret; } static void xt_rateest_tg_destroy(const struct xt_tgdtor_param *par) { struct xt_rateest_target_info *info = par->targinfo; xt_rateest_put(par->net, info->est); } static struct xt_target xt_rateest_tg_reg[] __read_mostly = { { .name = "RATEEST", .revision = 0, .family = NFPROTO_IPV4, .target = xt_rateest_tg, .checkentry = xt_rateest_tg_checkentry, .destroy = xt_rateest_tg_destroy, .targetsize = sizeof(struct xt_rateest_target_info), .usersize = offsetof(struct xt_rateest_target_info, est), .me = THIS_MODULE, }, #if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) { .name = "RATEEST", .revision = 0, .family = NFPROTO_IPV6, .target = xt_rateest_tg, .checkentry = xt_rateest_tg_checkentry, .destroy = xt_rateest_tg_destroy, .targetsize = sizeof(struct xt_rateest_target_info), .usersize = offsetof(struct xt_rateest_target_info, est), .me = THIS_MODULE, }, #endif }; static __net_init int xt_rateest_net_init(struct net *net) { struct xt_rateest_net *xn = net_generic(net, xt_rateest_id); int i; mutex_init(&xn->hash_lock); for (i = 0; i < ARRAY_SIZE(xn->hash); i++) INIT_HLIST_HEAD(&xn->hash[i]); return 0; } static struct pernet_operations xt_rateest_net_ops = { .init = xt_rateest_net_init, .id = &xt_rateest_id, .size = sizeof(struct xt_rateest_net), }; static int __init xt_rateest_tg_init(void) { int err = register_pernet_subsys(&xt_rateest_net_ops); if (err) return err; return xt_register_targets(xt_rateest_tg_reg, ARRAY_SIZE(xt_rateest_tg_reg)); } static void __exit xt_rateest_tg_fini(void) { xt_unregister_targets(xt_rateest_tg_reg, ARRAY_SIZE(xt_rateest_tg_reg)); unregister_pernet_subsys(&xt_rateest_net_ops); } MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("Xtables: packet rate estimator"); MODULE_ALIAS("ipt_RATEEST"); MODULE_ALIAS("ip6t_RATEEST"); module_init(xt_rateest_tg_init); module_exit(xt_rateest_tg_fini);
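/*
 * Illustrative sketch, not part of the module above: how other kernel code
 * can reuse a RATEEST estimator by name via the exported helpers.  This
 * mirrors what the xt_rateest match extension does; the rateest_bps_of()
 * wrapper itself is made up for this example.
 */
#include <net/netfilter/xt_rateest.h>
#include <net/gen_stats.h>

static u64 rateest_bps_of(struct net *net, const char *name)
{
	struct gnet_stats_rate_est64 sample = {};
	struct xt_rateest *est;
	u64 bps = 0;

	est = xt_rateest_lookup(net, name);	/* takes a reference */
	if (!est)
		return 0;

	if (gen_estimator_read(&est->rate_est, &sample))
		bps = sample.bps;

	xt_rateest_put(net, est);		/* drops the reference */
	return bps;
}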
2 2 2 2 2 1 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _MM_SWAP_TABLE_H #define _MM_SWAP_TABLE_H #include <linux/rcupdate.h> #include <linux/atomic.h> #include "swap.h" /* A typical flat array in each cluster as swap table */ struct swap_table { atomic_long_t entries[SWAPFILE_CLUSTER]; }; #define SWP_TABLE_USE_PAGE (sizeof(struct swap_table) == PAGE_SIZE) /* * A swap table entry represents the status of a swap slot on a swap * (physical or virtual) device. The swap table in each cluster is a * 1:1 map of the swap slots in this cluster. * * Each swap table entry could be a pointer (folio), a XA_VALUE * (shadow), or NULL. */ /* * Helpers for casting one type of info into a swap table entry. */ static inline unsigned long null_to_swp_tb(void) { BUILD_BUG_ON(sizeof(unsigned long) != sizeof(atomic_long_t)); return 0; } static inline unsigned long folio_to_swp_tb(struct folio *folio) { BUILD_BUG_ON(sizeof(unsigned long) != sizeof(void *)); return (unsigned long)folio; } static inline unsigned long shadow_swp_to_tb(void *shadow) { BUILD_BUG_ON((BITS_PER_XA_VALUE + 1) != BITS_PER_BYTE * sizeof(unsigned long)); VM_WARN_ON_ONCE(shadow && !xa_is_value(shadow)); return (unsigned long)shadow; } /* * Helpers for swap table entry type checking. */ static inline bool swp_tb_is_null(unsigned long swp_tb) { return !swp_tb; } static inline bool swp_tb_is_folio(unsigned long swp_tb) { return !xa_is_value((void *)swp_tb) && !swp_tb_is_null(swp_tb); } static inline bool swp_tb_is_shadow(unsigned long swp_tb) { return xa_is_value((void *)swp_tb); } /* * Helpers for retrieving info from swap table. */ static inline struct folio *swp_tb_to_folio(unsigned long swp_tb) { VM_WARN_ON(!swp_tb_is_folio(swp_tb)); return (void *)swp_tb; } static inline void *swp_tb_to_shadow(unsigned long swp_tb) { VM_WARN_ON(!swp_tb_is_shadow(swp_tb)); return (void *)swp_tb; } /* * Helpers for accessing or modifying the swap table of a cluster, * the swap cluster must be locked. */ static inline void __swap_table_set(struct swap_cluster_info *ci, unsigned int off, unsigned long swp_tb) { atomic_long_t *table = rcu_dereference_protected(ci->table, true); lockdep_assert_held(&ci->lock); VM_WARN_ON_ONCE(off >= SWAPFILE_CLUSTER); atomic_long_set(&table[off], swp_tb); } static inline unsigned long __swap_table_xchg(struct swap_cluster_info *ci, unsigned int off, unsigned long swp_tb) { atomic_long_t *table = rcu_dereference_protected(ci->table, true); lockdep_assert_held(&ci->lock); VM_WARN_ON_ONCE(off >= SWAPFILE_CLUSTER); /* Ordering is guaranteed by cluster lock, relax */ return atomic_long_xchg_relaxed(&table[off], swp_tb); } static inline unsigned long __swap_table_get(struct swap_cluster_info *ci, unsigned int off) { atomic_long_t *table; VM_WARN_ON_ONCE(off >= SWAPFILE_CLUSTER); table = rcu_dereference_check(ci->table, lockdep_is_held(&ci->lock)); return atomic_long_read(&table[off]); } static inline unsigned long swap_table_get(struct swap_cluster_info *ci, unsigned int off) { atomic_long_t *table; unsigned long swp_tb; rcu_read_lock(); table = rcu_dereference(ci->table); swp_tb = table ? 
atomic_long_read(&table[off]) : null_to_swp_tb(); rcu_read_unlock(); return swp_tb; } #endif
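/*
 * Illustrative sketch, not part of the header above: publishing a folio in a
 * swap-table slot and reading the slot back while holding the cluster lock.
 * The example_* helpers are hypothetical; the real users are in the swap
 * cache code.
 */
static inline void example_swap_table_store(struct swap_cluster_info *ci,
					    unsigned int off,
					    struct folio *folio)
{
	/* ci->lock must be held, as __swap_table_set() asserts. */
	__swap_table_set(ci, off, folio_to_swp_tb(folio));
}

static inline struct folio *example_swap_table_peek(struct swap_cluster_info *ci,
						    unsigned int off)
{
	unsigned long swp_tb = __swap_table_get(ci, off);

	/* A slot holds either a folio, a shadow (XA_VALUE) or nothing. */
	if (swp_tb_is_folio(swp_tb))
		return swp_tb_to_folio(swp_tb);
	return NULL;
}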
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_VMALLOC_H #define _LINUX_VMALLOC_H #include <linux/alloc_tag.h> #include <linux/sched.h> #include <linux/spinlock.h> #include <linux/init.h> #include <linux/list.h> #include <linux/llist.h> #include <asm/page.h> /* pgprot_t */ #include <linux/rbtree.h> #include <linux/overflow.h> #include <asm/vmalloc.h> struct vm_area_struct; /* vma defining user mapping in mm_types.h */ struct notifier_block; /* in notifier.h */ struct iov_iter; /* in uio.h */ /* bits in flags of vmalloc's vm_struct below */ #define VM_IOREMAP 0x00000001 /* ioremap() and friends */ #define VM_ALLOC 0x00000002 /* vmalloc() */ #define VM_MAP 0x00000004 /* vmap()ed pages */ #define VM_USERMAP 0x00000008 /* suitable for remap_vmalloc_range */ #define VM_DMA_COHERENT 0x00000010 /* dma_alloc_coherent */ #define VM_UNINITIALIZED 0x00000020 /* vm_struct is not fully initialized */ #define VM_NO_GUARD 0x00000040 /* ***DANGEROUS*** don't add guard page */ #define VM_KASAN 0x00000080 /* has allocated kasan shadow memory */ #define VM_FLUSH_RESET_PERMS 0x00000100 /* reset direct map and flush TLB on unmap, can't be freed in atomic context */ #define VM_MAP_PUT_PAGES 0x00000200 /* put pages and free array in vfree */ #define VM_ALLOW_HUGE_VMAP 0x00000400 /* Allow for huge pages on archs with HAVE_ARCH_HUGE_VMALLOC */ #if (defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS)) && \ !defined(CONFIG_KASAN_VMALLOC) #define VM_DEFER_KMEMLEAK 0x00000800 /* defer kmemleak object creation */ #else #define VM_DEFER_KMEMLEAK 0 #endif #define VM_SPARSE 0x00001000 /* sparse vm_area. not all pages are present. */ /* bits [20..32] reserved for arch specific ioremap internals */ /* * Maximum alignment for ioremap() regions. * Can be overridden by arch-specific value. */ #ifndef IOREMAP_MAX_ORDER #define IOREMAP_MAX_ORDER (7 + PAGE_SHIFT) /* 128 pages */ #endif struct vm_struct { union { struct vm_struct *next; /* Early registration of vm_areas. */ struct llist_node llnode; /* Asynchronous freeing on error paths. 
*/ }; void *addr; unsigned long size; unsigned long flags; struct page **pages; #ifdef CONFIG_HAVE_ARCH_HUGE_VMALLOC unsigned int page_order; #endif unsigned int nr_pages; phys_addr_t phys_addr; const void *caller; unsigned long requested_size; }; struct vmap_area { unsigned long va_start; unsigned long va_end; struct rb_node rb_node; /* address sorted rbtree */ struct list_head list; /* address sorted list */ /* * The following two variables can be packed, because * a vmap_area object can be either: * 1) in "free" tree (root is free_vmap_area_root) * 2) or "busy" tree (root is vmap_area_root) */ union { unsigned long subtree_max_size; /* in "free" tree */ struct vm_struct *vm; /* in "busy" tree */ }; unsigned long flags; /* mark type of vm_map_ram area */ }; /* archs that select HAVE_ARCH_HUGE_VMAP should override one or more of these */ #ifndef arch_vmap_p4d_supported static inline bool arch_vmap_p4d_supported(pgprot_t prot) { return false; } #endif #ifndef arch_vmap_pud_supported static inline bool arch_vmap_pud_supported(pgprot_t prot) { return false; } #endif #ifndef arch_vmap_pmd_supported static inline bool arch_vmap_pmd_supported(pgprot_t prot) { return false; } #endif #ifndef arch_vmap_pte_range_map_size static inline unsigned long arch_vmap_pte_range_map_size(unsigned long addr, unsigned long end, u64 pfn, unsigned int max_page_shift) { return PAGE_SIZE; } #endif #ifndef arch_vmap_pte_range_unmap_size static inline unsigned long arch_vmap_pte_range_unmap_size(unsigned long addr, pte_t *ptep) { return PAGE_SIZE; } #endif #ifndef arch_vmap_pte_supported_shift static inline int arch_vmap_pte_supported_shift(unsigned long size) { return PAGE_SHIFT; } #endif #ifndef arch_vmap_pgprot_tagged static inline pgprot_t arch_vmap_pgprot_tagged(pgprot_t prot) { return prot; } #endif /* * Highlevel APIs for driver use */ extern void vm_unmap_ram(const void *mem, unsigned int count); extern void *vm_map_ram(struct page **pages, unsigned int count, int node); extern void vm_unmap_aliases(void); extern void *vmalloc_noprof(unsigned long size) __alloc_size(1); #define vmalloc(...) alloc_hooks(vmalloc_noprof(__VA_ARGS__)) extern void *vzalloc_noprof(unsigned long size) __alloc_size(1); #define vzalloc(...) alloc_hooks(vzalloc_noprof(__VA_ARGS__)) extern void *vmalloc_user_noprof(unsigned long size) __alloc_size(1); #define vmalloc_user(...) alloc_hooks(vmalloc_user_noprof(__VA_ARGS__)) extern void *vmalloc_node_noprof(unsigned long size, int node) __alloc_size(1); #define vmalloc_node(...) alloc_hooks(vmalloc_node_noprof(__VA_ARGS__)) extern void *vzalloc_node_noprof(unsigned long size, int node) __alloc_size(1); #define vzalloc_node(...) alloc_hooks(vzalloc_node_noprof(__VA_ARGS__)) extern void *vmalloc_32_noprof(unsigned long size) __alloc_size(1); #define vmalloc_32(...) alloc_hooks(vmalloc_32_noprof(__VA_ARGS__)) extern void *vmalloc_32_user_noprof(unsigned long size) __alloc_size(1); #define vmalloc_32_user(...) alloc_hooks(vmalloc_32_user_noprof(__VA_ARGS__)) extern void *__vmalloc_noprof(unsigned long size, gfp_t gfp_mask) __alloc_size(1); #define __vmalloc(...) alloc_hooks(__vmalloc_noprof(__VA_ARGS__)) extern void *__vmalloc_node_range_noprof(unsigned long size, unsigned long align, unsigned long start, unsigned long end, gfp_t gfp_mask, pgprot_t prot, unsigned long vm_flags, int node, const void *caller) __alloc_size(1); #define __vmalloc_node_range(...) 
alloc_hooks(__vmalloc_node_range_noprof(__VA_ARGS__)) void *__vmalloc_node_noprof(unsigned long size, unsigned long align, gfp_t gfp_mask, int node, const void *caller) __alloc_size(1); #define __vmalloc_node(...) alloc_hooks(__vmalloc_node_noprof(__VA_ARGS__)) void *vmalloc_huge_node_noprof(unsigned long size, gfp_t gfp_mask, int node) __alloc_size(1); #define vmalloc_huge_node(...) alloc_hooks(vmalloc_huge_node_noprof(__VA_ARGS__)) static inline void *vmalloc_huge(unsigned long size, gfp_t gfp_mask) { return vmalloc_huge_node(size, gfp_mask, NUMA_NO_NODE); } extern void *__vmalloc_array_noprof(size_t n, size_t size, gfp_t flags) __alloc_size(1, 2); #define __vmalloc_array(...) alloc_hooks(__vmalloc_array_noprof(__VA_ARGS__)) extern void *vmalloc_array_noprof(size_t n, size_t size) __alloc_size(1, 2); #define vmalloc_array(...) alloc_hooks(vmalloc_array_noprof(__VA_ARGS__)) extern void *__vcalloc_noprof(size_t n, size_t size, gfp_t flags) __alloc_size(1, 2); #define __vcalloc(...) alloc_hooks(__vcalloc_noprof(__VA_ARGS__)) extern void *vcalloc_noprof(size_t n, size_t size) __alloc_size(1, 2); #define vcalloc(...) alloc_hooks(vcalloc_noprof(__VA_ARGS__)) void *__must_check vrealloc_node_align_noprof(const void *p, size_t size, unsigned long align, gfp_t flags, int nid) __realloc_size(2); #define vrealloc_node_noprof(_p, _s, _f, _nid) \ vrealloc_node_align_noprof(_p, _s, 1, _f, _nid) #define vrealloc_noprof(_p, _s, _f) \ vrealloc_node_align_noprof(_p, _s, 1, _f, NUMA_NO_NODE) #define vrealloc_node_align(...) alloc_hooks(vrealloc_node_align_noprof(__VA_ARGS__)) #define vrealloc_node(...) alloc_hooks(vrealloc_node_noprof(__VA_ARGS__)) #define vrealloc(...) alloc_hooks(vrealloc_noprof(__VA_ARGS__)) extern void vfree(const void *addr); extern void vfree_atomic(const void *addr); extern void *vmap(struct page **pages, unsigned int count, unsigned long flags, pgprot_t prot); void *vmap_pfn(unsigned long *pfns, unsigned int count, pgprot_t prot); extern void vunmap(const void *addr); extern int remap_vmalloc_range_partial(struct vm_area_struct *vma, unsigned long uaddr, void *kaddr, unsigned long pgoff, unsigned long size); extern int remap_vmalloc_range(struct vm_area_struct *vma, void *addr, unsigned long pgoff); int vmap_pages_range(unsigned long addr, unsigned long end, pgprot_t prot, struct page **pages, unsigned int page_shift); /* * Lowlevel-APIs (not for driver use!) */ static inline size_t get_vm_area_size(const struct vm_struct *area) { if (!(area->flags & VM_NO_GUARD)) /* return actual size without guard page */ return area->size - PAGE_SIZE; else return area->size; } extern struct vm_struct *get_vm_area(unsigned long size, unsigned long flags); extern struct vm_struct *get_vm_area_caller(unsigned long size, unsigned long flags, const void *caller); extern struct vm_struct *__get_vm_area_caller(unsigned long size, unsigned long flags, unsigned long start, unsigned long end, const void *caller); void free_vm_area(struct vm_struct *area); extern struct vm_struct *remove_vm_area(const void *addr); extern struct vm_struct *find_vm_area(const void *addr); struct vmap_area *find_vmap_area(unsigned long addr); static inline bool is_vm_area_hugepages(const void *addr) { /* * This may not 100% tell if the area is mapped with > PAGE_SIZE * page table entries, if for some reason the architecture indicates * larger sizes are available but decides not to use them, nothing * prevents that. This only indicates the size of the physical page * allocated in the vmalloc layer. 
*/ #ifdef CONFIG_HAVE_ARCH_HUGE_VMALLOC return find_vm_area(addr)->page_order > 0; #else return false; #endif } /* for /proc/kcore */ long vread_iter(struct iov_iter *iter, const char *addr, size_t count); /* * Internals. Don't use.. */ __init void vm_area_add_early(struct vm_struct *vm); __init void vm_area_register_early(struct vm_struct *vm, size_t align); int register_vmap_purge_notifier(struct notifier_block *nb); int unregister_vmap_purge_notifier(struct notifier_block *nb); #ifdef CONFIG_MMU #define VMALLOC_TOTAL (VMALLOC_END - VMALLOC_START) unsigned long vmalloc_nr_pages(void); int vm_area_map_pages(struct vm_struct *area, unsigned long start, unsigned long end, struct page **pages); void vm_area_unmap_pages(struct vm_struct *area, unsigned long start, unsigned long end); void vunmap_range(unsigned long addr, unsigned long end); static inline void set_vm_flush_reset_perms(void *addr) { struct vm_struct *vm = find_vm_area(addr); if (vm) vm->flags |= VM_FLUSH_RESET_PERMS; } #else /* !CONFIG_MMU */ #define VMALLOC_TOTAL 0UL static inline unsigned long vmalloc_nr_pages(void) { return 0; } static inline void set_vm_flush_reset_perms(void *addr) {} #endif /* CONFIG_MMU */ #if defined(CONFIG_MMU) && defined(CONFIG_SMP) struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets, const size_t *sizes, int nr_vms, size_t align); void pcpu_free_vm_areas(struct vm_struct **vms, int nr_vms); # else static inline struct vm_struct ** pcpu_get_vm_areas(const unsigned long *offsets, const size_t *sizes, int nr_vms, size_t align) { return NULL; } static inline void pcpu_free_vm_areas(struct vm_struct **vms, int nr_vms) {} #endif #if defined(CONFIG_MMU) && defined(CONFIG_PRINTK) bool vmalloc_dump_obj(void *object); #else static inline bool vmalloc_dump_obj(void *object) { return false; } #endif unsigned int memalloc_apply_gfp_scope(gfp_t gfp_mask); void memalloc_restore_scope(unsigned int flags); #endif /* _LINUX_VMALLOC_H */
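/*
 * Illustrative sketch, not part of the header above: typical driver-side use
 * of the high-level vmalloc API.  struct example_table and its helpers are
 * hypothetical.
 */
#include <linux/vmalloc.h>

struct example_table {
	size_t	nr;
	u64	*slots;
};

static int example_table_init(struct example_table *t, size_t nr)
{
	/* Overflow-checked, zeroed, virtually contiguous allocation. */
	t->slots = vcalloc(nr, sizeof(*t->slots));
	if (!t->slots)
		return -ENOMEM;
	t->nr = nr;
	return 0;
}

static void example_table_destroy(struct example_table *t)
{
	vfree(t->slots);	/* vfree(NULL) is a no-op */
	t->slots = NULL;
	t->nr = 0;
}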
2 18 18 13 5 7 3 4 1 3 8 8 2 2 8 11 12 12 2 3 7 2 5 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 // SPDX-License-Identifier: GPL-2.0-only /* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@netfilter.org> */ /* Kernel module implementing an IP set type: the hash:ip type */ #include <linux/jhash.h> #include <linux/module.h> #include <linux/ip.h> #include <linux/skbuff.h> #include <linux/errno.h> #include <linux/random.h> #include <net/ip.h> #include <net/ipv6.h> #include <net/netlink.h> #include <net/tcp.h> #include <linux/netfilter.h> #include <linux/netfilter/ipset/pfxlen.h> #include <linux/netfilter/ipset/ip_set.h> #include <linux/netfilter/ipset/ip_set_hash.h> #define IPSET_TYPE_REV_MIN 0 /* 1 Counters support */ /* 2 Comments support */ /* 3 Forceadd support */ /* 4 skbinfo support */ /* 5 bucketsize, initval support */ #define IPSET_TYPE_REV_MAX 6 /* bitmask support */ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@netfilter.org>"); IP_SET_MODULE_DESC("hash:ip", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX); MODULE_ALIAS("ip_set_hash:ip"); /* Type specific function prefix */ #define HTYPE hash_ip #define IP_SET_HASH_WITH_NETMASK #define IP_SET_HASH_WITH_BITMASK /* IPv4 variant */ /* Member elements */ struct hash_ip4_elem { /* Zero valued IP addresses cannot be stored */ __be32 ip; }; /* Common functions */ static bool hash_ip4_data_equal(const struct hash_ip4_elem *e1, const struct hash_ip4_elem *e2, u32 *multi) { return e1->ip == e2->ip; } static bool hash_ip4_data_list(struct sk_buff *skb, const struct hash_ip4_elem *e) { if (nla_put_ipaddr4(skb, IPSET_ATTR_IP, e->ip)) goto nla_put_failure; return false; nla_put_failure: return true; } static void hash_ip4_data_next(struct hash_ip4_elem *next, const struct hash_ip4_elem *e) { next->ip = e->ip; } #define MTYPE hash_ip4 #define HOST_MASK 32 #include "ip_set_hash_gen.h" static int hash_ip4_kadt(struct ip_set *set, const struct sk_buff *skb, const struct xt_action_param *par, enum ipset_adt adt, struct ip_set_adt_opt *opt) { const struct hash_ip4 *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; struct hash_ip4_elem e = { 0 }; struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set); __be32 ip; ip4addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &ip); ip &= h->bitmask.ip; if (ip == 0) return -EINVAL; e.ip = ip; return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags); } static int 
hash_ip4_uadt(struct ip_set *set, struct nlattr *tb[], enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) { struct hash_ip4 *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; struct hash_ip4_elem e = { 0 }; struct ip_set_ext ext = IP_SET_INIT_UEXT(set); u32 ip = 0, ip_to = 0, hosts, i = 0; int ret = 0; if (tb[IPSET_ATTR_LINENO]) *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); if (unlikely(!tb[IPSET_ATTR_IP])) return -IPSET_ERR_PROTOCOL; ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &ip); if (ret) return ret; ret = ip_set_get_extensions(set, tb, &ext); if (ret) return ret; ip &= ntohl(h->bitmask.ip); e.ip = htonl(ip); if (e.ip == 0) return -IPSET_ERR_HASH_ELEM; if (adt == IPSET_TEST) return adtfn(set, &e, &ext, &ext, flags); ip_to = ip; if (tb[IPSET_ATTR_IP_TO]) { ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP_TO], &ip_to); if (ret) return ret; if (ip > ip_to) { if (ip_to == 0) return -IPSET_ERR_HASH_ELEM; swap(ip, ip_to); } } else if (tb[IPSET_ATTR_CIDR]) { u8 cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]); if (!cidr || cidr > HOST_MASK) return -IPSET_ERR_INVALID_CIDR; ip_set_mask_from_to(ip, ip_to, cidr); } hosts = h->netmask == 32 ? 1 : 2 << (32 - h->netmask - 1); if (retried) ip = ntohl(h->next.ip); for (; ip <= ip_to; i++) { e.ip = htonl(ip); if (i > IPSET_MAX_RANGE) { hash_ip4_data_next(&h->next, &e); return -ERANGE; } ret = adtfn(set, &e, &ext, &ext, flags); if (ret && !ip_set_eexist(ret, flags)) return ret; ip += hosts; if (ip == 0) return 0; ret = 0; } return ret; } /* IPv6 variant */ /* Member elements */ struct hash_ip6_elem { union nf_inet_addr ip; }; /* Common functions */ static bool hash_ip6_data_equal(const struct hash_ip6_elem *ip1, const struct hash_ip6_elem *ip2, u32 *multi) { return ipv6_addr_equal(&ip1->ip.in6, &ip2->ip.in6); } static bool hash_ip6_data_list(struct sk_buff *skb, const struct hash_ip6_elem *e) { if (nla_put_ipaddr6(skb, IPSET_ATTR_IP, &e->ip.in6)) goto nla_put_failure; return false; nla_put_failure: return true; } static void hash_ip6_data_next(struct hash_ip6_elem *next, const struct hash_ip6_elem *e) { } #undef MTYPE #undef HOST_MASK #define MTYPE hash_ip6 #define HOST_MASK 128 #define IP_SET_EMIT_CREATE #include "ip_set_hash_gen.h" static int hash_ip6_kadt(struct ip_set *set, const struct sk_buff *skb, const struct xt_action_param *par, enum ipset_adt adt, struct ip_set_adt_opt *opt) { const struct hash_ip6 *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; struct hash_ip6_elem e = { { .all = { 0 } } }; struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set); ip6addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip.in6); nf_inet_addr_mask_inplace(&e.ip, &h->bitmask); if (ipv6_addr_any(&e.ip.in6)) return -EINVAL; return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags); } static int hash_ip6_uadt(struct ip_set *set, struct nlattr *tb[], enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) { const struct hash_ip6 *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; struct hash_ip6_elem e = { { .all = { 0 } } }; struct ip_set_ext ext = IP_SET_INIT_UEXT(set); int ret; if (tb[IPSET_ATTR_LINENO]) *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); if (unlikely(!tb[IPSET_ATTR_IP])) return -IPSET_ERR_PROTOCOL; if (unlikely(tb[IPSET_ATTR_IP_TO])) return -IPSET_ERR_HASH_RANGE_UNSUPPORTED; if (unlikely(tb[IPSET_ATTR_CIDR])) { u8 cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]); if (cidr != HOST_MASK) return -IPSET_ERR_INVALID_CIDR; } ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &e.ip); if (ret) return ret; ret = ip_set_get_extensions(set, tb, &ext); if 
(ret) return ret; nf_inet_addr_mask_inplace(&e.ip, &h->bitmask); if (ipv6_addr_any(&e.ip.in6)) return -IPSET_ERR_HASH_ELEM; ret = adtfn(set, &e, &ext, &ext, flags); return ip_set_eexist(ret, flags) ? 0 : ret; } static struct ip_set_type hash_ip_type __read_mostly = { .name = "hash:ip", .protocol = IPSET_PROTOCOL, .features = IPSET_TYPE_IP, .dimension = IPSET_DIM_ONE, .family = NFPROTO_UNSPEC, .revision_min = IPSET_TYPE_REV_MIN, .revision_max = IPSET_TYPE_REV_MAX, .create_flags[IPSET_TYPE_REV_MAX] = IPSET_CREATE_FLAG_BUCKETSIZE, .create = hash_ip_create, .create_policy = { [IPSET_ATTR_HASHSIZE] = { .type = NLA_U32 }, [IPSET_ATTR_MAXELEM] = { .type = NLA_U32 }, [IPSET_ATTR_INITVAL] = { .type = NLA_U32 }, [IPSET_ATTR_BUCKETSIZE] = { .type = NLA_U8 }, [IPSET_ATTR_RESIZE] = { .type = NLA_U8 }, [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, [IPSET_ATTR_NETMASK] = { .type = NLA_U8 }, [IPSET_ATTR_BITMASK] = { .type = NLA_NESTED }, [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 }, }, .adt_policy = { [IPSET_ATTR_IP] = { .type = NLA_NESTED }, [IPSET_ATTR_IP_TO] = { .type = NLA_NESTED }, [IPSET_ATTR_CIDR] = { .type = NLA_U8 }, [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, [IPSET_ATTR_LINENO] = { .type = NLA_U32 }, [IPSET_ATTR_BYTES] = { .type = NLA_U64 }, [IPSET_ATTR_PACKETS] = { .type = NLA_U64 }, [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING, .len = IPSET_MAX_COMMENT_SIZE }, [IPSET_ATTR_SKBMARK] = { .type = NLA_U64 }, [IPSET_ATTR_SKBPRIO] = { .type = NLA_U32 }, [IPSET_ATTR_SKBQUEUE] = { .type = NLA_U16 }, }, .me = THIS_MODULE, }; static int __init hash_ip_init(void) { return ip_set_type_register(&hash_ip_type); } static void __exit hash_ip_fini(void) { rcu_barrier(); ip_set_type_unregister(&hash_ip_type); } module_init(hash_ip_init); module_exit(hash_ip_fini);
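/*
 * Illustrative userspace usage, not part of the module above.  Assuming the
 * ipset(8) and iptables(8) tools, a hash:ip set created with a netmask
 * stores each added address masked to that prefix:
 *
 *	ipset create blocklist hash:ip netmask 24
 *	ipset add blocklist 203.0.113.7		# stored as 203.0.113.0
 *	iptables -A INPUT -m set --match-set blocklist src -j DROP
 */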
17 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef BLK_MQ_SCHED_H #define BLK_MQ_SCHED_H #include "elevator.h" #include "blk-mq.h" #define MAX_SCHED_RQ (16 * BLKDEV_DEFAULT_RQ) bool blk_mq_sched_try_merge(struct request_queue *q, struct bio *bio, unsigned int nr_segs, struct request **merged_request); bool blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio, unsigned int nr_segs); bool blk_mq_sched_try_insert_merge(struct request_queue *q, struct request *rq, struct list_head *free); void blk_mq_sched_mark_restart_hctx(struct blk_mq_hw_ctx *hctx); void __blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx); void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx); int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e, struct elevator_resources *res); void blk_mq_exit_sched(struct request_queue *q, struct elevator_queue *e); void blk_mq_sched_free_rqs(struct request_queue *q); struct elevator_tags *blk_mq_alloc_sched_tags(struct blk_mq_tag_set *set, unsigned int nr_hw_queues, unsigned int nr_requests); int blk_mq_alloc_sched_res(struct request_queue *q, struct elevator_type *type, struct elevator_resources *res, unsigned int nr_hw_queues); int blk_mq_alloc_sched_res_batch(struct xarray *elv_tbl, struct blk_mq_tag_set *set, unsigned int nr_hw_queues); int blk_mq_alloc_sched_ctx_batch(struct xarray *elv_tbl, struct blk_mq_tag_set *set); void blk_mq_free_sched_ctx_batch(struct xarray *elv_tbl); void blk_mq_free_sched_tags(struct elevator_tags *et, struct blk_mq_tag_set *set); void blk_mq_free_sched_res(struct elevator_resources *res, struct elevator_type *type, struct blk_mq_tag_set *set); void blk_mq_free_sched_res_batch(struct xarray *et_table, struct blk_mq_tag_set *set); /* * blk_mq_alloc_sched_data() - Allocates scheduler specific data * Returns: * - Pointer to allocated data on success * - NULL if no allocation needed * - ERR_PTR(-ENOMEM) in case of failure */ static inline void *blk_mq_alloc_sched_data(struct request_queue *q, struct elevator_type *e) { void *sched_data; if (!e || !e->ops.alloc_sched_data) return NULL; sched_data = e->ops.alloc_sched_data(q); return (sched_data) ?: ERR_PTR(-ENOMEM); } static inline void blk_mq_free_sched_data(struct elevator_type *e, void *data) { if (e && e->ops.free_sched_data) e->ops.free_sched_data(data); } static inline void blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx) { if (test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state)) __blk_mq_sched_restart(hctx); } static inline bool bio_mergeable(struct bio *bio) { return !(bio->bi_opf & REQ_NOMERGE_FLAGS); } static inline bool blk_mq_sched_allow_merge(struct request_queue *q, struct request *rq, struct bio *bio) { if (rq->rq_flags & RQF_USE_SCHED) { struct elevator_queue *e = q->elevator; if (e->type->ops.allow_merge) return e->type->ops.allow_merge(q, rq, bio); } return true; } static inline void blk_mq_sched_completed_request(struct request *rq, u64 now) { if (rq->rq_flags & RQF_USE_SCHED) { struct elevator_queue *e = rq->q->elevator; if (e->type->ops.completed_request) e->type->ops.completed_request(rq, now); } } static inline 
void blk_mq_sched_requeue_request(struct request *rq) { if (rq->rq_flags & RQF_USE_SCHED) { struct request_queue *q = rq->q; struct elevator_queue *e = q->elevator; if (e->type->ops.requeue_request) e->type->ops.requeue_request(rq); } } static inline bool blk_mq_sched_has_work(struct blk_mq_hw_ctx *hctx) { struct elevator_queue *e = hctx->queue->elevator; if (e && e->type->ops.has_work) return e->type->ops.has_work(hctx); return false; } static inline bool blk_mq_sched_needs_restart(struct blk_mq_hw_ctx *hctx) { return test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state); } static inline void blk_mq_set_min_shallow_depth(struct request_queue *q, unsigned int depth) { struct blk_mq_hw_ctx *hctx; unsigned long i; queue_for_each_hw_ctx(q, hctx, i) sbitmap_queue_min_shallow_depth(&hctx->sched_tags->bitmap_tags, depth); } #endif
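/*
 * Illustrative sketch, not part of the header above: how an I/O scheduler
 * could provide the alloc_sched_data/free_sched_data hooks consumed by
 * blk_mq_alloc_sched_data() and blk_mq_free_sched_data().  The "example"
 * scheduler and its private data are hypothetical, and the elevator_type
 * below is deliberately incomplete.
 */
#include <linux/module.h>
#include <linux/slab.h>

struct example_sched_data {
	spinlock_t		lock;
	struct list_head	pending;
};

static void *example_alloc_sched_data(struct request_queue *q)
{
	struct example_sched_data *sd;

	sd = kzalloc(sizeof(*sd), GFP_KERNEL);
	if (!sd)
		return NULL;	/* blk_mq_alloc_sched_data() maps this to -ENOMEM */

	spin_lock_init(&sd->lock);
	INIT_LIST_HEAD(&sd->pending);
	return sd;
}

static void example_free_sched_data(void *data)
{
	kfree(data);
}

static struct elevator_type example_iosched = {
	.ops = {
		.alloc_sched_data	= example_alloc_sched_data,
		.free_sched_data	= example_free_sched_data,
		/* a real scheduler also fills in insert/dispatch ops */
	},
	.elevator_name	= "example",
	.elevator_owner	= THIS_MODULE,
};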
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_STRING_H_ #define _LINUX_STRING_H_ #include <linux/args.h> #include <linux/array_size.h> #include <linux/cleanup.h> /* for DEFINE_FREE() */ #include <linux/compiler.h> /* for inline */ #include <linux/types.h> /* for size_t */ #include <linux/stddef.h> /* for NULL */ #include <linux/err.h> /* for ERR_PTR() */ #include <linux/errno.h> /* for E2BIG */ #include <linux/overflow.h> /* for check_mul_overflow() */ #include <linux/stdarg.h> #include <uapi/linux/string.h> extern char *strndup_user(const char __user *, long); extern void *memdup_user(const void __user *, size_t) __realloc_size(2); extern void *vmemdup_user(const void __user *, size_t) __realloc_size(2); extern void *memdup_user_nul(const void __user *, size_t); /** * memdup_array_user - duplicate array from user space * @src: source address in user space * @n: number of array members to copy * @size: size of one array member * * Return: an ERR_PTR() on failure. Result is physically * contiguous, to be freed by kfree(). 
*/ static inline __realloc_size(2, 3) void *memdup_array_user(const void __user *src, size_t n, size_t size) { size_t nbytes; if (check_mul_overflow(n, size, &nbytes)) return ERR_PTR(-EOVERFLOW); return memdup_user(src, nbytes); } /** * vmemdup_array_user - duplicate array from user space * @src: source address in user space * @n: number of array members to copy * @size: size of one array member * * Return: an ERR_PTR() on failure. Result may be not * physically contiguous. Use kvfree() to free. */ static inline __realloc_size(2, 3) void *vmemdup_array_user(const void __user *src, size_t n, size_t size) { size_t nbytes; if (check_mul_overflow(n, size, &nbytes)) return ERR_PTR(-EOVERFLOW); return vmemdup_user(src, nbytes); } /* * Include machine specific inline routines */ #include <asm/string.h> #ifndef __HAVE_ARCH_STRCPY extern char * strcpy(char *,const char *); #endif #ifndef __HAVE_ARCH_STRNCPY extern char * strncpy(char *,const char *, __kernel_size_t); #endif ssize_t sized_strscpy(char *, const char *, size_t); /* * The 2 argument style can only be used when dst is an array with a * known size. */ #define __strscpy0(dst, src, ...) \ sized_strscpy(dst, src, sizeof(dst) + __must_be_array(dst) + \ __must_be_cstr(dst) + __must_be_cstr(src)) #define __strscpy1(dst, src, size) \ sized_strscpy(dst, src, size + __must_be_cstr(dst) + __must_be_cstr(src)) #define __strscpy_pad0(dst, src, ...) \ sized_strscpy_pad(dst, src, sizeof(dst) + __must_be_array(dst) + \ __must_be_cstr(dst) + __must_be_cstr(src)) #define __strscpy_pad1(dst, src, size) \ sized_strscpy_pad(dst, src, size + __must_be_cstr(dst) + __must_be_cstr(src)) /** * strscpy - Copy a C-string into a sized buffer * @dst: Where to copy the string to * @src: Where to copy the string from * @...: Size of destination buffer (optional) * * Copy the source string @src, or as much of it as fits, into the * destination @dst buffer. The behavior is undefined if the string * buffers overlap. The destination @dst buffer is always NUL terminated, * unless it's zero-sized. * * The size argument @... is only required when @dst is not an array, or * when the copy needs to be smaller than sizeof(@dst). * * Preferred to strncpy() since it always returns a valid string, and * doesn't unnecessarily force the tail of the destination buffer to be * zero padded. If padding is desired please use strscpy_pad(). * * Returns the number of characters copied in @dst (not including the * trailing %NUL) or -E2BIG if @size is 0 or the copy from @src was * truncated. */ #define strscpy(dst, src, ...) \ CONCATENATE(__strscpy, COUNT_ARGS(__VA_ARGS__))(dst, src, __VA_ARGS__) #define sized_strscpy_pad(dest, src, count) ({ \ char *__dst = (dest); \ const char *__src = (src); \ const size_t __count = (count); \ ssize_t __wrote; \ \ __wrote = sized_strscpy(__dst, __src, __count); \ if (__wrote >= 0 && __wrote < __count) \ memset(__dst + __wrote + 1, 0, __count - __wrote - 1); \ __wrote; \ }) /** * strscpy_pad() - Copy a C-string into a sized buffer * @dst: Where to copy the string to * @src: Where to copy the string from * @...: Size of destination buffer * * Copy the string, or as much of it as fits, into the dest buffer. The * behavior is undefined if the string buffers overlap. The destination * buffer is always %NUL terminated, unless it's zero-sized. * * If the source string is shorter than the destination buffer, the * remaining bytes in the buffer will be filled with %NUL bytes. 
* * For full explanation of why you may want to consider using the * 'strscpy' functions please see the function docstring for strscpy(). * * Returns: * * The number of characters copied (not including the trailing %NULs) * * -E2BIG if count is 0 or @src was truncated. */ #define strscpy_pad(dst, src, ...) \ CONCATENATE(__strscpy_pad, COUNT_ARGS(__VA_ARGS__))(dst, src, __VA_ARGS__) #ifndef __HAVE_ARCH_STRCAT extern char * strcat(char *, const char *); #endif #ifndef __HAVE_ARCH_STRNCAT extern char * strncat(char *, const char *, __kernel_size_t); #endif #ifndef __HAVE_ARCH_STRLCAT extern size_t strlcat(char *, const char *, __kernel_size_t); #endif #ifndef __HAVE_ARCH_STRCMP extern int strcmp(const char *,const char *); #endif #ifndef __HAVE_ARCH_STRNCMP extern int strncmp(const char *,const char *,__kernel_size_t); #endif #ifndef __HAVE_ARCH_STRCASECMP extern int strcasecmp(const char *s1, const char *s2); #endif #ifndef __HAVE_ARCH_STRNCASECMP extern int strncasecmp(const char *s1, const char *s2, size_t n); #endif #ifndef __HAVE_ARCH_STRCHR extern char * strchr(const char *,int); #endif #ifndef __HAVE_ARCH_STRCHRNUL extern char * strchrnul(const char *,int); #endif extern char * strnchrnul(const char *, size_t, int); #ifndef __HAVE_ARCH_STRNCHR extern char * strnchr(const char *, size_t, int); #endif #ifndef __HAVE_ARCH_STRRCHR extern char * strrchr(const char *,int); #endif extern char * __must_check skip_spaces(const char *); extern char *strim(char *); static inline __must_check char *strstrip(char *str) { return strim(str); } #ifndef __HAVE_ARCH_STRSTR extern char * strstr(const char *, const char *); #endif #ifndef __HAVE_ARCH_STRNSTR extern char * strnstr(const char *, const char *, size_t); #endif #ifndef __HAVE_ARCH_STRLEN extern __kernel_size_t strlen(const char *); #endif #ifndef __HAVE_ARCH_STRNLEN extern __kernel_size_t strnlen(const char *,__kernel_size_t); #endif #ifndef __HAVE_ARCH_STRPBRK extern char * strpbrk(const char *,const char *); #endif #ifndef __HAVE_ARCH_STRSEP extern char * strsep(char **,const char *); #endif #ifndef __HAVE_ARCH_STRSPN extern __kernel_size_t strspn(const char *,const char *); #endif #ifndef __HAVE_ARCH_STRCSPN extern __kernel_size_t strcspn(const char *,const char *); #endif #ifndef __HAVE_ARCH_MEMSET extern void * memset(void *,int,__kernel_size_t); #endif #ifndef __HAVE_ARCH_MEMSET16 extern void *memset16(uint16_t *, uint16_t, __kernel_size_t); #endif #ifndef __HAVE_ARCH_MEMSET32 extern void *memset32(uint32_t *, uint32_t, __kernel_size_t); #endif #ifndef __HAVE_ARCH_MEMSET64 extern void *memset64(uint64_t *, uint64_t, __kernel_size_t); #endif static inline void *memset_l(unsigned long *p, unsigned long v, __kernel_size_t n) { if (BITS_PER_LONG == 32) return memset32((uint32_t *)p, v, n); else return memset64((uint64_t *)p, v, n); } static inline void *memset_p(void **p, void *v, __kernel_size_t n) { if (BITS_PER_LONG == 32) return memset32((uint32_t *)p, (uintptr_t)v, n); else return memset64((uint64_t *)p, (uintptr_t)v, n); } extern void **__memcat_p(void **a, void **b); #define memcat_p(a, b) ({ \ BUILD_BUG_ON_MSG(!__same_type(*(a), *(b)), \ "type mismatch in memcat_p()"); \ (typeof(*a) *)__memcat_p((void **)(a), (void **)(b)); \ }) #ifndef __HAVE_ARCH_MEMCPY extern void * memcpy(void *,const void *,__kernel_size_t); #endif #ifndef __HAVE_ARCH_MEMMOVE extern void * memmove(void *,const void *,__kernel_size_t); #endif #ifndef __HAVE_ARCH_MEMSCAN extern void * memscan(void *,int,__kernel_size_t); #endif #ifndef __HAVE_ARCH_MEMCMP extern 
int memcmp(const void *,const void *,__kernel_size_t); #endif #ifndef __HAVE_ARCH_BCMP extern int bcmp(const void *,const void *,__kernel_size_t); #endif #ifndef __HAVE_ARCH_MEMCHR extern void * memchr(const void *,int,__kernel_size_t); #endif #ifndef __HAVE_ARCH_MEMCPY_FLUSHCACHE static inline void memcpy_flushcache(void *dst, const void *src, size_t cnt) { memcpy(dst, src, cnt); } #endif void *memchr_inv(const void *s, int c, size_t n); char *strreplace(char *str, char old, char new); /** * mem_is_zero - Check if an area of memory is all 0's. * @s: The memory area * @n: The size of the area * * Return: True if the area of memory is all 0's. */ static inline bool mem_is_zero(const void *s, size_t n) { return !memchr_inv(s, 0, n); } extern void kfree_const(const void *x); extern char *kstrdup(const char *s, gfp_t gfp) __malloc; extern const char *kstrdup_const(const char *s, gfp_t gfp); extern char *kstrndup(const char *s, size_t len, gfp_t gfp); extern void *kmemdup_noprof(const void *src, size_t len, gfp_t gfp) __realloc_size(2); #define kmemdup(...) alloc_hooks(kmemdup_noprof(__VA_ARGS__)) extern void *kvmemdup(const void *src, size_t len, gfp_t gfp) __realloc_size(2); extern char *kmemdup_nul(const char *s, size_t len, gfp_t gfp); extern void *kmemdup_array(const void *src, size_t count, size_t element_size, gfp_t gfp) __realloc_size(2, 3); /* lib/argv_split.c */ extern char **argv_split(gfp_t gfp, const char *str, int *argcp); extern void argv_free(char **argv); DEFINE_FREE(argv_free, char **, if (!IS_ERR_OR_NULL(_T)) argv_free(_T)) /* lib/cmdline.c */ extern int get_option(char **str, int *pint); extern char *get_options(const char *str, int nints, int *ints); extern unsigned long long memparse(const char *ptr, char **retptr); extern bool parse_option_str(const char *str, const char *option); extern char *next_arg(char *args, char **param, char **val); extern bool sysfs_streq(const char *s1, const char *s2); int match_string(const char * const *array, size_t n, const char *string); int __sysfs_match_string(const char * const *array, size_t n, const char *s); /** * sysfs_match_string - matches given string in an array * @_a: array of strings * @_s: string to match with * * Helper for __sysfs_match_string(). Calculates the size of @a automatically. */ #define sysfs_match_string(_a, _s) __sysfs_match_string(_a, ARRAY_SIZE(_a), _s) #ifdef CONFIG_BINARY_PRINTF __printf(3, 0) int vbin_printf(u32 *bin_buf, size_t size, const char *fmt, va_list args); __printf(3, 0) int bstr_printf(char *buf, size_t size, const char *fmt, const u32 *bin_buf); #endif extern ssize_t memory_read_from_buffer(void *to, size_t count, loff_t *ppos, const void *from, size_t available); int ptr_to_hashval(const void *ptr, unsigned long *hashval_out); size_t memweight(const void *ptr, size_t bytes); /** * memzero_explicit - Fill a region of memory (e.g. sensitive * keying data) with 0s. * @s: Pointer to the start of the area. * @count: The size of the area. * * Note: usually using memset() is just fine (!), but in cases * where clearing out _local_ data at the end of a scope is * necessary, memzero_explicit() should be used instead in * order to prevent the compiler from optimising away zeroing. * * memzero_explicit() doesn't need an arch-specific version as * it just invokes the one of memset() implicitly. */ static inline void memzero_explicit(void *s, size_t count) { memset(s, 0, count); barrier_data(s); } /** * kbasename - return the last part of a pathname. * * @path: path to extract the filename from. 
* * Returns: * Pointer to the filename portion inside @path. If no '/' exists, * returns @path unchanged. */ static inline const char *kbasename(const char *path) { const char *tail = strrchr(path, '/'); return tail ? tail + 1 : path; } #if !defined(__NO_FORTIFY) && defined(__OPTIMIZE__) && defined(CONFIG_FORTIFY_SOURCE) #include <linux/fortify-string.h> #endif #ifndef unsafe_memcpy #define unsafe_memcpy(dst, src, bytes, justification) \ memcpy(dst, src, bytes) #endif void memcpy_and_pad(void *dest, size_t dest_len, const void *src, size_t count, int pad); /** * strtomem_pad - Copy NUL-terminated string to non-NUL-terminated buffer * * @dest: Pointer of destination character array (marked as __nonstring) * @src: Pointer to NUL-terminated string * @pad: Padding character to fill any remaining bytes of @dest after copy * * This is a replacement for strncpy() uses where the destination is not * a NUL-terminated string, but with bounds checking on the source size, and * an explicit padding character. If padding is not required, use strtomem(). * * Note that the size of @dest is not an argument, as the length of @dest * must be discoverable by the compiler. */ #define strtomem_pad(dest, src, pad) do { \ const size_t _dest_len = __must_be_byte_array(dest) + \ __must_be_noncstr(dest) + \ ARRAY_SIZE(dest); \ const size_t _src_len = __must_be_cstr(src) + \ __builtin_object_size(src, 1); \ \ BUILD_BUG_ON(!__builtin_constant_p(_dest_len) || \ _dest_len == (size_t)-1); \ memcpy_and_pad(dest, _dest_len, src, \ strnlen(src, min(_src_len, _dest_len)), pad); \ } while (0) /** * strtomem - Copy NUL-terminated string to non-NUL-terminated buffer * * @dest: Pointer of destination character array (marked as __nonstring) * @src: Pointer to NUL-terminated string * * This is a replacement for strncpy() uses where the destination is not * a NUL-terminated string, but with bounds checking on the source size, and * without trailing padding. If padding is required, use strtomem_pad(). * * Note that the size of @dest is not an argument, as the length of @dest * must be discoverable by the compiler. */ #define strtomem(dest, src) do { \ const size_t _dest_len = __must_be_byte_array(dest) + \ __must_be_noncstr(dest) + \ ARRAY_SIZE(dest); \ const size_t _src_len = __must_be_cstr(src) + \ __builtin_object_size(src, 1); \ \ BUILD_BUG_ON(!__builtin_constant_p(_dest_len) || \ _dest_len == (size_t)-1); \ memcpy(dest, src, strnlen(src, min(_src_len, _dest_len))); \ } while (0) /** * memtostr - Copy a possibly non-NUL-term string to a NUL-term string * @dest: Pointer to destination NUL-terminates string * @src: Pointer to character array (likely marked as __nonstring) * * This is a replacement for strncpy() uses where the source is not * a NUL-terminated string. * * Note that sizes of @dest and @src must be known at compile-time. 
*/ #define memtostr(dest, src) do { \ const size_t _dest_len = __must_be_byte_array(dest) + \ __must_be_cstr(dest) + \ ARRAY_SIZE(dest); \ const size_t _src_len = __must_be_noncstr(src) + \ __builtin_object_size(src, 1); \ const size_t _src_chars = strnlen(src, _src_len); \ const size_t _copy_len = min(_dest_len - 1, _src_chars); \ \ BUILD_BUG_ON(!__builtin_constant_p(_dest_len) || \ !__builtin_constant_p(_src_len) || \ _dest_len == 0 || _dest_len == (size_t)-1 || \ _src_len == 0 || _src_len == (size_t)-1); \ memcpy(dest, src, _copy_len); \ dest[_copy_len] = '\0'; \ } while (0) /** * memtostr_pad - Copy a possibly non-NUL-term string to a NUL-term string * with NUL padding in the destination * @dest: Pointer to destination NUL-terminates string * @src: Pointer to character array (likely marked as __nonstring) * * This is a replacement for strncpy() uses where the source is not * a NUL-terminated string. * * Note that sizes of @dest and @src must be known at compile-time. */ #define memtostr_pad(dest, src) do { \ const size_t _dest_len = __must_be_byte_array(dest) + \ __must_be_cstr(dest) + \ ARRAY_SIZE(dest); \ const size_t _src_len = __must_be_noncstr(src) + \ __builtin_object_size(src, 1); \ const size_t _src_chars = strnlen(src, _src_len); \ const size_t _copy_len = min(_dest_len - 1, _src_chars); \ \ BUILD_BUG_ON(!__builtin_constant_p(_dest_len) || \ !__builtin_constant_p(_src_len) || \ _dest_len == 0 || _dest_len == (size_t)-1 || \ _src_len == 0 || _src_len == (size_t)-1); \ memcpy(dest, src, _copy_len); \ memset(&dest[_copy_len], 0, _dest_len - _copy_len); \ } while (0) /** * memset_after - Set a value after a struct member to the end of a struct * * @obj: Address of target struct instance * @v: Byte value to repeatedly write * @member: after which struct member to start writing bytes * * This is good for clearing padding following the given member. */ #define memset_after(obj, v, member) \ ({ \ u8 *__ptr = (u8 *)(obj); \ typeof(v) __val = (v); \ memset(__ptr + offsetofend(typeof(*(obj)), member), __val, \ sizeof(*(obj)) - offsetofend(typeof(*(obj)), member)); \ }) /** * memset_startat - Set a value starting at a member to the end of a struct * * @obj: Address of target struct instance * @v: Byte value to repeatedly write * @member: struct member to start writing at * * Note that if there is padding between the prior member and the target * member, memset_after() should be used to clear the prior padding. */ #define memset_startat(obj, v, member) \ ({ \ u8 *__ptr = (u8 *)(obj); \ typeof(v) __val = (v); \ memset(__ptr + offsetof(typeof(*(obj)), member), __val, \ sizeof(*(obj)) - offsetof(typeof(*(obj)), member)); \ }) /** * str_has_prefix - Test if a string has a given prefix * @str: The string to test * @prefix: The string to see if @str starts with * * A common way to test a prefix of a string is to do: * strncmp(str, prefix, sizeof(prefix) - 1) * * But this can lead to bugs due to typos, or if prefix is a pointer * and not a constant. Instead use str_has_prefix(). * * Returns: * * strlen(@prefix) if @str starts with @prefix * * 0 if @str does not start with @prefix */ static __always_inline size_t str_has_prefix(const char *str, const char *prefix) { size_t len = strlen(prefix); return strncmp(str, prefix, len) == 0 ? len : 0; } /** * strstarts - does @str start with @prefix? * @str: string to examine * @prefix: prefix to look for. * * Returns: * True if @str begins with @prefix. False in all other cases. 
*/ static inline bool strstarts(const char *str, const char *prefix) { return strncmp(str, prefix, strlen(prefix)) == 0; } /** * strends - Check if a string ends with another string. * @str: NULL-terminated string to check against @suffix * @suffix: NULL-terminated string defining the suffix to look for in @str * * Returns: * True if @str ends with @suffix. False in all other cases. */ static inline bool __attribute__((nonnull(1, 2))) strends(const char *str, const char *suffix) { unsigned int str_len = strlen(str), suffix_len = strlen(suffix); if (str_len < suffix_len) return false; return !(strcmp(str + str_len - suffix_len, suffix)); } #endif /* _LINUX_STRING_H_ */
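The helpers above boil down to two bounded copy directions: strtomem()/strtomem_pad() copy a NUL-terminated source into a fixed-size, non-NUL-terminated array (optionally padding the remainder), while memtostr()/memtostr_pad() copy a possibly unterminated array into a buffer that is then explicitly NUL-terminated. A minimal, hypothetical userspace sketch of those copy semantics follows, using only memcpy()/memset()/strnlen(); the real macros additionally verify at compile time that the destination is a byte array of known size, which this sketch cannot reproduce.

#include <stdio.h>
#include <string.h>

int main(void)
{
	char fixed[8];           /* non-NUL-terminated buffer (kernel: marked __nonstring) */
	char cstr[16];           /* NUL-terminated destination */
	const char *src = "abc";

	/* like strtomem_pad(fixed, src, 0): bounded copy, then zero-pad the rest */
	size_t n = strnlen(src, sizeof(fixed));
	memcpy(fixed, src, n);
	memset(fixed + n, 0, sizeof(fixed) - n);

	/* like memtostr(cstr, fixed): bounded copy plus explicit NUL termination */
	size_t chars = strnlen(fixed, sizeof(fixed));
	size_t copy = chars < sizeof(cstr) - 1 ? chars : sizeof(cstr) - 1;
	memcpy(cstr, fixed, copy);
	cstr[copy] = '\0';

	printf("%s\n", cstr);	/* prints "abc" */
	return 0;
}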
/* SPDX-License-Identifier: GPL-2.0 */
#include <linux/ipv6.h>
#include <net/dsfield.h>
#include <net/xfrm.h>

#ifndef XFRM_INOUT_H
#define XFRM_INOUT_H 1

static inline void xfrm4_extract_header(struct sk_buff *skb)
{
	const struct iphdr *iph = ip_hdr(skb);

	XFRM_MODE_SKB_CB(skb)->ihl = sizeof(*iph);
	XFRM_MODE_SKB_CB(skb)->id = iph->id;
	XFRM_MODE_SKB_CB(skb)->frag_off = iph->frag_off;
	XFRM_MODE_SKB_CB(skb)->tos = iph->tos;
	XFRM_MODE_SKB_CB(skb)->ttl = iph->ttl;
	XFRM_MODE_SKB_CB(skb)->optlen = iph->ihl * 4 - sizeof(*iph);
	memset(XFRM_MODE_SKB_CB(skb)->flow_lbl, 0,
	       sizeof(XFRM_MODE_SKB_CB(skb)->flow_lbl));
}

static inline void xfrm6_extract_header(struct sk_buff *skb)
{
#if IS_ENABLED(CONFIG_IPV6)
	struct ipv6hdr *iph = ipv6_hdr(skb);

	XFRM_MODE_SKB_CB(skb)->ihl = sizeof(*iph);
	XFRM_MODE_SKB_CB(skb)->id = 0;
	XFRM_MODE_SKB_CB(skb)->frag_off = htons(IP_DF);
	XFRM_MODE_SKB_CB(skb)->tos = ipv6_get_dsfield(iph);
	XFRM_MODE_SKB_CB(skb)->ttl = iph->hop_limit;
	XFRM_MODE_SKB_CB(skb)->optlen = 0;
	memcpy(XFRM_MODE_SKB_CB(skb)->flow_lbl, iph->flow_lbl,
	       sizeof(XFRM_MODE_SKB_CB(skb)->flow_lbl));
#else
	WARN_ON_ONCE(1);
#endif
}

static inline void xfrm6_beet_make_header(struct sk_buff *skb)
{
	struct ipv6hdr *iph = ipv6_hdr(skb);

	iph->version = 6;

	memcpy(iph->flow_lbl, XFRM_MODE_SKB_CB(skb)->flow_lbl,
	       sizeof(iph->flow_lbl));
	iph->nexthdr = XFRM_MODE_SKB_CB(skb)->protocol;

	ipv6_change_dsfield(iph, 0, XFRM_MODE_SKB_CB(skb)->tos);
	iph->hop_limit = XFRM_MODE_SKB_CB(skb)->ttl;
}

static inline void xfrm4_beet_make_header(struct sk_buff *skb)
{
	struct iphdr *iph = ip_hdr(skb);

	iph->ihl = 5;
	iph->version = 4;

	iph->protocol = XFRM_MODE_SKB_CB(skb)->protocol;
	iph->tos = XFRM_MODE_SKB_CB(skb)->tos;

	iph->id = XFRM_MODE_SKB_CB(skb)->id;
	iph->frag_off = XFRM_MODE_SKB_CB(skb)->frag_off;
	iph->ttl = XFRM_MODE_SKB_CB(skb)->ttl;
}

#endif
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_LIST_H
#define _LINUX_LIST_H

#include <linux/container_of.h>
#include <linux/types.h>
#include <linux/stddef.h>
#include <linux/poison.h>
#include <linux/const.h>

#include <asm/barrier.h>

/*
 * Circular doubly linked list implementation.
 *
 * Some of the internal functions ("__xxx") are useful when
 * manipulating whole lists rather than single entries, as
 * sometimes we already know the next/prev entries and we can
 * generate better code by using them directly rather than
 * using the generic single-entry routines.
 */

/**
 * LIST_HEAD_INIT - initialize a &struct list_head's links to point to itself
 * @name: name of the list_head
 */
#define LIST_HEAD_INIT(name) { &(name), &(name) }

/**
 * LIST_HEAD - definition of a &struct list_head with initialization values
 * @name: name of the list_head
 */
#define LIST_HEAD(name) \
	struct list_head name = LIST_HEAD_INIT(name)

/**
 * INIT_LIST_HEAD - Initialize a list_head structure
 * @list: list_head structure to be initialized.
 *
 * Initializes the list_head to point to itself. If it is a list header,
 * the result is an empty list.
 */
static inline void INIT_LIST_HEAD(struct list_head *list)
{
	WRITE_ONCE(list->next, list);
	WRITE_ONCE(list->prev, list);
}

#ifdef CONFIG_LIST_HARDENED

#ifdef CONFIG_DEBUG_LIST
# define __list_valid_slowpath
#else
# define __list_valid_slowpath __cold __preserve_most
#endif

/*
 * Performs the full set of list corruption checks before __list_add().
 * On list corruption reports a warning, and returns false.
 */
bool __list_valid_slowpath __list_add_valid_or_report(struct list_head *new,
						      struct list_head *prev,
						      struct list_head *next);

/*
 * Performs list corruption checks before __list_add(). Returns false if a
 * corruption is detected, true otherwise.
* * With CONFIG_LIST_HARDENED only, performs minimal list integrity checking * inline to catch non-faulting corruptions, and only if a corruption is * detected calls the reporting function __list_add_valid_or_report(). */ static __always_inline bool __list_add_valid(struct list_head *new, struct list_head *prev, struct list_head *next) { bool ret = true; if (!IS_ENABLED(CONFIG_DEBUG_LIST)) { /* * With the hardening version, elide checking if next and prev * are NULL, since the immediate dereference of them below would * result in a fault if NULL. * * With the reduced set of checks, we can afford to inline the * checks, which also gives the compiler a chance to elide some * of them completely if they can be proven at compile-time. If * one of the pre-conditions does not hold, the slow-path will * show a report which pre-condition failed. */ if (likely(next->prev == prev && prev->next == next && new != prev && new != next)) return true; ret = false; } ret &= __list_add_valid_or_report(new, prev, next); return ret; } /* * Performs the full set of list corruption checks before __list_del_entry(). * On list corruption reports a warning, and returns false. */ bool __list_valid_slowpath __list_del_entry_valid_or_report(struct list_head *entry); /* * Performs list corruption checks before __list_del_entry(). Returns false if a * corruption is detected, true otherwise. * * With CONFIG_LIST_HARDENED only, performs minimal list integrity checking * inline to catch non-faulting corruptions, and only if a corruption is * detected calls the reporting function __list_del_entry_valid_or_report(). */ static __always_inline bool __list_del_entry_valid(struct list_head *entry) { bool ret = true; if (!IS_ENABLED(CONFIG_DEBUG_LIST)) { struct list_head *prev = entry->prev; struct list_head *next = entry->next; /* * With the hardening version, elide checking if next and prev * are NULL, LIST_POISON1 or LIST_POISON2, since the immediate * dereference of them below would result in a fault. */ if (likely(prev->next == entry && next->prev == entry)) return true; ret = false; } ret &= __list_del_entry_valid_or_report(entry); return ret; } #else static inline bool __list_add_valid(struct list_head *new, struct list_head *prev, struct list_head *next) { return true; } static inline bool __list_del_entry_valid(struct list_head *entry) { return true; } #endif /* * Insert a new entry between two known consecutive entries. * * This is only for internal list manipulation where we know * the prev/next entries already! */ static inline void __list_add(struct list_head *new, struct list_head *prev, struct list_head *next) { if (!__list_add_valid(new, prev, next)) return; next->prev = new; new->next = next; new->prev = prev; WRITE_ONCE(prev->next, new); } /** * list_add - add a new entry * @new: new entry to be added * @head: list head to add it after * * Insert a new entry after the specified head. * This is good for implementing stacks. */ static inline void list_add(struct list_head *new, struct list_head *head) { __list_add(new, head, head->next); } /** * list_add_tail - add a new entry * @new: new entry to be added * @head: list head to add it before * * Insert a new entry before the specified head. * This is useful for implementing queues. */ static inline void list_add_tail(struct list_head *new, struct list_head *head) { __list_add(new, head->prev, head); } /* * Delete a list entry by making the prev/next entries * point to each other. 
* * This is only for internal list manipulation where we know * the prev/next entries already! */ static inline void __list_del(struct list_head * prev, struct list_head * next) { next->prev = prev; WRITE_ONCE(prev->next, next); } /* * Delete a list entry and clear the 'prev' pointer. * * This is a special-purpose list clearing method used in the networking code * for lists allocated as per-cpu, where we don't want to incur the extra * WRITE_ONCE() overhead of a regular list_del_init(). The code that uses this * needs to check the node 'prev' pointer instead of calling list_empty(). */ static inline void __list_del_clearprev(struct list_head *entry) { __list_del(entry->prev, entry->next); entry->prev = NULL; } static inline void __list_del_entry(struct list_head *entry) { if (!__list_del_entry_valid(entry)) return; __list_del(entry->prev, entry->next); } /** * list_del - deletes entry from list. * @entry: the element to delete from the list. * Note: list_empty() on entry does not return true after this, the entry is * in an undefined state. */ static inline void list_del(struct list_head *entry) { __list_del_entry(entry); entry->next = LIST_POISON1; entry->prev = LIST_POISON2; } /** * list_replace - replace old entry by new one * @old : the element to be replaced * @new : the new element to insert * * If @old was empty, it will be overwritten. */ static inline void list_replace(struct list_head *old, struct list_head *new) { new->next = old->next; new->next->prev = new; new->prev = old->prev; new->prev->next = new; } /** * list_replace_init - replace old entry by new one and initialize the old one * @old : the element to be replaced * @new : the new element to insert * * If @old was empty, it will be overwritten. */ static inline void list_replace_init(struct list_head *old, struct list_head *new) { list_replace(old, new); INIT_LIST_HEAD(old); } /** * list_swap - replace entry1 with entry2 and re-add entry1 at entry2's position * @entry1: the location to place entry2 * @entry2: the location to place entry1 */ static inline void list_swap(struct list_head *entry1, struct list_head *entry2) { struct list_head *pos = entry2->prev; list_del(entry2); list_replace(entry1, entry2); if (pos == entry1) pos = entry2; list_add(entry1, pos); } /** * list_del_init - deletes entry from list and reinitialize it. * @entry: the element to delete from the list. */ static inline void list_del_init(struct list_head *entry) { __list_del_entry(entry); INIT_LIST_HEAD(entry); } /** * list_move - delete from one list and add as another's head * @list: the entry to move * @head: the head that will precede our entry */ static inline void list_move(struct list_head *list, struct list_head *head) { __list_del_entry(list); list_add(list, head); } /** * list_move_tail - delete from one list and add as another's tail * @list: the entry to move * @head: the head that will follow our entry */ static inline void list_move_tail(struct list_head *list, struct list_head *head) { __list_del_entry(list); list_add_tail(list, head); } /** * list_bulk_move_tail - move a subsection of a list to its tail * @head: the head that will follow our entry * @first: first entry to move * @last: last entry to move, can be the same as first * * Move all entries between @first and including @last before @head. * All three entries must belong to the same linked list. 
*/ static inline void list_bulk_move_tail(struct list_head *head, struct list_head *first, struct list_head *last) { first->prev->next = last->next; last->next->prev = first->prev; head->prev->next = first; first->prev = head->prev; last->next = head; head->prev = last; } /** * list_is_first -- tests whether @list is the first entry in list @head * @list: the entry to test * @head: the head of the list */ static inline int list_is_first(const struct list_head *list, const struct list_head *head) { return list->prev == head; } /** * list_is_last - tests whether @list is the last entry in list @head * @list: the entry to test * @head: the head of the list */ static inline int list_is_last(const struct list_head *list, const struct list_head *head) { return list->next == head; } /** * list_is_head - tests whether @list is the list @head * @list: the entry to test * @head: the head of the list */ static inline int list_is_head(const struct list_head *list, const struct list_head *head) { return list == head; } /** * list_empty - tests whether a list is empty * @head: the list to test. */ static inline int list_empty(const struct list_head *head) { return READ_ONCE(head->next) == head; } /** * list_del_init_careful - deletes entry from list and reinitialize it. * @entry: the element to delete from the list. * * This is the same as list_del_init(), except designed to be used * together with list_empty_careful() in a way to guarantee ordering * of other memory operations. * * Any memory operations done before a list_del_init_careful() are * guaranteed to be visible after a list_empty_careful() test. */ static inline void list_del_init_careful(struct list_head *entry) { __list_del_entry(entry); WRITE_ONCE(entry->prev, entry); smp_store_release(&entry->next, entry); } /** * list_empty_careful - tests whether a list is empty and not being modified * @head: the list to test * * Description: * tests whether a list is empty _and_ checks that no other CPU might be * in the process of modifying either member (next or prev) * * NOTE: using list_empty_careful() without synchronization * can only be safe if the only activity that can happen * to the list entry is list_del_init(). Eg. it cannot be used * if another CPU could re-list_add() it. */ static inline int list_empty_careful(const struct list_head *head) { struct list_head *next = smp_load_acquire(&head->next); return list_is_head(next, head) && (next == READ_ONCE(head->prev)); } /** * list_rotate_left - rotate the list to the left * @head: the head of the list */ static inline void list_rotate_left(struct list_head *head) { struct list_head *first; if (!list_empty(head)) { first = head->next; list_move_tail(first, head); } } /** * list_rotate_to_front() - Rotate list to specific item. * @list: The desired new front of the list. * @head: The head of the list. * * Rotates list so that @list becomes the new front of the list. */ static inline void list_rotate_to_front(struct list_head *list, struct list_head *head) { /* * Deletes the list head from the list denoted by @head and * places it as the tail of @list, this effectively rotates the * list so that @list is at the front. */ list_move_tail(head, list); } /** * list_is_singular - tests whether a list has just one entry. * @head: the list to test. 
*/ static inline int list_is_singular(const struct list_head *head) { return !list_empty(head) && (head->next == head->prev); } static inline void __list_cut_position(struct list_head *list, struct list_head *head, struct list_head *entry) { struct list_head *new_first = entry->next; list->next = head->next; list->next->prev = list; list->prev = entry; entry->next = list; head->next = new_first; new_first->prev = head; } /** * list_cut_position - cut a list into two * @list: a new list to add all removed entries * @head: a list with entries * @entry: an entry within head, could be the head itself * and if so we won't cut the list * * This helper moves the initial part of @head, up to and * including @entry, from @head to @list. You should * pass on @entry an element you know is on @head. @list * should be an empty list or a list you do not care about * losing its data. * */ static inline void list_cut_position(struct list_head *list, struct list_head *head, struct list_head *entry) { if (list_empty(head)) return; if (list_is_singular(head) && !list_is_head(entry, head) && (entry != head->next)) return; if (list_is_head(entry, head)) INIT_LIST_HEAD(list); else __list_cut_position(list, head, entry); } /** * list_cut_before - cut a list into two, before given entry * @list: a new list to add all removed entries * @head: a list with entries * @entry: an entry within head, could be the head itself * * This helper moves the initial part of @head, up to but * excluding @entry, from @head to @list. You should pass * in @entry an element you know is on @head. @list should * be an empty list or a list you do not care about losing * its data. * If @entry == @head, all entries on @head are moved to * @list. */ static inline void list_cut_before(struct list_head *list, struct list_head *head, struct list_head *entry) { if (head->next == entry) { INIT_LIST_HEAD(list); return; } list->next = head->next; list->next->prev = list; list->prev = entry->prev; list->prev->next = list; head->next = entry; entry->prev = head; } static inline void __list_splice(const struct list_head *list, struct list_head *prev, struct list_head *next) { struct list_head *first = list->next; struct list_head *last = list->prev; first->prev = prev; prev->next = first; last->next = next; next->prev = last; } /** * list_splice - join two lists, this is designed for stacks * @list: the new list to add. * @head: the place to add it in the first list. */ static inline void list_splice(const struct list_head *list, struct list_head *head) { if (!list_empty(list)) __list_splice(list, head, head->next); } /** * list_splice_tail - join two lists, each list being a queue * @list: the new list to add. * @head: the place to add it in the first list. */ static inline void list_splice_tail(struct list_head *list, struct list_head *head) { if (!list_empty(list)) __list_splice(list, head->prev, head); } /** * list_splice_init - join two lists and reinitialise the emptied list. * @list: the new list to add. * @head: the place to add it in the first list. * * The list at @list is reinitialised */ static inline void list_splice_init(struct list_head *list, struct list_head *head) { if (!list_empty(list)) { __list_splice(list, head, head->next); INIT_LIST_HEAD(list); } } /** * list_splice_tail_init - join two lists and reinitialise the emptied list * @list: the new list to add. * @head: the place to add it in the first list. * * Each of the lists is a queue. 
* The list at @list is reinitialised */ static inline void list_splice_tail_init(struct list_head *list, struct list_head *head) { if (!list_empty(list)) { __list_splice(list, head->prev, head); INIT_LIST_HEAD(list); } } /** * list_entry - get the struct for this entry * @ptr: the &struct list_head pointer. * @type: the type of the struct this is embedded in. * @member: the name of the list_head within the struct. */ #define list_entry(ptr, type, member) \ container_of(ptr, type, member) /** * list_first_entry - get the first element from a list * @ptr: the list head to take the element from. * @type: the type of the struct this is embedded in. * @member: the name of the list_head within the struct. * * Note, that list is expected to be not empty. */ #define list_first_entry(ptr, type, member) \ list_entry((ptr)->next, type, member) /** * list_last_entry - get the last element from a list * @ptr: the list head to take the element from. * @type: the type of the struct this is embedded in. * @member: the name of the list_head within the struct. * * Note, that list is expected to be not empty. */ #define list_last_entry(ptr, type, member) \ list_entry((ptr)->prev, type, member) /** * list_first_entry_or_null - get the first element from a list * @ptr: the list head to take the element from. * @type: the type of the struct this is embedded in. * @member: the name of the list_head within the struct. * * Note that if the list is empty, it returns NULL. */ #define list_first_entry_or_null(ptr, type, member) ({ \ struct list_head *head__ = (ptr); \ struct list_head *pos__ = READ_ONCE(head__->next); \ pos__ != head__ ? list_entry(pos__, type, member) : NULL; \ }) /** * list_last_entry_or_null - get the last element from a list * @ptr: the list head to take the element from. * @type: the type of the struct this is embedded in. * @member: the name of the list_head within the struct. * * Note that if the list is empty, it returns NULL. */ #define list_last_entry_or_null(ptr, type, member) ({ \ struct list_head *head__ = (ptr); \ struct list_head *pos__ = READ_ONCE(head__->prev); \ pos__ != head__ ? list_entry(pos__, type, member) : NULL; \ }) /** * list_next_entry - get the next element in list * @pos: the type * to cursor * @member: the name of the list_head within the struct. */ #define list_next_entry(pos, member) \ list_entry((pos)->member.next, typeof(*(pos)), member) /** * list_next_entry_circular - get the next element in list * @pos: the type * to cursor. * @head: the list head to take the element from. * @member: the name of the list_head within the struct. * * Wraparound if pos is the last element (return the first element). * Note, that list is expected to be not empty. */ #define list_next_entry_circular(pos, head, member) \ (list_is_last(&(pos)->member, head) ? \ list_first_entry(head, typeof(*(pos)), member) : list_next_entry(pos, member)) /** * list_prev_entry - get the prev element in list * @pos: the type * to cursor * @member: the name of the list_head within the struct. */ #define list_prev_entry(pos, member) \ list_entry((pos)->member.prev, typeof(*(pos)), member) /** * list_prev_entry_circular - get the prev element in list * @pos: the type * to cursor. * @head: the list head to take the element from. * @member: the name of the list_head within the struct. * * Wraparound if pos is the first element (return the last element). * Note, that list is expected to be not empty. */ #define list_prev_entry_circular(pos, head, member) \ (list_is_first(&(pos)->member, head) ? 
\ list_last_entry(head, typeof(*(pos)), member) : list_prev_entry(pos, member)) /** * list_for_each - iterate over a list * @pos: the &struct list_head to use as a loop cursor. * @head: the head for your list. */ #define list_for_each(pos, head) \ for (pos = (head)->next; !list_is_head(pos, (head)); pos = pos->next) /** * list_for_each_continue - continue iteration over a list * @pos: the &struct list_head to use as a loop cursor. * @head: the head for your list. * * Continue to iterate over a list, continuing after the current position. */ #define list_for_each_continue(pos, head) \ for (pos = pos->next; !list_is_head(pos, (head)); pos = pos->next) /** * list_for_each_prev - iterate over a list backwards * @pos: the &struct list_head to use as a loop cursor. * @head: the head for your list. */ #define list_for_each_prev(pos, head) \ for (pos = (head)->prev; !list_is_head(pos, (head)); pos = pos->prev) /** * list_for_each_safe - iterate over a list safe against removal of list entry * @pos: the &struct list_head to use as a loop cursor. * @n: another &struct list_head to use as temporary storage * @head: the head for your list. */ #define list_for_each_safe(pos, n, head) \ for (pos = (head)->next, n = pos->next; \ !list_is_head(pos, (head)); \ pos = n, n = pos->next) /** * list_for_each_prev_safe - iterate over a list backwards safe against removal of list entry * @pos: the &struct list_head to use as a loop cursor. * @n: another &struct list_head to use as temporary storage * @head: the head for your list. */ #define list_for_each_prev_safe(pos, n, head) \ for (pos = (head)->prev, n = pos->prev; \ !list_is_head(pos, (head)); \ pos = n, n = pos->prev) /** * list_count_nodes - count nodes in the list * @head: the head for your list. */ static inline size_t list_count_nodes(struct list_head *head) { struct list_head *pos; size_t count = 0; list_for_each(pos, head) count++; return count; } /** * list_entry_is_head - test if the entry points to the head of the list * @pos: the type * to cursor * @head: the head for your list. * @member: the name of the list_head within the struct. */ #define list_entry_is_head(pos, head, member) \ list_is_head(&pos->member, (head)) /** * list_for_each_entry - iterate over list of given type * @pos: the type * to use as a loop cursor. * @head: the head for your list. * @member: the name of the list_head within the struct. */ #define list_for_each_entry(pos, head, member) \ for (pos = list_first_entry(head, typeof(*pos), member); \ !list_entry_is_head(pos, head, member); \ pos = list_next_entry(pos, member)) /** * list_for_each_entry_reverse - iterate backwards over list of given type. * @pos: the type * to use as a loop cursor. * @head: the head for your list. * @member: the name of the list_head within the struct. */ #define list_for_each_entry_reverse(pos, head, member) \ for (pos = list_last_entry(head, typeof(*pos), member); \ !list_entry_is_head(pos, head, member); \ pos = list_prev_entry(pos, member)) /** * list_prepare_entry - prepare a pos entry for use in list_for_each_entry_continue() * @pos: the type * to use as a start point * @head: the head of the list * @member: the name of the list_head within the struct. * * Prepares a pos entry for use as a start point in list_for_each_entry_continue(). */ #define list_prepare_entry(pos, head, member) \ ((pos) ? : list_entry(head, typeof(*pos), member)) /** * list_for_each_entry_continue - continue iteration over list of given type * @pos: the type * to use as a loop cursor. * @head: the head for your list. 
* @member: the name of the list_head within the struct. * * Continue to iterate over list of given type, continuing after * the current position. */ #define list_for_each_entry_continue(pos, head, member) \ for (pos = list_next_entry(pos, member); \ !list_entry_is_head(pos, head, member); \ pos = list_next_entry(pos, member)) /** * list_for_each_entry_continue_reverse - iterate backwards from the given point * @pos: the type * to use as a loop cursor. * @head: the head for your list. * @member: the name of the list_head within the struct. * * Start to iterate over list of given type backwards, continuing after * the current position. */ #define list_for_each_entry_continue_reverse(pos, head, member) \ for (pos = list_prev_entry(pos, member); \ !list_entry_is_head(pos, head, member); \ pos = list_prev_entry(pos, member)) /** * list_for_each_entry_from - iterate over list of given type from the current point * @pos: the type * to use as a loop cursor. * @head: the head for your list. * @member: the name of the list_head within the struct. * * Iterate over list of given type, continuing from current position. */ #define list_for_each_entry_from(pos, head, member) \ for (; !list_entry_is_head(pos, head, member); \ pos = list_next_entry(pos, member)) /** * list_for_each_entry_from_reverse - iterate backwards over list of given type * from the current point * @pos: the type * to use as a loop cursor. * @head: the head for your list. * @member: the name of the list_head within the struct. * * Iterate backwards over list of given type, continuing from current position. */ #define list_for_each_entry_from_reverse(pos, head, member) \ for (; !list_entry_is_head(pos, head, member); \ pos = list_prev_entry(pos, member)) /** * list_for_each_entry_safe - iterate over list of given type safe against removal of list entry * @pos: the type * to use as a loop cursor. * @n: another type * to use as temporary storage * @head: the head for your list. * @member: the name of the list_head within the struct. */ #define list_for_each_entry_safe(pos, n, head, member) \ for (pos = list_first_entry(head, typeof(*pos), member), \ n = list_next_entry(pos, member); \ !list_entry_is_head(pos, head, member); \ pos = n, n = list_next_entry(n, member)) /** * list_for_each_entry_safe_continue - continue list iteration safe against removal * @pos: the type * to use as a loop cursor. * @n: another type * to use as temporary storage * @head: the head for your list. * @member: the name of the list_head within the struct. * * Iterate over list of given type, continuing after current point, * safe against removal of list entry. */ #define list_for_each_entry_safe_continue(pos, n, head, member) \ for (pos = list_next_entry(pos, member), \ n = list_next_entry(pos, member); \ !list_entry_is_head(pos, head, member); \ pos = n, n = list_next_entry(n, member)) /** * list_for_each_entry_safe_from - iterate over list from current point safe against removal * @pos: the type * to use as a loop cursor. * @n: another type * to use as temporary storage * @head: the head for your list. * @member: the name of the list_head within the struct. * * Iterate over list of given type from current point, safe against * removal of list entry. 
*/ #define list_for_each_entry_safe_from(pos, n, head, member) \ for (n = list_next_entry(pos, member); \ !list_entry_is_head(pos, head, member); \ pos = n, n = list_next_entry(n, member)) /** * list_for_each_entry_safe_reverse - iterate backwards over list safe against removal * @pos: the type * to use as a loop cursor. * @n: another type * to use as temporary storage * @head: the head for your list. * @member: the name of the list_head within the struct. * * Iterate backwards over list of given type, safe against removal * of list entry. */ #define list_for_each_entry_safe_reverse(pos, n, head, member) \ for (pos = list_last_entry(head, typeof(*pos), member), \ n = list_prev_entry(pos, member); \ !list_entry_is_head(pos, head, member); \ pos = n, n = list_prev_entry(n, member)) /** * list_safe_reset_next - reset a stale list_for_each_entry_safe loop * @pos: the loop cursor used in the list_for_each_entry_safe loop * @n: temporary storage used in list_for_each_entry_safe * @member: the name of the list_head within the struct. * * list_safe_reset_next is not safe to use in general if the list may be * modified concurrently (eg. the lock is dropped in the loop body). An * exception to this is if the cursor element (pos) is pinned in the list, * and list_safe_reset_next is called after re-taking the lock and before * completing the current iteration of the loop body. */ #define list_safe_reset_next(pos, n, member) \ n = list_next_entry(pos, member) /* * Double linked lists with a single pointer list head. * Mostly useful for hash tables where the two pointer list head is * too wasteful. * You lose the ability to access the tail in O(1). */ #define HLIST_HEAD_INIT { .first = NULL } #define HLIST_HEAD(name) struct hlist_head name = { .first = NULL } #define INIT_HLIST_HEAD(ptr) ((ptr)->first = NULL) static inline void INIT_HLIST_NODE(struct hlist_node *h) { h->next = NULL; h->pprev = NULL; } /** * hlist_unhashed - Has node been removed from list and reinitialized? * @h: Node to be checked * * Not that not all removal functions will leave a node in unhashed * state. For example, hlist_nulls_del_init_rcu() does leave the * node in unhashed state, but hlist_nulls_del() does not. */ static inline int hlist_unhashed(const struct hlist_node *h) { return !h->pprev; } /** * hlist_unhashed_lockless - Version of hlist_unhashed for lockless use * @h: Node to be checked * * This variant of hlist_unhashed() must be used in lockless contexts * to avoid potential load-tearing. The READ_ONCE() is paired with the * various WRITE_ONCE() in hlist helpers that are defined below. */ static inline int hlist_unhashed_lockless(const struct hlist_node *h) { return !READ_ONCE(h->pprev); } /** * hlist_empty - Is the specified hlist_head structure an empty hlist? * @h: Structure to check. */ static inline int hlist_empty(const struct hlist_head *h) { return !READ_ONCE(h->first); } static inline void __hlist_del(struct hlist_node *n) { struct hlist_node *next = n->next; struct hlist_node **pprev = n->pprev; WRITE_ONCE(*pprev, next); if (next) WRITE_ONCE(next->pprev, pprev); } /** * hlist_del - Delete the specified hlist_node from its list * @n: Node to delete. * * Note that this function leaves the node in hashed state. Use * hlist_del_init() or similar instead to unhash @n. */ static inline void hlist_del(struct hlist_node *n) { __hlist_del(n); n->next = LIST_POISON1; n->pprev = LIST_POISON2; } /** * hlist_del_init - Delete the specified hlist_node from its list and initialize * @n: Node to delete. 
* * Note that this function leaves the node in unhashed state. */ static inline void hlist_del_init(struct hlist_node *n) { if (!hlist_unhashed(n)) { __hlist_del(n); INIT_HLIST_NODE(n); } } /** * hlist_add_head - add a new entry at the beginning of the hlist * @n: new entry to be added * @h: hlist head to add it after * * Insert a new entry after the specified head. * This is good for implementing stacks. */ static inline void hlist_add_head(struct hlist_node *n, struct hlist_head *h) { struct hlist_node *first = h->first; WRITE_ONCE(n->next, first); if (first) WRITE_ONCE(first->pprev, &n->next); WRITE_ONCE(h->first, n); WRITE_ONCE(n->pprev, &h->first); } /** * hlist_add_before - add a new entry before the one specified * @n: new entry to be added * @next: hlist node to add it before, which must be non-NULL */ static inline void hlist_add_before(struct hlist_node *n, struct hlist_node *next) { WRITE_ONCE(n->pprev, next->pprev); WRITE_ONCE(n->next, next); WRITE_ONCE(next->pprev, &n->next); WRITE_ONCE(*(n->pprev), n); } /** * hlist_add_behind - add a new entry after the one specified * @n: new entry to be added * @prev: hlist node to add it after, which must be non-NULL */ static inline void hlist_add_behind(struct hlist_node *n, struct hlist_node *prev) { WRITE_ONCE(n->next, prev->next); WRITE_ONCE(prev->next, n); WRITE_ONCE(n->pprev, &prev->next); if (n->next) WRITE_ONCE(n->next->pprev, &n->next); } /** * hlist_add_fake - create a fake hlist consisting of a single headless node * @n: Node to make a fake list out of * * This makes @n appear to be its own predecessor on a headless hlist. * The point of this is to allow things like hlist_del() to work correctly * in cases where there is no list. */ static inline void hlist_add_fake(struct hlist_node *n) { n->pprev = &n->next; } /** * hlist_fake: Is this node a fake hlist? * @h: Node to check for being a self-referential fake hlist. */ static inline bool hlist_fake(struct hlist_node *h) { return h->pprev == &h->next; } /** * hlist_is_singular_node - is node the only element of the specified hlist? * @n: Node to check for singularity. * @h: Header for potentially singular list. * * Check whether the node is the only node of the head without * accessing head, thus avoiding unnecessary cache misses. */ static inline bool hlist_is_singular_node(struct hlist_node *n, struct hlist_head *h) { return !n->next && n->pprev == &h->first; } /** * hlist_move_list - Move an hlist * @old: hlist_head for old list. * @new: hlist_head for new list. * * Move a list from one list head to another. Fixup the pprev * reference of the first entry if it exists. */ static inline void hlist_move_list(struct hlist_head *old, struct hlist_head *new) { new->first = old->first; if (new->first) new->first->pprev = &new->first; old->first = NULL; } /** * hlist_splice_init() - move all entries from one list to another * @from: hlist_head from which entries will be moved * @last: last entry on the @from list * @to: hlist_head to which entries will be moved * * @to can be empty, @from must contain at least @last. 
*/ static inline void hlist_splice_init(struct hlist_head *from, struct hlist_node *last, struct hlist_head *to) { if (to->first) to->first->pprev = &last->next; last->next = to->first; to->first = from->first; from->first->pprev = &to->first; from->first = NULL; } #define hlist_entry(ptr, type, member) container_of(ptr,type,member) #define hlist_for_each(pos, head) \ for (pos = (head)->first; pos ; pos = pos->next) #define hlist_for_each_safe(pos, n, head) \ for (pos = (head)->first; pos && ({ n = pos->next; 1; }); \ pos = n) #define hlist_entry_safe(ptr, type, member) \ ({ typeof(ptr) ____ptr = (ptr); \ ____ptr ? hlist_entry(____ptr, type, member) : NULL; \ }) /** * hlist_for_each_entry - iterate over list of given type * @pos: the type * to use as a loop cursor. * @head: the head for your list. * @member: the name of the hlist_node within the struct. */ #define hlist_for_each_entry(pos, head, member) \ for (pos = hlist_entry_safe((head)->first, typeof(*(pos)), member);\ pos; \ pos = hlist_entry_safe((pos)->member.next, typeof(*(pos)), member)) /** * hlist_for_each_entry_continue - iterate over a hlist continuing after current point * @pos: the type * to use as a loop cursor. * @member: the name of the hlist_node within the struct. */ #define hlist_for_each_entry_continue(pos, member) \ for (pos = hlist_entry_safe((pos)->member.next, typeof(*(pos)), member);\ pos; \ pos = hlist_entry_safe((pos)->member.next, typeof(*(pos)), member)) /** * hlist_for_each_entry_from - iterate over a hlist continuing from current point * @pos: the type * to use as a loop cursor. * @member: the name of the hlist_node within the struct. */ #define hlist_for_each_entry_from(pos, member) \ for (; pos; \ pos = hlist_entry_safe((pos)->member.next, typeof(*(pos)), member)) /** * hlist_for_each_entry_safe - iterate over list of given type safe against removal of list entry * @pos: the type * to use as a loop cursor. * @n: a &struct hlist_node to use as temporary storage * @head: the head for your list. * @member: the name of the hlist_node within the struct. */ #define hlist_for_each_entry_safe(pos, n, head, member) \ for (pos = hlist_entry_safe((head)->first, typeof(*pos), member);\ pos && ({ n = pos->member.next; 1; }); \ pos = hlist_entry_safe(n, typeof(*pos), member)) /** * hlist_count_nodes - count nodes in the hlist * @head: the head for your hlist. */ static inline size_t hlist_count_nodes(struct hlist_head *head) { struct hlist_node *pos; size_t count = 0; hlist_for_each(pos, head) count++; return count; } #endif
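All of the list helpers in this header are intrusive: the struct list_head lives inside the object being linked, and the containing object is recovered with container_of(). Below is a stripped-down, hypothetical userspace re-implementation of that pattern, just to show the shape of typical usage; names such as struct item and demo_for_each_entry() are invented, and the kernel versions add READ_ONCE()/WRITE_ONCE() annotations, hardening checks, and typeof()-based type inference in list_for_each_entry().

#include <stdio.h>
#include <stddef.h>

struct list_head { struct list_head *next, *prev; };

#define LIST_HEAD_INIT(name) { &(name), &(name) }
#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))
/* simplified: the kernel's list_for_each_entry() infers the type from @pos */
#define demo_for_each_entry(pos, head, type, member) \
	for (pos = container_of((head)->next, type, member); \
	     &pos->member != (head); \
	     pos = container_of(pos->member.next, type, member))

static void list_add_tail(struct list_head *new, struct list_head *head)
{
	new->prev = head->prev;
	new->next = head;
	head->prev->next = new;
	head->prev = new;
}

struct item {
	int value;
	struct list_head node;	/* embedded linkage, as in kernel code */
};

int main(void)
{
	struct list_head demo_list = LIST_HEAD_INIT(demo_list);
	struct item a = { .value = 1 }, b = { .value = 2 };
	struct item *pos;

	list_add_tail(&a.node, &demo_list);
	list_add_tail(&b.node, &demo_list);

	demo_for_each_entry(pos, &demo_list, struct item, node)
		printf("%d\n", pos->value);	/* prints 1 then 2 */
	return 0;
}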
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 *	"TEE" target extension for Xtables
 *	Copyright © Sebastian Claßen, 2007
 *	Jan Engelhardt, 2007-2010
 *
 *	based on ipt_ROUTE.c from Cédric de Launois
 *	<delaunois@info.ucl.be>
 */
#include <linux/module.h>
#include <linux/skbuff.h>
#include <linux/route.h>
#include <linux/netfilter/x_tables.h>
#include <net/net_namespace.h>
#include <net/netns/generic.h>
#include <net/route.h>
#include <net/netfilter/ipv4/nf_dup_ipv4.h>
#include <net/netfilter/ipv6/nf_dup_ipv6.h>
#include <linux/netfilter/xt_TEE.h>

struct xt_tee_priv {
	struct list_head	list;
	struct xt_tee_tginfo	*tginfo;
	int			oif;
};

static unsigned int tee_net_id __read_mostly;
static const union nf_inet_addr tee_zero_address;

struct tee_net {
	struct list_head priv_list;
	/* lock protects the priv_list */
	struct mutex lock;
};

static unsigned int
tee_tg4(struct sk_buff *skb, const struct xt_action_param *par)
{
	const struct xt_tee_tginfo *info = par->targinfo;
	int oif = info->priv ? info->priv->oif : 0;

	nf_dup_ipv4(xt_net(par), skb, xt_hooknum(par), &info->gw.in, oif);

	return XT_CONTINUE;
}

#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES)
static unsigned int
tee_tg6(struct sk_buff *skb, const struct xt_action_param *par)
{
	const struct xt_tee_tginfo *info = par->targinfo;
	int oif = info->priv ?
info->priv->oif : 0; nf_dup_ipv6(xt_net(par), skb, xt_hooknum(par), &info->gw.in6, oif); return XT_CONTINUE; } #endif static int tee_netdev_event(struct notifier_block *this, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct net *net = dev_net(dev); struct tee_net *tn = net_generic(net, tee_net_id); struct xt_tee_priv *priv; mutex_lock(&tn->lock); list_for_each_entry(priv, &tn->priv_list, list) { switch (event) { case NETDEV_REGISTER: if (!strcmp(dev->name, priv->tginfo->oif)) priv->oif = dev->ifindex; break; case NETDEV_UNREGISTER: if (dev->ifindex == priv->oif) priv->oif = -1; break; case NETDEV_CHANGENAME: if (!strcmp(dev->name, priv->tginfo->oif)) priv->oif = dev->ifindex; else if (dev->ifindex == priv->oif) priv->oif = -1; break; } } mutex_unlock(&tn->lock); return NOTIFY_DONE; } static int tee_tg_check(const struct xt_tgchk_param *par) { struct tee_net *tn = net_generic(par->net, tee_net_id); struct xt_tee_tginfo *info = par->targinfo; struct xt_tee_priv *priv; /* 0.0.0.0 and :: not allowed */ if (memcmp(&info->gw, &tee_zero_address, sizeof(tee_zero_address)) == 0) return -EINVAL; if (info->oif[0]) { struct net_device *dev; if (info->oif[sizeof(info->oif)-1] != '\0') return -EINVAL; priv = kzalloc(sizeof(*priv), GFP_KERNEL); if (priv == NULL) return -ENOMEM; priv->tginfo = info; priv->oif = -1; info->priv = priv; dev = dev_get_by_name(par->net, info->oif); if (dev) { priv->oif = dev->ifindex; dev_put(dev); } mutex_lock(&tn->lock); list_add(&priv->list, &tn->priv_list); mutex_unlock(&tn->lock); } else info->priv = NULL; static_key_slow_inc(&xt_tee_enabled); return 0; } static void tee_tg_destroy(const struct xt_tgdtor_param *par) { struct tee_net *tn = net_generic(par->net, tee_net_id); struct xt_tee_tginfo *info = par->targinfo; if (info->priv) { mutex_lock(&tn->lock); list_del(&info->priv->list); mutex_unlock(&tn->lock); kfree(info->priv); } static_key_slow_dec(&xt_tee_enabled); } static struct xt_target tee_tg_reg[] __read_mostly = { { .name = "TEE", .revision = 1, .family = NFPROTO_IPV4, .target = tee_tg4, .targetsize = sizeof(struct xt_tee_tginfo), .usersize = offsetof(struct xt_tee_tginfo, priv), .checkentry = tee_tg_check, .destroy = tee_tg_destroy, .me = THIS_MODULE, }, #if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) { .name = "TEE", .revision = 1, .family = NFPROTO_IPV6, .target = tee_tg6, .targetsize = sizeof(struct xt_tee_tginfo), .usersize = offsetof(struct xt_tee_tginfo, priv), .checkentry = tee_tg_check, .destroy = tee_tg_destroy, .me = THIS_MODULE, }, #endif }; static int __net_init tee_net_init(struct net *net) { struct tee_net *tn = net_generic(net, tee_net_id); INIT_LIST_HEAD(&tn->priv_list); mutex_init(&tn->lock); return 0; } static struct pernet_operations tee_net_ops = { .init = tee_net_init, .id = &tee_net_id, .size = sizeof(struct tee_net), }; static struct notifier_block tee_netdev_notifier = { .notifier_call = tee_netdev_event, }; static int __init tee_tg_init(void) { int ret; ret = register_pernet_subsys(&tee_net_ops); if (ret < 0) return ret; ret = xt_register_targets(tee_tg_reg, ARRAY_SIZE(tee_tg_reg)); if (ret < 0) goto cleanup_subsys; ret = register_netdevice_notifier(&tee_netdev_notifier); if (ret < 0) goto unregister_targets; return 0; unregister_targets: xt_unregister_targets(tee_tg_reg, ARRAY_SIZE(tee_tg_reg)); cleanup_subsys: unregister_pernet_subsys(&tee_net_ops); return ret; } static void __exit tee_tg_exit(void) { unregister_netdevice_notifier(&tee_netdev_notifier); xt_unregister_targets(tee_tg_reg, 
ARRAY_SIZE(tee_tg_reg)); unregister_pernet_subsys(&tee_net_ops); } module_init(tee_tg_init); module_exit(tee_tg_exit); MODULE_AUTHOR("Sebastian Claßen <sebastian.classen@freenet.ag>"); MODULE_AUTHOR("Jan Engelhardt <jengelh@medozas.de>"); MODULE_DESCRIPTION("Xtables: Reroute packet copy"); MODULE_LICENSE("GPL"); MODULE_ALIAS("ipt_TEE"); MODULE_ALIAS("ip6t_TEE");
// SPDX-License-Identifier: GPL-2.0-or-later
/* SCTP kernel implementation
 * (C) Copyright Red Hat Inc. 2017
 *
 * This file is part of the SCTP kernel implementation
 *
 * These functions manipulate sctp stream queue/scheduling.
 *
 * Please send any bug reports or fixes you make to the
 * email address(es):
 *    lksctp developers <linux-sctp@vger.kernel.org>
 *
 * Written or modified by:
 *    Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
 */

#include <linux/list.h>
#include <net/sctp/sctp.h>
#include <net/sctp/sm.h>
#include <net/sctp/stream_sched.h>

/* First Come First Serve (a.k.a. FIFO)
 * RFC DRAFT ndata Section 3.1
 */
static int sctp_sched_fcfs_set(struct sctp_stream *stream, __u16 sid,
			       __u16 value, gfp_t gfp)
{
	return 0;
}

static int sctp_sched_fcfs_get(struct sctp_stream *stream, __u16 sid,
			       __u16 *value)
{
	*value = 0;
	return 0;
}

static int sctp_sched_fcfs_init(struct sctp_stream *stream)
{
	return 0;
}

static int sctp_sched_fcfs_init_sid(struct sctp_stream *stream, __u16 sid,
				    gfp_t gfp)
{
	return 0;
}

static void sctp_sched_fcfs_free_sid(struct sctp_stream *stream, __u16 sid)
{
}

static void sctp_sched_fcfs_enqueue(struct sctp_outq *q,
				    struct sctp_datamsg *msg)
{
}

static struct sctp_chunk *sctp_sched_fcfs_dequeue(struct sctp_outq *q)
{
	struct sctp_stream *stream = &q->asoc->stream;
	struct sctp_chunk *ch = NULL;
	struct list_head *entry;

	if (list_empty(&q->out_chunk_list))
		goto out;

	if (stream->out_curr) {
		ch = list_entry(stream->out_curr->ext->outq.next,
				struct sctp_chunk, stream_list);
	} else {
		entry = q->out_chunk_list.next;
		ch = list_entry(entry, struct sctp_chunk, list);
	}

	sctp_sched_dequeue_common(q, ch);

out:
	return ch;
}

static void sctp_sched_fcfs_dequeue_done(struct sctp_outq *q,
					 struct sctp_chunk *chunk)
{
}

static void sctp_sched_fcfs_sched_all(struct sctp_stream *stream)
{
}

static void sctp_sched_fcfs_unsched_all(struct sctp_stream *stream)
{
}

static const struct sctp_sched_ops sctp_sched_fcfs = {
	.set = sctp_sched_fcfs_set,
	.get = sctp_sched_fcfs_get,
	.init = sctp_sched_fcfs_init,
	.init_sid = sctp_sched_fcfs_init_sid,
	.free_sid = sctp_sched_fcfs_free_sid,
	.enqueue = sctp_sched_fcfs_enqueue,
	.dequeue = sctp_sched_fcfs_dequeue,
	.dequeue_done = sctp_sched_fcfs_dequeue_done,
	.sched_all = sctp_sched_fcfs_sched_all,
	.unsched_all = sctp_sched_fcfs_unsched_all,
};

static void sctp_sched_ops_fcfs_init(void)
{
	sctp_sched_ops_register(SCTP_SS_FCFS, &sctp_sched_fcfs);
}
/* API to other parts of the stack */ static const struct sctp_sched_ops *sctp_sched_ops[SCTP_SS_MAX + 1]; void sctp_sched_ops_register(enum sctp_sched_type sched, const struct sctp_sched_ops *sched_ops) { sctp_sched_ops[sched] = sched_ops; } void sctp_sched_ops_init(void) { sctp_sched_ops_fcfs_init(); sctp_sched_ops_prio_init(); sctp_sched_ops_rr_init(); sctp_sched_ops_fc_init(); sctp_sched_ops_wfq_init(); } static void sctp_sched_free_sched(struct sctp_stream *stream) { const struct sctp_sched_ops *sched = sctp_sched_ops_from_stream(stream); struct sctp_stream_out_ext *soute; int i; sched->unsched_all(stream); for (i = 0; i < stream->outcnt; i++) { soute = SCTP_SO(stream, i)->ext; if (!soute) continue; sched->free_sid(stream, i); /* Give the next scheduler a clean slate. */ memset_after(soute, 0, outq); } } int sctp_sched_set_sched(struct sctp_association *asoc, enum sctp_sched_type sched) { const struct sctp_sched_ops *old = asoc->outqueue.sched; struct sctp_datamsg *msg = NULL; const struct sctp_sched_ops *n; struct sctp_chunk *ch; int i, ret = 0; if (sched > SCTP_SS_MAX) return -EINVAL; n = sctp_sched_ops[sched]; if (old == n) return ret; if (old) sctp_sched_free_sched(&asoc->stream); asoc->outqueue.sched = n; n->init(&asoc->stream); for (i = 0; i < asoc->stream.outcnt; i++) { if (!SCTP_SO(&asoc->stream, i)->ext) continue; ret = n->init_sid(&asoc->stream, i, GFP_ATOMIC); if (ret) goto err; } /* We have to requeue all chunks already queued. */ list_for_each_entry(ch, &asoc->outqueue.out_chunk_list, list) { if (ch->msg == msg) continue; msg = ch->msg; n->enqueue(&asoc->outqueue, msg); } return ret; err: sctp_sched_free_sched(&asoc->stream); asoc->outqueue.sched = &sctp_sched_fcfs; /* Always safe */ return ret; } int sctp_sched_get_sched(struct sctp_association *asoc) { int i; for (i = 0; i <= SCTP_SS_MAX; i++) if (asoc->outqueue.sched == sctp_sched_ops[i]) return i; return 0; } int sctp_sched_set_value(struct sctp_association *asoc, __u16 sid, __u16 value, gfp_t gfp) { if (sid >= asoc->stream.outcnt) return -EINVAL; if (!SCTP_SO(&asoc->stream, sid)->ext) { int ret; ret = sctp_stream_init_ext(&asoc->stream, sid); if (ret) return ret; } return asoc->outqueue.sched->set(&asoc->stream, sid, value, gfp); } int sctp_sched_get_value(struct sctp_association *asoc, __u16 sid, __u16 *value) { if (sid >= asoc->stream.outcnt) return -EINVAL; if (!SCTP_SO(&asoc->stream, sid)->ext) return 0; return asoc->outqueue.sched->get(&asoc->stream, sid, value); } void sctp_sched_dequeue_done(struct sctp_outq *q, struct sctp_chunk *ch) { if (!list_is_last(&ch->frag_list, &ch->msg->chunks) && !q->asoc->peer.intl_capable) { struct sctp_stream_out *sout; __u16 sid; /* datamsg is not finish, so save it as current one, * in case application switch scheduler or a higher * priority stream comes in. 
*/ sid = sctp_chunk_stream_no(ch); sout = SCTP_SO(&q->asoc->stream, sid); q->asoc->stream.out_curr = sout; return; } q->asoc->stream.out_curr = NULL; q->sched->dequeue_done(q, ch); } /* Auxiliary functions for the schedulers */ void sctp_sched_dequeue_common(struct sctp_outq *q, struct sctp_chunk *ch) { list_del_init(&ch->list); list_del_init(&ch->stream_list); q->out_qlen -= ch->skb->len; } int sctp_sched_init_sid(struct sctp_stream *stream, __u16 sid, gfp_t gfp) { const struct sctp_sched_ops *sched = sctp_sched_ops_from_stream(stream); struct sctp_stream_out_ext *ext = SCTP_SO(stream, sid)->ext; INIT_LIST_HEAD(&ext->outq); return sched->init_sid(stream, sid, gfp); } const struct sctp_sched_ops *sctp_sched_ops_from_stream(struct sctp_stream *stream) { struct sctp_association *asoc; asoc = container_of(stream, struct sctp_association, stream); return asoc->outqueue.sched; }
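The registration and dispatch code above follows a plain ops-table pattern: each scheduler is a const struct sctp_sched_ops full of function pointers, stored in the sctp_sched_ops[] array under its enum sctp_sched_type index, and call sites dispatch through asoc->outqueue.sched. The hypothetical, self-contained userspace sketch below illustrates only that pattern (all demo_* names are invented); a real new SCTP scheduler would instead add an enum value, a const struct sctp_sched_ops like sctp_sched_fcfs above, and a call from sctp_sched_ops_init().

#include <stdio.h>

enum demo_sched_type { DEMO_SS_FCFS, DEMO_SS_PRIO, DEMO_SS_MAX = DEMO_SS_PRIO };

struct demo_sched_ops {
	const char *name;
	int (*set)(int sid, int value);
};

static int demo_fcfs_set(int sid, int value) { return 0; }         /* FCFS ignores per-stream values */
static int demo_prio_set(int sid, int value) { return value < 0; } /* pretend validation */

static const struct demo_sched_ops demo_fcfs = { "fcfs", demo_fcfs_set };
static const struct demo_sched_ops demo_prio = { "prio", demo_prio_set };

/* one slot per scheduler type, filled at registration time */
static const struct demo_sched_ops *demo_sched_ops[DEMO_SS_MAX + 1];

static void demo_sched_ops_register(enum demo_sched_type t,
				    const struct demo_sched_ops *ops)
{
	demo_sched_ops[t] = ops;
}

int main(void)
{
	demo_sched_ops_register(DEMO_SS_FCFS, &demo_fcfs);
	demo_sched_ops_register(DEMO_SS_PRIO, &demo_prio);

	/* dispatch through the selected table, as sctp_sched_set_value()
	 * does via asoc->outqueue.sched->set() */
	const struct demo_sched_ops *cur = demo_sched_ops[DEMO_SS_PRIO];
	printf("%s -> %d\n", cur->name, cur->set(1, 3));
	return 0;
}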
2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 
2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 
3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 
// SPDX-License-Identifier: GPL-2.0
/* Multipath TCP
 *
 * Copyright (c) 2017 - 2019, Intel Corporation.
 */

#define pr_fmt(fmt) "MPTCP: " fmt

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/netdevice.h>
#include <linux/sched/signal.h>
#include <linux/atomic.h>
#include <net/aligned_data.h>
#include <net/rps.h>
#include <net/sock.h>
#include <net/inet_common.h>
#include <net/inet_hashtables.h>
#include <net/protocol.h>
#include <net/tcp_states.h>
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
#include <net/transp_v6.h>
#endif
#include <net/mptcp.h>
#include <net/hotdata.h>
#include <net/xfrm.h>
#include <asm/ioctls.h>

#include "protocol.h"
#include "mib.h"

#define CREATE_TRACE_POINTS
#include <trace/events/mptcp.h>

#if IS_ENABLED(CONFIG_MPTCP_IPV6)
struct mptcp6_sock {
	struct mptcp_sock msk;
	struct ipv6_pinfo np;
};
#endif

enum {
	MPTCP_CMSG_TS = BIT(0),
	MPTCP_CMSG_INQ = BIT(1),
};

static struct percpu_counter mptcp_sockets_allocated ____cacheline_aligned_in_smp;

static void __mptcp_destroy_sock(struct sock *sk);
static void mptcp_check_send_data_fin(struct sock *sk);

DEFINE_PER_CPU(struct mptcp_delegated_action, mptcp_delegated_actions) = {
	.bh_lock = INIT_LOCAL_LOCK(bh_lock),
};
static struct net_device *mptcp_napi_dev;

/* Returns end sequence number of the receiver's advertised window */
static u64 mptcp_wnd_end(const struct mptcp_sock *msk)
{
	return READ_ONCE(msk->wnd_end);
}

static const struct proto_ops *mptcp_fallback_tcp_ops(const struct sock *sk)
{
	unsigned short family = READ_ONCE(sk->sk_family);

#if IS_ENABLED(CONFIG_MPTCP_IPV6)
	if (family == AF_INET6)
		return &inet6_stream_ops;
#endif
	WARN_ON_ONCE(family != AF_INET);
	return &inet_stream_ops;
}

bool __mptcp_try_fallback(struct mptcp_sock *msk, int fb_mib)
{
	struct net *net = sock_net((struct sock *)msk);

	if (__mptcp_check_fallback(msk))
		return true;

	/* The caller possibly is not holding the msk socket lock, but
	 * in the fallback case only the current subflow is touching
	 * the OoO queue.
	 */
	if (!RB_EMPTY_ROOT(&msk->out_of_order_queue))
		return false;

	spin_lock_bh(&msk->fallback_lock);
	if (!msk->allow_infinite_fallback) {
		spin_unlock_bh(&msk->fallback_lock);
		return false;
	}

	msk->allow_subflows = false;
	set_bit(MPTCP_FALLBACK_DONE, &msk->flags);
	__MPTCP_INC_STATS(net, fb_mib);
	spin_unlock_bh(&msk->fallback_lock);
	return true;
}

static int __mptcp_socket_create(struct mptcp_sock *msk)
{
	struct mptcp_subflow_context *subflow;
	struct sock *sk = (struct sock *)msk;
	struct socket *ssock;
	int err;

	err = mptcp_subflow_create_socket(sk, sk->sk_family, &ssock);
	if (err)
		return err;

	msk->scaling_ratio = tcp_sk(ssock->sk)->scaling_ratio;
	WRITE_ONCE(msk->first, ssock->sk);
	subflow = mptcp_subflow_ctx(ssock->sk);
	list_add(&subflow->node, &msk->conn_list);
	sock_hold(ssock->sk);
	subflow->request_mptcp = 1;
	subflow->subflow_id = msk->subflow_id++;

	/* This is the first subflow, always with id 0 */
	WRITE_ONCE(subflow->local_id, 0);
	mptcp_sock_graft(msk->first, sk->sk_socket);
	iput(SOCK_INODE(ssock));

	return 0;
}

/* If the MPC handshake is not started, returns the first subflow,
 * eventually allocating it.
 */
struct sock *__mptcp_nmpc_sk(struct mptcp_sock *msk)
{
	struct sock *sk = (struct sock *)msk;
	int ret;

	if (!((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)))
		return ERR_PTR(-EINVAL);

	if (!msk->first) {
		ret = __mptcp_socket_create(msk);
		if (ret)
			return ERR_PTR(ret);
	}

	return msk->first;
}

static void mptcp_drop(struct sock *sk, struct sk_buff *skb)
{
	sk_drops_skbadd(sk, skb);
	__kfree_skb(skb);
}

static bool __mptcp_try_coalesce(struct sock *sk, struct sk_buff *to,
				 struct sk_buff *from, bool *fragstolen,
				 int *delta)
{
	int limit = READ_ONCE(sk->sk_rcvbuf);

	if (unlikely(MPTCP_SKB_CB(to)->cant_coalesce) ||
	    MPTCP_SKB_CB(from)->offset ||
	    ((to->len + from->len) > (limit >> 3)) ||
	    !skb_try_coalesce(to, from, fragstolen, delta))
		return false;

	pr_debug("colesced seq %llx into %llx new len %d new end seq %llx\n",
		 MPTCP_SKB_CB(from)->map_seq, MPTCP_SKB_CB(to)->map_seq,
		 to->len, MPTCP_SKB_CB(from)->end_seq);
	MPTCP_SKB_CB(to)->end_seq = MPTCP_SKB_CB(from)->end_seq;
	return true;
}

static bool mptcp_try_coalesce(struct sock *sk, struct sk_buff *to,
			       struct sk_buff *from)
{
	bool fragstolen;
	int delta;

	if (!__mptcp_try_coalesce(sk, to, from, &fragstolen, &delta))
		return false;

	/* note the fwd memory can reach a negative value after accounting
	 * for the delta, but the later skb free will restore a non
	 * negative one
	 */
	atomic_add(delta, &sk->sk_rmem_alloc);
	sk_mem_charge(sk, delta);
	kfree_skb_partial(from, fragstolen);

	return true;
}

static bool mptcp_ooo_try_coalesce(struct mptcp_sock *msk, struct sk_buff *to,
				   struct sk_buff *from)
{
	if (MPTCP_SKB_CB(from)->map_seq != MPTCP_SKB_CB(to)->end_seq)
		return false;

	return mptcp_try_coalesce((struct sock *)msk, to, from);
}

/* "inspired" by tcp_rcvbuf_grow(), main difference:
 * - mptcp does not maintain a msk-level window clamp
 * - returns true when the receive buffer is actually updated
 */
static bool mptcp_rcvbuf_grow(struct sock *sk, u32 newval)
{
	struct mptcp_sock *msk = mptcp_sk(sk);
	const struct net *net = sock_net(sk);
	u32 rcvwin, rcvbuf, cap, oldval;
	u64 grow;

	oldval = msk->rcvq_space.space;
	msk->rcvq_space.space = newval;
	if (!READ_ONCE(net->ipv4.sysctl_tcp_moderate_rcvbuf) ||
	    (sk->sk_userlocks & SOCK_RCVBUF_LOCK))
		return false;

	/* DRS is always one RTT late. */
	rcvwin = newval << 1;

	/* slow start: allow the sender to double its rate.
	 */
	grow = (u64)rcvwin * (newval - oldval);
	do_div(grow, oldval);
	rcvwin += grow << 1;

	if (!RB_EMPTY_ROOT(&msk->out_of_order_queue))
		rcvwin += MPTCP_SKB_CB(msk->ooo_last_skb)->end_seq - msk->ack_seq;

	cap = READ_ONCE(net->ipv4.sysctl_tcp_rmem[2]);

	rcvbuf = min_t(u32, mptcp_space_from_win(sk, rcvwin), cap);
	if (rcvbuf > sk->sk_rcvbuf) {
		WRITE_ONCE(sk->sk_rcvbuf, rcvbuf);
		return true;
	}
	return false;
}

/* "inspired" by tcp_data_queue_ofo(), main differences:
 * - use mptcp seqs
 * - don't cope with sacks
 */
static void mptcp_data_queue_ofo(struct mptcp_sock *msk, struct sk_buff *skb)
{
	struct sock *sk = (struct sock *)msk;
	struct rb_node **p, *parent;
	u64 seq, end_seq, max_seq;
	struct sk_buff *skb1;

	seq = MPTCP_SKB_CB(skb)->map_seq;
	end_seq = MPTCP_SKB_CB(skb)->end_seq;
	max_seq = atomic64_read(&msk->rcv_wnd_sent);

	pr_debug("msk=%p seq=%llx limit=%llx empty=%d\n", msk, seq, max_seq,
		 RB_EMPTY_ROOT(&msk->out_of_order_queue));
	if (after64(end_seq, max_seq)) {
		/* out of window */
		mptcp_drop(sk, skb);
		pr_debug("oow by %lld, rcv_wnd_sent %llu\n",
			 (unsigned long long)end_seq - (unsigned long)max_seq,
			 (unsigned long long)atomic64_read(&msk->rcv_wnd_sent));
		MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_NODSSWINDOW);
		return;
	}

	p = &msk->out_of_order_queue.rb_node;
	MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_OFOQUEUE);
	if (RB_EMPTY_ROOT(&msk->out_of_order_queue)) {
		rb_link_node(&skb->rbnode, NULL, p);
		rb_insert_color(&skb->rbnode, &msk->out_of_order_queue);
		msk->ooo_last_skb = skb;
		goto end;
	}

	/* with 2 subflows, adding at end of ooo queue is quite likely
	 * Use of ooo_last_skb avoids the O(Log(N)) rbtree lookup.
	 */
	if (mptcp_ooo_try_coalesce(msk, msk->ooo_last_skb, skb)) {
		MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_OFOMERGE);
		MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_OFOQUEUETAIL);
		return;
	}

	/* Can avoid an rbtree lookup if we are adding skb after ooo_last_skb */
	if (!before64(seq, MPTCP_SKB_CB(msk->ooo_last_skb)->end_seq)) {
		MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_OFOQUEUETAIL);
		parent = &msk->ooo_last_skb->rbnode;
		p = &parent->rb_right;
		goto insert;
	}

	/* Find place to insert this segment. Handle overlaps on the way. */
	parent = NULL;
	while (*p) {
		parent = *p;
		skb1 = rb_to_skb(parent);
		if (before64(seq, MPTCP_SKB_CB(skb1)->map_seq)) {
			p = &parent->rb_left;
			continue;
		}
		if (before64(seq, MPTCP_SKB_CB(skb1)->end_seq)) {
			if (!after64(end_seq, MPTCP_SKB_CB(skb1)->end_seq)) {
				/* All the bits are present. Drop. */
				mptcp_drop(sk, skb);
				MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_DUPDATA);
				return;
			}
			if (after64(seq, MPTCP_SKB_CB(skb1)->map_seq)) {
				/* partial overlap:
				 *     |        skb      |
				 *  |     skb1    |
				 * continue traversing
				 */
			} else {
				/* skb's seq == skb1's seq and skb covers skb1.
				 * Replace skb1 with skb.
				 */
				rb_replace_node(&skb1->rbnode, &skb->rbnode,
						&msk->out_of_order_queue);
				mptcp_drop(sk, skb1);
				MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_DUPDATA);
				goto merge_right;
			}
		} else if (mptcp_ooo_try_coalesce(msk, skb1, skb)) {
			MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_OFOMERGE);
			return;
		}
		p = &parent->rb_right;
	}

insert:
	/* Insert segment into RB tree. */
	rb_link_node(&skb->rbnode, parent, p);
	rb_insert_color(&skb->rbnode, &msk->out_of_order_queue);

merge_right:
	/* Remove other segments covered by skb. */
	while ((skb1 = skb_rb_next(skb)) != NULL) {
		if (before64(end_seq, MPTCP_SKB_CB(skb1)->end_seq))
			break;
		rb_erase(&skb1->rbnode, &msk->out_of_order_queue);
		mptcp_drop(sk, skb1);
		MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_DUPDATA);
	}
	/* If there is no skb after us, we are the last_skb !
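The receive-buffer growth above follows TCP's dynamic right-sizing: start from twice the newly measured space (DRS is one RTT late), add slow-start headroom proportional to the growth since the last measurement, then clamp to tcp_rmem[2]. The standalone sketch below is illustrative only; rcvbuf_grow_sketch() and the fixed 1/3 truesize overhead standing in for mptcp_space_from_win() are assumptions, not kernel code.

#include <stdint.h>

/* Illustrative userspace sketch of the window-growth arithmetic in
 * mptcp_rcvbuf_grow() above; the overhead factor is an assumption. */
static uint32_t rcvbuf_grow_sketch(uint32_t oldval, uint32_t newval, uint32_t rmem_cap)
{
	uint64_t rcvwin, grow, rcvbuf;

	if (!oldval)			/* avoid div-by-zero in the sketch */
		oldval = newval;

	/* DRS is one RTT late: start from twice the measured space */
	rcvwin = (uint64_t)newval << 1;

	/* slow-start style growth: let the sender double its rate */
	grow = rcvwin * (newval - oldval);
	grow /= oldval;
	rcvwin += grow << 1;

	/* window -> buffer space; assume ~33% truesize overhead */
	rcvbuf = rcvwin + rcvwin / 3;
	return rcvbuf > rmem_cap ? rmem_cap : (uint32_t)rcvbuf;
}

As in the kernel code, the result only ever raises sk_rcvbuf; shrinking is left to memory pressure handling.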
*/ if (!skb1) msk->ooo_last_skb = skb; end: skb_condense(skb); skb_set_owner_r(skb, sk); /* do not grow rcvbuf for not-yet-accepted or orphaned sockets. */ if (sk->sk_socket) mptcp_rcvbuf_grow(sk, msk->rcvq_space.space); } static void mptcp_init_skb(struct sock *ssk, struct sk_buff *skb, int offset, int copy_len) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); bool has_rxtstamp = TCP_SKB_CB(skb)->has_rxtstamp; /* the skb map_seq accounts for the skb offset: * mptcp_subflow_get_mapped_dsn() is based on the current tp->copied_seq * value */ MPTCP_SKB_CB(skb)->map_seq = mptcp_subflow_get_mapped_dsn(subflow); MPTCP_SKB_CB(skb)->end_seq = MPTCP_SKB_CB(skb)->map_seq + copy_len; MPTCP_SKB_CB(skb)->offset = offset; MPTCP_SKB_CB(skb)->has_rxtstamp = has_rxtstamp; MPTCP_SKB_CB(skb)->cant_coalesce = 0; __skb_unlink(skb, &ssk->sk_receive_queue); skb_ext_reset(skb); skb_dst_drop(skb); } static bool __mptcp_move_skb(struct sock *sk, struct sk_buff *skb) { u64 copy_len = MPTCP_SKB_CB(skb)->end_seq - MPTCP_SKB_CB(skb)->map_seq; struct mptcp_sock *msk = mptcp_sk(sk); struct sk_buff *tail; mptcp_borrow_fwdmem(sk, skb); if (MPTCP_SKB_CB(skb)->map_seq == msk->ack_seq) { /* in sequence */ msk->bytes_received += copy_len; WRITE_ONCE(msk->ack_seq, msk->ack_seq + copy_len); tail = skb_peek_tail(&sk->sk_receive_queue); if (tail && mptcp_try_coalesce(sk, tail, skb)) return true; skb_set_owner_r(skb, sk); __skb_queue_tail(&sk->sk_receive_queue, skb); return true; } else if (after64(MPTCP_SKB_CB(skb)->map_seq, msk->ack_seq)) { mptcp_data_queue_ofo(msk, skb); return false; } /* old data, keep it simple and drop the whole pkt, sender * will retransmit as needed, if needed. */ MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_DUPDATA); mptcp_drop(sk, skb); return false; } static void mptcp_stop_rtx_timer(struct sock *sk) { sk_stop_timer(sk, &sk->mptcp_retransmit_timer); mptcp_sk(sk)->timer_ival = 0; } static void mptcp_close_wake_up(struct sock *sk) { if (sock_flag(sk, SOCK_DEAD)) return; sk->sk_state_change(sk); if (sk->sk_shutdown == SHUTDOWN_MASK || sk->sk_state == TCP_CLOSE) sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_HUP); else sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN); } static void mptcp_shutdown_subflows(struct mptcp_sock *msk) { struct mptcp_subflow_context *subflow; mptcp_for_each_subflow(msk, subflow) { struct sock *ssk = mptcp_subflow_tcp_sock(subflow); bool slow; slow = lock_sock_fast(ssk); tcp_shutdown(ssk, SEND_SHUTDOWN); unlock_sock_fast(ssk, slow); } } /* called under the msk socket lock */ static bool mptcp_pending_data_fin_ack(struct sock *sk) { struct mptcp_sock *msk = mptcp_sk(sk); return ((1 << sk->sk_state) & (TCPF_FIN_WAIT1 | TCPF_CLOSING | TCPF_LAST_ACK)) && msk->write_seq == READ_ONCE(msk->snd_una); } static void mptcp_check_data_fin_ack(struct sock *sk) { struct mptcp_sock *msk = mptcp_sk(sk); /* Look for an acknowledged DATA_FIN */ if (mptcp_pending_data_fin_ack(sk)) { WRITE_ONCE(msk->snd_data_fin_enable, 0); switch (sk->sk_state) { case TCP_FIN_WAIT1: mptcp_set_state(sk, TCP_FIN_WAIT2); break; case TCP_CLOSING: case TCP_LAST_ACK: mptcp_shutdown_subflows(msk); mptcp_set_state(sk, TCP_CLOSE); break; } mptcp_close_wake_up(sk); } } /* can be called with no lock acquired */ static bool mptcp_pending_data_fin(struct sock *sk, u64 *seq) { struct mptcp_sock *msk = mptcp_sk(sk); if (READ_ONCE(msk->rcv_data_fin) && ((1 << inet_sk_state_load(sk)) & (TCPF_ESTABLISHED | TCPF_FIN_WAIT1 | TCPF_FIN_WAIT2))) { u64 rcv_data_fin_seq = READ_ONCE(msk->rcv_data_fin_seq); if (READ_ONCE(msk->ack_seq) == 
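__mptcp_move_skb() above decides between the in-order receive queue and the out-of-order tree purely by comparing 64-bit data sequence numbers. A minimal, wrap-safe equivalent of the before64()/after64()-style comparisons it relies on is shown here for illustration only; the real helpers live elsewhere in the MPTCP sources.

#include <stdint.h>

/* Wrap-safe 64-bit serial-number comparison (illustrative sketch). */
static inline int seq64_before(uint64_t seq1, uint64_t seq2)
{
	return (int64_t)(seq1 - seq2) < 0;
}

static inline int seq64_after(uint64_t seq1, uint64_t seq2)
{
	return seq64_before(seq2, seq1);
}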
rcv_data_fin_seq) { if (seq) *seq = rcv_data_fin_seq; return true; } } return false; } static void mptcp_set_datafin_timeout(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); u32 retransmits; retransmits = min_t(u32, icsk->icsk_retransmits, ilog2(TCP_RTO_MAX / TCP_RTO_MIN)); mptcp_sk(sk)->timer_ival = TCP_RTO_MIN << retransmits; } static void __mptcp_set_timeout(struct sock *sk, long tout) { mptcp_sk(sk)->timer_ival = tout > 0 ? tout : TCP_RTO_MIN; } static long mptcp_timeout_from_subflow(const struct mptcp_subflow_context *subflow) { const struct sock *ssk = mptcp_subflow_tcp_sock(subflow); return inet_csk(ssk)->icsk_pending && !subflow->stale_count ? tcp_timeout_expires(ssk) - jiffies : 0; } static void mptcp_set_timeout(struct sock *sk) { struct mptcp_subflow_context *subflow; long tout = 0; mptcp_for_each_subflow(mptcp_sk(sk), subflow) tout = max(tout, mptcp_timeout_from_subflow(subflow)); __mptcp_set_timeout(sk, tout); } static inline bool tcp_can_send_ack(const struct sock *ssk) { return !((1 << inet_sk_state_load(ssk)) & (TCPF_SYN_SENT | TCPF_SYN_RECV | TCPF_TIME_WAIT | TCPF_CLOSE | TCPF_LISTEN)); } void __mptcp_subflow_send_ack(struct sock *ssk) { if (tcp_can_send_ack(ssk)) tcp_send_ack(ssk); } static void mptcp_subflow_send_ack(struct sock *ssk) { bool slow; slow = lock_sock_fast(ssk); __mptcp_subflow_send_ack(ssk); unlock_sock_fast(ssk, slow); } static void mptcp_send_ack(struct mptcp_sock *msk) { struct mptcp_subflow_context *subflow; mptcp_for_each_subflow(msk, subflow) mptcp_subflow_send_ack(mptcp_subflow_tcp_sock(subflow)); } static void mptcp_subflow_cleanup_rbuf(struct sock *ssk, int copied) { bool slow; slow = lock_sock_fast(ssk); if (tcp_can_send_ack(ssk)) tcp_cleanup_rbuf(ssk, copied); unlock_sock_fast(ssk, slow); } static bool mptcp_subflow_could_cleanup(const struct sock *ssk, bool rx_empty) { const struct inet_connection_sock *icsk = inet_csk(ssk); u8 ack_pending = READ_ONCE(icsk->icsk_ack.pending); const struct tcp_sock *tp = tcp_sk(ssk); return (ack_pending & ICSK_ACK_SCHED) && ((READ_ONCE(tp->rcv_nxt) - READ_ONCE(tp->rcv_wup) > READ_ONCE(icsk->icsk_ack.rcv_mss)) || (rx_empty && ack_pending & (ICSK_ACK_PUSHED2 | ICSK_ACK_PUSHED))); } static void mptcp_cleanup_rbuf(struct mptcp_sock *msk, int copied) { int old_space = READ_ONCE(msk->old_wspace); struct mptcp_subflow_context *subflow; struct sock *sk = (struct sock *)msk; int space = __mptcp_space(sk); bool cleanup, rx_empty; cleanup = (space > 0) && (space >= (old_space << 1)) && copied; rx_empty = !sk_rmem_alloc_get(sk) && copied; mptcp_for_each_subflow(msk, subflow) { struct sock *ssk = mptcp_subflow_tcp_sock(subflow); if (cleanup || mptcp_subflow_could_cleanup(ssk, rx_empty)) mptcp_subflow_cleanup_rbuf(ssk, copied); } } static void mptcp_check_data_fin(struct sock *sk) { struct mptcp_sock *msk = mptcp_sk(sk); u64 rcv_data_fin_seq; /* Need to ack a DATA_FIN received from a peer while this side * of the connection is in ESTABLISHED, FIN_WAIT1, or FIN_WAIT2. * msk->rcv_data_fin was set when parsing the incoming options * at the subflow level and the msk lock was not held, so this * is the first opportunity to act on the DATA_FIN and change * the msk state. * * If we are caught up to the sequence number of the incoming * DATA_FIN, send the DATA_ACK now and do state transition. If * not caught up, do nothing and let the recv code send DATA_ACK * when catching up. 
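mptcp_set_datafin_timeout() above backs off exponentially from TCP_RTO_MIN, clamped so the interval can never exceed roughly TCP_RTO_MAX. The userspace sketch below reproduces that arithmetic; the 200 ms / 120 s constants are assumptions used only for the example.

/* Illustrative sketch of the DATA_FIN retransmit backoff. */
static unsigned int datafin_timeout_sketch(unsigned int retransmits)
{
	const unsigned int rto_min_ms = 200;		/* assumed TCP_RTO_MIN */
	const unsigned int rto_max_ms = 120000;		/* assumed TCP_RTO_MAX */
	unsigned int cap = 0, shift;

	/* cap == ilog2(rto_max / rto_min): number of allowed doublings */
	while ((rto_min_ms << (cap + 1)) <= rto_max_ms)
		cap++;

	shift = retransmits < cap ? retransmits : cap;
	return rto_min_ms << shift;
}

With the assumed constants, the timeout grows 200 ms, 400 ms, 800 ms, ... and stops doubling after nine retransmissions (about 102 s).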
*/ if (mptcp_pending_data_fin(sk, &rcv_data_fin_seq)) { WRITE_ONCE(msk->ack_seq, msk->ack_seq + 1); WRITE_ONCE(msk->rcv_data_fin, 0); WRITE_ONCE(sk->sk_shutdown, sk->sk_shutdown | RCV_SHUTDOWN); smp_mb__before_atomic(); /* SHUTDOWN must be visible first */ switch (sk->sk_state) { case TCP_ESTABLISHED: mptcp_set_state(sk, TCP_CLOSE_WAIT); break; case TCP_FIN_WAIT1: mptcp_set_state(sk, TCP_CLOSING); break; case TCP_FIN_WAIT2: mptcp_shutdown_subflows(msk); mptcp_set_state(sk, TCP_CLOSE); break; default: /* Other states not expected */ WARN_ON_ONCE(1); break; } if (!__mptcp_check_fallback(msk)) mptcp_send_ack(msk); mptcp_close_wake_up(sk); } } static void mptcp_dss_corruption(struct mptcp_sock *msk, struct sock *ssk) { if (!mptcp_try_fallback(ssk, MPTCP_MIB_DSSCORRUPTIONFALLBACK)) { MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_DSSCORRUPTIONRESET); mptcp_subflow_reset(ssk); } } static void __mptcp_add_backlog(struct sock *sk, struct mptcp_subflow_context *subflow, struct sk_buff *skb) { struct mptcp_sock *msk = mptcp_sk(sk); struct sk_buff *tail = NULL; struct sock *ssk = skb->sk; bool fragstolen; int delta; if (unlikely(sk->sk_state == TCP_CLOSE)) { kfree_skb_reason(skb, SKB_DROP_REASON_SOCKET_CLOSE); return; } /* Try to coalesce with the last skb in our backlog */ if (!list_empty(&msk->backlog_list)) tail = list_last_entry(&msk->backlog_list, struct sk_buff, list); if (tail && MPTCP_SKB_CB(skb)->map_seq == MPTCP_SKB_CB(tail)->end_seq && ssk == tail->sk && __mptcp_try_coalesce(sk, tail, skb, &fragstolen, &delta)) { skb->truesize -= delta; kfree_skb_partial(skb, fragstolen); __mptcp_subflow_lend_fwdmem(subflow, delta); goto account; } list_add_tail(&skb->list, &msk->backlog_list); mptcp_subflow_lend_fwdmem(subflow, skb); delta = skb->truesize; account: WRITE_ONCE(msk->backlog_len, msk->backlog_len + delta); /* Possibly not accept()ed yet, keep track of memory not CG * accounted, mptcp_graft_subflows() will handle it. */ if (!mem_cgroup_from_sk(ssk)) msk->backlog_unaccounted += delta; } static bool __mptcp_move_skbs_from_subflow(struct mptcp_sock *msk, struct sock *ssk, bool own_msk) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); struct sock *sk = (struct sock *)msk; bool more_data_avail; struct tcp_sock *tp; bool ret = false; pr_debug("msk=%p ssk=%p\n", msk, ssk); tp = tcp_sk(ssk); do { u32 map_remaining, offset; u32 seq = tp->copied_seq; struct sk_buff *skb; bool fin; /* try to move as much data as available */ map_remaining = subflow->map_data_len - mptcp_subflow_get_map_offset(subflow); skb = skb_peek(&ssk->sk_receive_queue); if (unlikely(!skb)) break; if (__mptcp_check_fallback(msk)) { /* Under fallback skbs have no MPTCP extension and TCP could * collapse them between the dummy map creation and the * current dequeue. Be sure to adjust the map size. 
*/ map_remaining = skb->len; subflow->map_data_len = skb->len; } offset = seq - TCP_SKB_CB(skb)->seq; fin = TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN; if (fin) seq++; if (offset < skb->len) { size_t len = skb->len - offset; mptcp_init_skb(ssk, skb, offset, len); if (own_msk && sk_rmem_alloc_get(sk) < sk->sk_rcvbuf) { mptcp_subflow_lend_fwdmem(subflow, skb); ret |= __mptcp_move_skb(sk, skb); } else { __mptcp_add_backlog(sk, subflow, skb); } seq += len; if (unlikely(map_remaining < len)) { DEBUG_NET_WARN_ON_ONCE(1); mptcp_dss_corruption(msk, ssk); } } else { if (unlikely(!fin)) { DEBUG_NET_WARN_ON_ONCE(1); mptcp_dss_corruption(msk, ssk); } sk_eat_skb(ssk, skb); } WRITE_ONCE(tp->copied_seq, seq); more_data_avail = mptcp_subflow_data_available(ssk); } while (more_data_avail); if (ret) msk->last_data_recv = tcp_jiffies32; return ret; } static bool __mptcp_ofo_queue(struct mptcp_sock *msk) { struct sock *sk = (struct sock *)msk; struct sk_buff *skb, *tail; bool moved = false; struct rb_node *p; u64 end_seq; p = rb_first(&msk->out_of_order_queue); pr_debug("msk=%p empty=%d\n", msk, RB_EMPTY_ROOT(&msk->out_of_order_queue)); while (p) { skb = rb_to_skb(p); if (after64(MPTCP_SKB_CB(skb)->map_seq, msk->ack_seq)) break; p = rb_next(p); rb_erase(&skb->rbnode, &msk->out_of_order_queue); if (unlikely(!after64(MPTCP_SKB_CB(skb)->end_seq, msk->ack_seq))) { mptcp_drop(sk, skb); MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_DUPDATA); continue; } end_seq = MPTCP_SKB_CB(skb)->end_seq; tail = skb_peek_tail(&sk->sk_receive_queue); if (!tail || !mptcp_ooo_try_coalesce(msk, tail, skb)) { int delta = msk->ack_seq - MPTCP_SKB_CB(skb)->map_seq; /* skip overlapping data, if any */ pr_debug("uncoalesced seq=%llx ack seq=%llx delta=%d\n", MPTCP_SKB_CB(skb)->map_seq, msk->ack_seq, delta); MPTCP_SKB_CB(skb)->offset += delta; MPTCP_SKB_CB(skb)->map_seq += delta; __skb_queue_tail(&sk->sk_receive_queue, skb); } msk->bytes_received += end_seq - msk->ack_seq; WRITE_ONCE(msk->ack_seq, end_seq); moved = true; } return moved; } static bool __mptcp_subflow_error_report(struct sock *sk, struct sock *ssk) { int err = sock_error(ssk); int ssk_state; if (!err) return false; /* only propagate errors on fallen-back sockets or * on MPC connect */ if (sk->sk_state != TCP_SYN_SENT && !__mptcp_check_fallback(mptcp_sk(sk))) return false; /* We need to propagate only transition to CLOSE state. * Orphaned socket will see such state change via * subflow_sched_work_if_closed() and that path will properly * destroy the msk as needed. */ ssk_state = inet_sk_state_load(ssk); if (ssk_state == TCP_CLOSE && !sock_flag(sk, SOCK_DEAD)) mptcp_set_state(sk, ssk_state); WRITE_ONCE(sk->sk_err, -err); /* This barrier is coupled with smp_rmb() in mptcp_poll() */ smp_wmb(); sk_error_report(sk); return true; } void __mptcp_error_report(struct sock *sk) { struct mptcp_subflow_context *subflow; struct mptcp_sock *msk = mptcp_sk(sk); mptcp_for_each_subflow(msk, subflow) if (__mptcp_subflow_error_report(sk, mptcp_subflow_tcp_sock(subflow))) break; } /* In most cases we will be able to lock the mptcp socket. If its already * owned, we need to defer to the work queue to avoid ABBA deadlock. 
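When __mptcp_ofo_queue() above moves an out-of-order skb whose start is already covered by ack_seq, it trims the overlap by advancing the MPTCP offset and map_seq instead of delivering duplicate bytes. A minimal sketch of that adjustment follows; the structure and field names are illustrative, not the kernel's.

#include <stdint.h>

struct ofo_skb_sketch {
	uint64_t map_seq;	/* first DSS byte carried by the skb */
	uint64_t end_seq;	/* one past the last byte */
	uint32_t offset;	/* payload offset inside the skb */
};

/* Skip bytes already acknowledged before queuing the skb in order. */
static void ofo_trim_sketch(struct ofo_skb_sketch *skb, uint64_t ack_seq)
{
	if ((int64_t)(ack_seq - skb->map_seq) > 0) {	/* after64(ack_seq, map_seq) */
		uint64_t delta = ack_seq - skb->map_seq;

		skb->offset += delta;
		skb->map_seq += delta;
	}
}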
*/ static bool move_skbs_to_msk(struct mptcp_sock *msk, struct sock *ssk) { struct sock *sk = (struct sock *)msk; bool moved; moved = __mptcp_move_skbs_from_subflow(msk, ssk, true); __mptcp_ofo_queue(msk); if (unlikely(ssk->sk_err)) __mptcp_subflow_error_report(sk, ssk); /* If the moves have caught up with the DATA_FIN sequence number * it's time to ack the DATA_FIN and change socket state, but * this is not a good place to change state. Let the workqueue * do it. */ if (mptcp_pending_data_fin(sk, NULL)) mptcp_schedule_work(sk); return moved; } void mptcp_data_ready(struct sock *sk, struct sock *ssk) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); struct mptcp_sock *msk = mptcp_sk(sk); /* The peer can send data while we are shutting down this * subflow at subflow destruction time, but we must avoid enqueuing * more data to the msk receive queue */ if (unlikely(subflow->closing)) return; mptcp_data_lock(sk); if (!sock_owned_by_user(sk)) { /* Wake-up the reader only for in-sequence data */ if (move_skbs_to_msk(msk, ssk) && mptcp_epollin_ready(sk)) sk->sk_data_ready(sk); } else { __mptcp_move_skbs_from_subflow(msk, ssk, false); } mptcp_data_unlock(sk); } static void mptcp_subflow_joined(struct mptcp_sock *msk, struct sock *ssk) { mptcp_subflow_ctx(ssk)->map_seq = READ_ONCE(msk->ack_seq); msk->allow_infinite_fallback = false; mptcp_event(MPTCP_EVENT_SUB_ESTABLISHED, msk, ssk, GFP_ATOMIC); } static bool __mptcp_finish_join(struct mptcp_sock *msk, struct sock *ssk) { struct sock *sk = (struct sock *)msk; if (sk->sk_state != TCP_ESTABLISHED) return false; spin_lock_bh(&msk->fallback_lock); if (!msk->allow_subflows) { spin_unlock_bh(&msk->fallback_lock); return false; } mptcp_subflow_joined(msk, ssk); spin_unlock_bh(&msk->fallback_lock); mptcp_subflow_ctx(ssk)->subflow_id = msk->subflow_id++; mptcp_sockopt_sync_locked(msk, ssk); mptcp_stop_tout_timer(sk); __mptcp_propagate_sndbuf(sk, ssk); return true; } static void __mptcp_flush_join_list(struct sock *sk, struct list_head *join_list) { struct mptcp_subflow_context *tmp, *subflow; struct mptcp_sock *msk = mptcp_sk(sk); list_for_each_entry_safe(subflow, tmp, join_list, node) { struct sock *ssk = mptcp_subflow_tcp_sock(subflow); bool slow = lock_sock_fast(ssk); list_move_tail(&subflow->node, &msk->conn_list); if (!__mptcp_finish_join(msk, ssk)) mptcp_subflow_reset(ssk); unlock_sock_fast(ssk, slow); } } static bool mptcp_rtx_timer_pending(struct sock *sk) { return timer_pending(&sk->mptcp_retransmit_timer); } static void mptcp_reset_rtx_timer(struct sock *sk) { unsigned long tout; /* prevent rescheduling on close */ if (unlikely(inet_sk_state_load(sk) == TCP_CLOSE)) return; tout = mptcp_sk(sk)->timer_ival; sk_reset_timer(sk, &sk->mptcp_retransmit_timer, jiffies + tout); } bool mptcp_schedule_work(struct sock *sk) { if (inet_sk_state_load(sk) == TCP_CLOSE) return false; /* Get a reference on this socket, mptcp_worker() will release it. * As mptcp_worker() might complete before us, we can not avoid * a sock_hold()/sock_put() if schedule_work() returns false. 
*/ sock_hold(sk); if (schedule_work(&mptcp_sk(sk)->work)) return true; sock_put(sk); return false; } static bool mptcp_skb_can_collapse_to(u64 write_seq, const struct sk_buff *skb, const struct mptcp_ext *mpext) { if (!tcp_skb_can_collapse_to(skb)) return false; /* can collapse only if MPTCP level sequence is in order and this * mapping has not been xmitted yet */ return mpext && mpext->data_seq + mpext->data_len == write_seq && !mpext->frozen; } /* we can append data to the given data frag if: * - there is space available in the backing page_frag * - the data frag tail matches the current page_frag free offset * - the data frag end sequence number matches the current write seq */ static bool mptcp_frag_can_collapse_to(const struct mptcp_sock *msk, const struct page_frag *pfrag, const struct mptcp_data_frag *df) { return df && pfrag->page == df->page && pfrag->size - pfrag->offset > 0 && pfrag->offset == (df->offset + df->data_len) && df->data_seq + df->data_len == msk->write_seq; } static void dfrag_uncharge(struct sock *sk, int len) { sk_mem_uncharge(sk, len); sk_wmem_queued_add(sk, -len); } static void dfrag_clear(struct sock *sk, struct mptcp_data_frag *dfrag) { int len = dfrag->data_len + dfrag->overhead; list_del(&dfrag->list); dfrag_uncharge(sk, len); put_page(dfrag->page); } /* called under both the msk socket lock and the data lock */ static void __mptcp_clean_una(struct sock *sk) { struct mptcp_sock *msk = mptcp_sk(sk); struct mptcp_data_frag *dtmp, *dfrag; u64 snd_una; snd_una = msk->snd_una; list_for_each_entry_safe(dfrag, dtmp, &msk->rtx_queue, list) { if (after64(dfrag->data_seq + dfrag->data_len, snd_una)) break; if (unlikely(dfrag == msk->first_pending)) { /* in recovery mode can see ack after the current snd head */ if (WARN_ON_ONCE(!msk->recovery)) break; msk->first_pending = mptcp_send_next(sk); } dfrag_clear(sk, dfrag); } dfrag = mptcp_rtx_head(sk); if (dfrag && after64(snd_una, dfrag->data_seq)) { u64 delta = snd_una - dfrag->data_seq; /* prevent wrap around in recovery mode */ if (unlikely(delta > dfrag->already_sent)) { if (WARN_ON_ONCE(!msk->recovery)) goto out; if (WARN_ON_ONCE(delta > dfrag->data_len)) goto out; dfrag->already_sent += delta - dfrag->already_sent; } dfrag->data_seq += delta; dfrag->offset += delta; dfrag->data_len -= delta; dfrag->already_sent -= delta; dfrag_uncharge(sk, delta); } /* all retransmitted data acked, recovery completed */ if (unlikely(msk->recovery) && after64(msk->snd_una, msk->recovery_snd_nxt)) msk->recovery = false; out: if (snd_una == msk->snd_nxt && snd_una == msk->write_seq) { if (mptcp_rtx_timer_pending(sk) && !mptcp_data_fin_enabled(msk)) mptcp_stop_rtx_timer(sk); } else { mptcp_reset_rtx_timer(sk); } if (mptcp_pending_data_fin_ack(sk)) mptcp_schedule_work(sk); } static void __mptcp_clean_una_wakeup(struct sock *sk) { lockdep_assert_held_once(&sk->sk_lock.slock); __mptcp_clean_una(sk); mptcp_write_space(sk); } static void mptcp_clean_una_wakeup(struct sock *sk) { mptcp_data_lock(sk); __mptcp_clean_una_wakeup(sk); mptcp_data_unlock(sk); } static void mptcp_enter_memory_pressure(struct sock *sk) { struct mptcp_subflow_context *subflow; struct mptcp_sock *msk = mptcp_sk(sk); bool first = true; mptcp_for_each_subflow(msk, subflow) { struct sock *ssk = mptcp_subflow_tcp_sock(subflow); if (first && !ssk->sk_bypass_prot_mem) { tcp_enter_memory_pressure(ssk); first = false; } sk_stream_moderate_sndbuf(ssk); } __mptcp_sync_sndbuf(sk); } /* ensure we get enough memory for the frag hdr, beyond some minimal amount of * data */ static 
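__mptcp_clean_una() above handles a cumulative ack that lands inside the oldest data frag by shrinking the frag in place rather than freeing it. Below is a simplified sketch of that partial-ack adjustment; the recovery-mode guards are omitted and the struct is illustrative, not the kernel's mptcp_data_frag.

#include <stdint.h>

struct dfrag_sketch {
	uint64_t data_seq;
	uint32_t offset;
	uint32_t data_len;
	uint32_t already_sent;
};

/* Shrink the head frag by the acked prefix (assumes delta <= already_sent,
 * which the recovery checks in the real code guarantee). */
static void clean_una_partial_sketch(struct dfrag_sketch *df, uint64_t snd_una)
{
	if ((int64_t)(snd_una - df->data_seq) > 0) {	/* after64(snd_una, data_seq) */
		uint64_t delta = snd_una - df->data_seq;

		df->data_seq += delta;
		df->offset += delta;
		df->data_len -= delta;
		df->already_sent -= delta;
	}
}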
bool mptcp_page_frag_refill(struct sock *sk, struct page_frag *pfrag) { if (likely(skb_page_frag_refill(32U + sizeof(struct mptcp_data_frag), pfrag, sk->sk_allocation))) return true; mptcp_enter_memory_pressure(sk); return false; } static struct mptcp_data_frag * mptcp_carve_data_frag(const struct mptcp_sock *msk, struct page_frag *pfrag, int orig_offset) { int offset = ALIGN(orig_offset, sizeof(long)); struct mptcp_data_frag *dfrag; dfrag = (struct mptcp_data_frag *)(page_to_virt(pfrag->page) + offset); dfrag->data_len = 0; dfrag->data_seq = msk->write_seq; dfrag->overhead = offset - orig_offset + sizeof(struct mptcp_data_frag); dfrag->offset = offset + sizeof(struct mptcp_data_frag); dfrag->already_sent = 0; dfrag->page = pfrag->page; return dfrag; } struct mptcp_sendmsg_info { int mss_now; int size_goal; u16 limit; u16 sent; unsigned int flags; bool data_lock_held; }; static int mptcp_check_allowed_size(const struct mptcp_sock *msk, struct sock *ssk, u64 data_seq, int avail_size) { u64 window_end = mptcp_wnd_end(msk); u64 mptcp_snd_wnd; if (__mptcp_check_fallback(msk)) return avail_size; mptcp_snd_wnd = window_end - data_seq; avail_size = min_t(unsigned int, mptcp_snd_wnd, avail_size); if (unlikely(tcp_sk(ssk)->snd_wnd < mptcp_snd_wnd)) { tcp_sk(ssk)->snd_wnd = min_t(u64, U32_MAX, mptcp_snd_wnd); MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_SNDWNDSHARED); } return avail_size; } static bool __mptcp_add_ext(struct sk_buff *skb, gfp_t gfp) { struct skb_ext *mpext = __skb_ext_alloc(gfp); if (!mpext) return false; __skb_ext_set(skb, SKB_EXT_MPTCP, mpext); return true; } static struct sk_buff *__mptcp_do_alloc_tx_skb(struct sock *sk, gfp_t gfp) { struct sk_buff *skb; skb = alloc_skb_fclone(MAX_TCP_HEADER, gfp); if (likely(skb)) { if (likely(__mptcp_add_ext(skb, gfp))) { skb_reserve(skb, MAX_TCP_HEADER); skb->ip_summed = CHECKSUM_PARTIAL; INIT_LIST_HEAD(&skb->tcp_tsorted_anchor); return skb; } __kfree_skb(skb); } else { mptcp_enter_memory_pressure(sk); } return NULL; } static struct sk_buff *__mptcp_alloc_tx_skb(struct sock *sk, struct sock *ssk, gfp_t gfp) { struct sk_buff *skb; skb = __mptcp_do_alloc_tx_skb(sk, gfp); if (!skb) return NULL; if (likely(sk_wmem_schedule(ssk, skb->truesize))) { tcp_skb_entail(ssk, skb); return skb; } tcp_skb_tsorted_anchor_cleanup(skb); kfree_skb(skb); return NULL; } static struct sk_buff *mptcp_alloc_tx_skb(struct sock *sk, struct sock *ssk, bool data_lock_held) { gfp_t gfp = data_lock_held ? GFP_ATOMIC : sk->sk_allocation; return __mptcp_alloc_tx_skb(sk, ssk, gfp); } /* note: this always recompute the csum on the whole skb, even * if we just appended a single frag. 
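mptcp_carve_data_frag() above places the frag descriptor at the next long-aligned offset inside the page frag and starts the payload right after it, charging the alignment padding plus the descriptor size as overhead. A standalone sketch of that layout arithmetic, with hypothetical names:

#include <stddef.h>

struct carve_sketch {
	size_t hdr_offset;	/* where the descriptor lands in the page */
	size_t data_offset;	/* where payload bytes will start */
	size_t overhead;	/* alignment padding + descriptor size */
};

static struct carve_sketch carve_layout_sketch(size_t orig_offset, size_t hdr_size)
{
	size_t align = sizeof(long);
	size_t hdr_offset = (orig_offset + align - 1) & ~(align - 1);
	struct carve_sketch c = {
		.hdr_offset  = hdr_offset,
		.data_offset = hdr_offset + hdr_size,
		.overhead    = hdr_offset - orig_offset + hdr_size,
	};

	return c;
}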
More status info needed */ static void mptcp_update_data_checksum(struct sk_buff *skb, int added) { struct mptcp_ext *mpext = mptcp_get_ext(skb); __wsum csum = ~csum_unfold(mpext->csum); int offset = skb->len - added; mpext->csum = csum_fold(csum_block_add(csum, skb_checksum(skb, offset, added, 0), offset)); } static void mptcp_update_infinite_map(struct mptcp_sock *msk, struct sock *ssk, struct mptcp_ext *mpext) { if (!mpext) return; mpext->infinite_map = 1; mpext->data_len = 0; if (!mptcp_try_fallback(ssk, MPTCP_MIB_INFINITEMAPTX)) { MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_FALLBACKFAILED); mptcp_subflow_reset(ssk); return; } mptcp_subflow_ctx(ssk)->send_infinite_map = 0; } #define MPTCP_MAX_GSO_SIZE (GSO_LEGACY_MAX_SIZE - (MAX_TCP_HEADER + 1)) static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk, struct mptcp_data_frag *dfrag, struct mptcp_sendmsg_info *info) { u64 data_seq = dfrag->data_seq + info->sent; int offset = dfrag->offset + info->sent; struct mptcp_sock *msk = mptcp_sk(sk); bool zero_window_probe = false; struct mptcp_ext *mpext = NULL; bool can_coalesce = false; bool reuse_skb = true; struct sk_buff *skb; size_t copy; int i; pr_debug("msk=%p ssk=%p sending dfrag at seq=%llu len=%u already sent=%u\n", msk, ssk, dfrag->data_seq, dfrag->data_len, info->sent); if (WARN_ON_ONCE(info->sent > info->limit || info->limit > dfrag->data_len)) return 0; if (unlikely(!__tcp_can_send(ssk))) return -EAGAIN; /* compute send limit */ if (unlikely(ssk->sk_gso_max_size > MPTCP_MAX_GSO_SIZE)) ssk->sk_gso_max_size = MPTCP_MAX_GSO_SIZE; info->mss_now = tcp_send_mss(ssk, &info->size_goal, info->flags); copy = info->size_goal; skb = tcp_write_queue_tail(ssk); if (skb && copy > skb->len) { /* Limit the write to the size available in the * current skb, if any, so that we create at most a new skb. * Explicitly tells TCP internals to avoid collapsing on later * queue management operation, to avoid breaking the ext <-> * SSN association set here */ mpext = mptcp_get_ext(skb); if (!mptcp_skb_can_collapse_to(data_seq, skb, mpext)) { TCP_SKB_CB(skb)->eor = 1; tcp_mark_push(tcp_sk(ssk), skb); goto alloc_skb; } i = skb_shinfo(skb)->nr_frags; can_coalesce = skb_can_coalesce(skb, i, dfrag->page, offset); if (!can_coalesce && i >= READ_ONCE(net_hotdata.sysctl_max_skb_frags)) { tcp_mark_push(tcp_sk(ssk), skb); goto alloc_skb; } copy -= skb->len; } else { alloc_skb: skb = mptcp_alloc_tx_skb(sk, ssk, info->data_lock_held); if (!skb) return -ENOMEM; i = skb_shinfo(skb)->nr_frags; reuse_skb = false; mpext = mptcp_get_ext(skb); } /* Zero window and all data acked? Probe. */ copy = mptcp_check_allowed_size(msk, ssk, data_seq, copy); if (copy == 0) { u64 snd_una = READ_ONCE(msk->snd_una); /* No need for zero probe if there are any data pending * either at the msk or ssk level; skb is the current write * queue tail and can be empty at this point. 
*/ if (snd_una != msk->snd_nxt || skb->len || skb != tcp_send_head(ssk)) { tcp_remove_empty_skb(ssk); return 0; } zero_window_probe = true; data_seq = snd_una - 1; copy = 1; } copy = min_t(size_t, copy, info->limit - info->sent); if (!sk_wmem_schedule(ssk, copy)) { tcp_remove_empty_skb(ssk); return -ENOMEM; } if (can_coalesce) { skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy); } else { get_page(dfrag->page); skb_fill_page_desc(skb, i, dfrag->page, offset, copy); } skb->len += copy; skb->data_len += copy; skb->truesize += copy; sk_wmem_queued_add(ssk, copy); sk_mem_charge(ssk, copy); WRITE_ONCE(tcp_sk(ssk)->write_seq, tcp_sk(ssk)->write_seq + copy); TCP_SKB_CB(skb)->end_seq += copy; tcp_skb_pcount_set(skb, 0); /* on skb reuse we just need to update the DSS len */ if (reuse_skb) { TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_PSH; mpext->data_len += copy; goto out; } memset(mpext, 0, sizeof(*mpext)); mpext->data_seq = data_seq; mpext->subflow_seq = mptcp_subflow_ctx(ssk)->rel_write_seq; mpext->data_len = copy; mpext->use_map = 1; mpext->dsn64 = 1; pr_debug("data_seq=%llu subflow_seq=%u data_len=%u dsn64=%d\n", mpext->data_seq, mpext->subflow_seq, mpext->data_len, mpext->dsn64); if (zero_window_probe) { MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_WINPROBE); mptcp_subflow_ctx(ssk)->rel_write_seq += copy; mpext->frozen = 1; if (READ_ONCE(msk->csum_enabled)) mptcp_update_data_checksum(skb, copy); tcp_push_pending_frames(ssk); return 0; } out: if (READ_ONCE(msk->csum_enabled)) mptcp_update_data_checksum(skb, copy); if (mptcp_subflow_ctx(ssk)->send_infinite_map) mptcp_update_infinite_map(msk, ssk, mpext); trace_mptcp_sendmsg_frag(mpext); mptcp_subflow_ctx(ssk)->rel_write_seq += copy; return copy; } #define MPTCP_SEND_BURST_SIZE ((1 << 16) - \ sizeof(struct tcphdr) - \ MAX_TCP_OPTION_SPACE - \ sizeof(struct ipv6hdr) - \ sizeof(struct frag_hdr)) struct subflow_send_info { struct sock *ssk; u64 linger_time; }; void mptcp_subflow_set_active(struct mptcp_subflow_context *subflow) { if (!subflow->stale) return; subflow->stale = 0; MPTCP_INC_STATS(sock_net(mptcp_subflow_tcp_sock(subflow)), MPTCP_MIB_SUBFLOWRECOVER); } bool mptcp_subflow_active(struct mptcp_subflow_context *subflow) { if (unlikely(subflow->stale)) { u32 rcv_tstamp = READ_ONCE(tcp_sk(mptcp_subflow_tcp_sock(subflow))->rcv_tstamp); if (subflow->stale_rcv_tstamp == rcv_tstamp) return false; mptcp_subflow_set_active(subflow); } return __mptcp_subflow_active(subflow); } #define SSK_MODE_ACTIVE 0 #define SSK_MODE_BACKUP 1 #define SSK_MODE_MAX 2 /* implement the mptcp packet scheduler; * returns the subflow that will transmit the next DSS * additionally updates the rtx timeout */ struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk) { struct subflow_send_info send_info[SSK_MODE_MAX]; struct mptcp_subflow_context *subflow; struct sock *sk = (struct sock *)msk; u32 pace, burst, wmem; int i, nr_active = 0; struct sock *ssk; u64 linger_time; long tout = 0; /* pick the subflow with the lower wmem/wspace ratio */ for (i = 0; i < SSK_MODE_MAX; ++i) { send_info[i].ssk = NULL; send_info[i].linger_time = -1; } mptcp_for_each_subflow(msk, subflow) { bool backup = subflow->backup || subflow->request_bkup; trace_mptcp_subflow_get_send(subflow); ssk = mptcp_subflow_tcp_sock(subflow); if (!mptcp_subflow_active(subflow)) continue; tout = max(tout, mptcp_timeout_from_subflow(subflow)); nr_active += !backup; pace = subflow->avg_pacing_rate; if (unlikely(!pace)) { /* init pacing rate from socket */ subflow->avg_pacing_rate = READ_ONCE(ssk->sk_pacing_rate); pace 
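The zero-window handling above only probes when the peer's window is closed and every in-flight byte has been acked, re-using sequence number snd_una - 1 for a single byte whose mapping is then frozen. A hedged plain-C stand-in for that decision, not kernel code:

#include <stdint.h>
#include <stddef.h>

struct zwp_sketch {
	int probe;		/* whether to send a zero-window probe */
	uint64_t data_seq;	/* DSS sequence used for the probe */
	size_t copy;		/* bytes to send */
};

static struct zwp_sketch zero_window_probe_sketch(uint64_t allowed, uint64_t snd_una,
						  uint64_t snd_nxt, size_t tail_len,
						  int tail_is_send_head)
{
	struct zwp_sketch r = { .probe = 0, .data_seq = 0, .copy = allowed };

	if (allowed == 0 && snd_una == snd_nxt && tail_len == 0 && tail_is_send_head) {
		/* window closed and nothing in flight: 1-byte probe at snd_una - 1 */
		r.probe = 1;
		r.data_seq = snd_una - 1;
		r.copy = 1;
	}
	return r;
}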
= subflow->avg_pacing_rate; if (!pace) continue; } linger_time = div_u64((u64)READ_ONCE(ssk->sk_wmem_queued) << 32, pace); if (linger_time < send_info[backup].linger_time) { send_info[backup].ssk = ssk; send_info[backup].linger_time = linger_time; } } __mptcp_set_timeout(sk, tout); /* pick the best backup if no other subflow is active */ if (!nr_active) send_info[SSK_MODE_ACTIVE].ssk = send_info[SSK_MODE_BACKUP].ssk; /* According to the blest algorithm, to avoid HoL blocking for the * faster flow, we need to: * - estimate the faster flow linger time * - use the above to estimate the amount of byte transferred * by the faster flow * - check that the amount of queued data is greater than the above, * otherwise do not use the picked, slower, subflow * We select the subflow with the shorter estimated time to flush * the queued mem, which basically ensure the above. We just need * to check that subflow has a non empty cwin. */ ssk = send_info[SSK_MODE_ACTIVE].ssk; if (!ssk || !sk_stream_memory_free(ssk)) return NULL; burst = min_t(int, MPTCP_SEND_BURST_SIZE, mptcp_wnd_end(msk) - msk->snd_nxt); wmem = READ_ONCE(ssk->sk_wmem_queued); if (!burst) return ssk; subflow = mptcp_subflow_ctx(ssk); subflow->avg_pacing_rate = div_u64((u64)subflow->avg_pacing_rate * wmem + READ_ONCE(ssk->sk_pacing_rate) * burst, burst + wmem); msk->snd_burst = burst; return ssk; } static void mptcp_push_release(struct sock *ssk, struct mptcp_sendmsg_info *info) { tcp_push(ssk, 0, info->mss_now, tcp_sk(ssk)->nonagle, info->size_goal); release_sock(ssk); } static void mptcp_update_post_push(struct mptcp_sock *msk, struct mptcp_data_frag *dfrag, u32 sent) { u64 snd_nxt_new = dfrag->data_seq; dfrag->already_sent += sent; msk->snd_burst -= sent; snd_nxt_new += dfrag->already_sent; /* snd_nxt_new can be smaller than snd_nxt in case mptcp * is recovering after a failover. In that event, this re-sends * old segments. * * Thus compute snd_nxt_new candidate based on * the dfrag->data_seq that was sent and the data * that has been handed to the subflow for transmission * and skip update in case it was old dfrag. */ if (likely(after64(snd_nxt_new, msk->snd_nxt))) { msk->bytes_sent += snd_nxt_new - msk->snd_nxt; WRITE_ONCE(msk->snd_nxt, snd_nxt_new); } } void mptcp_check_and_set_pending(struct sock *sk) { if (mptcp_send_head(sk)) { mptcp_data_lock(sk); mptcp_sk(sk)->cb_flags |= BIT(MPTCP_PUSH_PENDING); mptcp_data_unlock(sk); } } static int __subflow_push_pending(struct sock *sk, struct sock *ssk, struct mptcp_sendmsg_info *info) { struct mptcp_sock *msk = mptcp_sk(sk); struct mptcp_data_frag *dfrag; int len, copied = 0, err = 0; while ((dfrag = mptcp_send_head(sk))) { info->sent = dfrag->already_sent; info->limit = dfrag->data_len; len = dfrag->data_len - dfrag->already_sent; while (len > 0) { int ret = 0; ret = mptcp_sendmsg_frag(sk, ssk, dfrag, info); if (ret <= 0) { err = copied ? 
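The scheduler above ranks subflows by an estimated "linger time" (queued bytes scaled by pacing rate; lower wins) and, after granting a burst, blends the subflow's pacing-rate estimate with the current TCP rate, weighted by queued bytes versus burst size. A userspace sketch of both formulas, for illustration only:

#include <stdint.h>

/* Mirrors div_u64((u64)wmem << 32, pace); the subflow with the lowest
 * value is expected to drain its queue first. */
static uint64_t linger_time_sketch(uint32_t wmem_queued, uint64_t pacing_rate)
{
	return pacing_rate ? ((uint64_t)wmem_queued << 32) / pacing_rate
			   : UINT64_MAX;
}

/* Weighted average of the old per-subflow estimate and the current TCP
 * pacing rate (assumes burst > 0, as guaranteed by the caller). */
static uint64_t avg_pacing_rate_sketch(uint64_t avg_rate, uint32_t wmem,
				       uint64_t cur_rate, uint32_t burst)
{
	return (avg_rate * wmem + cur_rate * burst) / (burst + wmem);
}

Picking the minimum linger time approximates the BLEST-style goal described in the comment above: prefer the subflow that will flush its queued data soonest, so a slow path does not head-of-line block the faster one.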
: ret; goto out; } info->sent += ret; copied += ret; len -= ret; mptcp_update_post_push(msk, dfrag, ret); } msk->first_pending = mptcp_send_next(sk); if (msk->snd_burst <= 0 || !sk_stream_memory_free(ssk) || !mptcp_subflow_active(mptcp_subflow_ctx(ssk))) { err = copied; goto out; } mptcp_set_timeout(sk); } err = copied; out: if (err > 0) msk->last_data_sent = tcp_jiffies32; return err; } void __mptcp_push_pending(struct sock *sk, unsigned int flags) { struct sock *prev_ssk = NULL, *ssk = NULL; struct mptcp_sock *msk = mptcp_sk(sk); struct mptcp_sendmsg_info info = { .flags = flags, }; bool copied = false; int push_count = 1; while (mptcp_send_head(sk) && (push_count > 0)) { struct mptcp_subflow_context *subflow; int ret = 0; if (mptcp_sched_get_send(msk)) break; push_count = 0; mptcp_for_each_subflow(msk, subflow) { if (READ_ONCE(subflow->scheduled)) { mptcp_subflow_set_scheduled(subflow, false); prev_ssk = ssk; ssk = mptcp_subflow_tcp_sock(subflow); if (ssk != prev_ssk) { /* First check. If the ssk has changed since * the last round, release prev_ssk */ if (prev_ssk) mptcp_push_release(prev_ssk, &info); /* Need to lock the new subflow only if different * from the previous one, otherwise we are still * helding the relevant lock */ lock_sock(ssk); } push_count++; ret = __subflow_push_pending(sk, ssk, &info); if (ret <= 0) { if (ret != -EAGAIN || (1 << ssk->sk_state) & (TCPF_FIN_WAIT1 | TCPF_FIN_WAIT2 | TCPF_CLOSE)) push_count--; continue; } copied = true; } } } /* at this point we held the socket lock for the last subflow we used */ if (ssk) mptcp_push_release(ssk, &info); /* Avoid scheduling the rtx timer if no data has been pushed; the timer * will be updated on positive acks by __mptcp_cleanup_una(). */ if (copied) { if (!mptcp_rtx_timer_pending(sk)) mptcp_reset_rtx_timer(sk); mptcp_check_send_data_fin(sk); } } static void __mptcp_subflow_push_pending(struct sock *sk, struct sock *ssk, bool first) { struct mptcp_sock *msk = mptcp_sk(sk); struct mptcp_sendmsg_info info = { .data_lock_held = true, }; bool keep_pushing = true; struct sock *xmit_ssk; int copied = 0; info.flags = 0; while (mptcp_send_head(sk) && keep_pushing) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); int ret = 0; /* check for a different subflow usage only after * spooling the first chunk of data */ if (first) { mptcp_subflow_set_scheduled(subflow, false); ret = __subflow_push_pending(sk, ssk, &info); first = false; if (ret <= 0) break; copied += ret; continue; } if (mptcp_sched_get_send(msk)) goto out; if (READ_ONCE(subflow->scheduled)) { mptcp_subflow_set_scheduled(subflow, false); ret = __subflow_push_pending(sk, ssk, &info); if (ret <= 0) keep_pushing = false; copied += ret; } mptcp_for_each_subflow(msk, subflow) { if (READ_ONCE(subflow->scheduled)) { xmit_ssk = mptcp_subflow_tcp_sock(subflow); if (xmit_ssk != ssk) { mptcp_subflow_delegate(subflow, MPTCP_DELEGATE_SEND); keep_pushing = false; } } } } out: /* __mptcp_alloc_tx_skb could have released some wmem and we are * not going to flush it via release_sock() */ if (copied) { tcp_push(ssk, 0, info.mss_now, tcp_sk(ssk)->nonagle, info.size_goal); if (!mptcp_rtx_timer_pending(sk)) mptcp_reset_rtx_timer(sk); if (msk->snd_data_fin_enable && msk->snd_nxt + 1 == msk->write_seq) mptcp_schedule_work(sk); } } static int mptcp_disconnect(struct sock *sk, int flags); static int mptcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg, size_t len, int *copied_syn) { unsigned int saved_flags = msg->msg_flags; struct mptcp_sock *msk = mptcp_sk(sk); struct sock 
*ssk; int ret; /* on flags-based fastopen, MPTCP is supposed to create the * first subflow right now. Otherwise we are in the defer_connect * path, and the first subflow must already be present. * Since the defer_connect flag is cleared after the first successful * fastopen attempt, no need to check for additional subflow status. */ if (msg->msg_flags & MSG_FASTOPEN) { ssk = __mptcp_nmpc_sk(msk); if (IS_ERR(ssk)) return PTR_ERR(ssk); } if (!msk->first) return -EINVAL; ssk = msk->first; lock_sock(ssk); msg->msg_flags |= MSG_DONTWAIT; msk->fastopening = 1; ret = tcp_sendmsg_fastopen(ssk, msg, copied_syn, len, NULL); msk->fastopening = 0; msg->msg_flags = saved_flags; release_sock(ssk); /* do the blocking bits of inet_stream_connect outside the ssk socket lock */ if (ret == -EINPROGRESS && !(msg->msg_flags & MSG_DONTWAIT)) { ret = __inet_stream_connect(sk->sk_socket, msg->msg_name, msg->msg_namelen, msg->msg_flags, 1); /* Keep the same behaviour as plain TCP: zero the copied bytes in * case of any error, except timeout or signal */ if (ret && ret != -EINPROGRESS && ret != -ERESTARTSYS && ret != -EINTR) *copied_syn = 0; } else if (ret && ret != -EINPROGRESS) { /* The disconnect() op called by tcp_sendmsg_fastopen()/ * __inet_stream_connect() can fail, due to the locking check, * see mptcp_disconnect(). * Attempt it again outside the problematic scope. */ if (!mptcp_disconnect(sk, 0)) { sk->sk_disconnects++; sk->sk_socket->state = SS_UNCONNECTED; } } inet_clear_bit(DEFER_CONNECT, sk); return ret; } static int do_copy_data_nocache(struct sock *sk, int copy, struct iov_iter *from, char *to) { if (sk->sk_route_caps & NETIF_F_NOCACHE_COPY) { if (!copy_from_iter_full_nocache(to, copy, from)) return -EFAULT; } else if (!copy_from_iter_full(to, copy, from)) { return -EFAULT; } return 0; } /* open-code sk_stream_memory_free() plus sent limit computation to * avoid indirect calls in fast-path. * Called under the msk socket lock, so we can avoid a bunch of ONCE * annotations.
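 * Returns 0 when the write queue is already full or the not-yet-sent * data already exceeds the notsent_lowat threshold, UINT_MAX when no * threshold is configured, and the remaining byte budget otherwise.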
*/ static u32 mptcp_send_limit(const struct sock *sk) { const struct mptcp_sock *msk = mptcp_sk(sk); u32 limit, not_sent; if (sk->sk_wmem_queued >= READ_ONCE(sk->sk_sndbuf)) return 0; limit = mptcp_notsent_lowat(sk); if (limit == UINT_MAX) return UINT_MAX; not_sent = msk->write_seq - msk->snd_nxt; if (not_sent >= limit) return 0; return limit - not_sent; } static void mptcp_rps_record_subflows(const struct mptcp_sock *msk) { struct mptcp_subflow_context *subflow; if (!rfs_is_needed()) return; mptcp_for_each_subflow(msk, subflow) { struct sock *ssk = mptcp_subflow_tcp_sock(subflow); sock_rps_record_flow(ssk); } } static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) { struct mptcp_sock *msk = mptcp_sk(sk); struct page_frag *pfrag; size_t copied = 0; int ret = 0; long timeo; /* silently ignore everything else */ msg->msg_flags &= MSG_MORE | MSG_DONTWAIT | MSG_NOSIGNAL | MSG_FASTOPEN; lock_sock(sk); mptcp_rps_record_subflows(msk); if (unlikely(inet_test_bit(DEFER_CONNECT, sk) || msg->msg_flags & MSG_FASTOPEN)) { int copied_syn = 0; ret = mptcp_sendmsg_fastopen(sk, msg, len, &copied_syn); copied += copied_syn; if (ret == -EINPROGRESS && copied_syn > 0) goto out; else if (ret) goto do_error; } timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT); if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) { ret = sk_stream_wait_connect(sk, &timeo); if (ret) goto do_error; } ret = -EPIPE; if (unlikely(sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))) goto do_error; pfrag = sk_page_frag(sk); while (msg_data_left(msg)) { int total_ts, frag_truesize = 0; struct mptcp_data_frag *dfrag; bool dfrag_collapsed; size_t psize, offset; u32 copy_limit; /* ensure fitting the notsent_lowat() constraint */ copy_limit = mptcp_send_limit(sk); if (!copy_limit) goto wait_for_memory; /* reuse tail pfrag, if possible, or carve a new one from the * page allocator */ dfrag = mptcp_pending_tail(sk); dfrag_collapsed = mptcp_frag_can_collapse_to(msk, pfrag, dfrag); if (!dfrag_collapsed) { if (!mptcp_page_frag_refill(sk, pfrag)) goto wait_for_memory; dfrag = mptcp_carve_data_frag(msk, pfrag, pfrag->offset); frag_truesize = dfrag->overhead; } /* we do not bound vs wspace, to allow a single packet. 
* memory accounting will prevent execessive memory usage * anyway */ offset = dfrag->offset + dfrag->data_len; psize = pfrag->size - offset; psize = min_t(size_t, psize, msg_data_left(msg)); psize = min_t(size_t, psize, copy_limit); total_ts = psize + frag_truesize; if (!sk_wmem_schedule(sk, total_ts)) goto wait_for_memory; ret = do_copy_data_nocache(sk, psize, &msg->msg_iter, page_address(dfrag->page) + offset); if (ret) goto do_error; /* data successfully copied into the write queue */ sk_forward_alloc_add(sk, -total_ts); copied += psize; dfrag->data_len += psize; frag_truesize += psize; pfrag->offset += frag_truesize; WRITE_ONCE(msk->write_seq, msk->write_seq + psize); /* charge data on mptcp pending queue to the msk socket * Note: we charge such data both to sk and ssk */ sk_wmem_queued_add(sk, frag_truesize); if (!dfrag_collapsed) { get_page(dfrag->page); list_add_tail(&dfrag->list, &msk->rtx_queue); if (!msk->first_pending) msk->first_pending = dfrag; } pr_debug("msk=%p dfrag at seq=%llu len=%u sent=%u new=%d\n", msk, dfrag->data_seq, dfrag->data_len, dfrag->already_sent, !dfrag_collapsed); continue; wait_for_memory: set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); __mptcp_push_pending(sk, msg->msg_flags); ret = sk_stream_wait_memory(sk, &timeo); if (ret) goto do_error; } if (copied) __mptcp_push_pending(sk, msg->msg_flags); out: release_sock(sk); return copied; do_error: if (copied) goto out; copied = sk_stream_error(sk, msg->msg_flags, ret); goto out; } static void mptcp_rcv_space_adjust(struct mptcp_sock *msk, int copied); static int __mptcp_recvmsg_mskq(struct sock *sk, struct msghdr *msg, size_t len, int flags, int copied_total, struct scm_timestamping_internal *tss, int *cmsg_flags) { struct mptcp_sock *msk = mptcp_sk(sk); struct sk_buff *skb, *tmp; int total_data_len = 0; int copied = 0; skb_queue_walk_safe(&sk->sk_receive_queue, skb, tmp) { u32 delta, offset = MPTCP_SKB_CB(skb)->offset; u32 data_len = skb->len - offset; u32 count; int err; if (flags & MSG_PEEK) { /* skip already peeked skbs */ if (total_data_len + data_len <= copied_total) { total_data_len += data_len; continue; } /* skip the already peeked data in the current skb */ delta = copied_total - total_data_len; offset += delta; data_len -= delta; } count = min_t(size_t, len - copied, data_len); if (!(flags & MSG_TRUNC)) { err = skb_copy_datagram_msg(skb, offset, msg, count); if (unlikely(err < 0)) { if (!copied) return err; break; } } if (MPTCP_SKB_CB(skb)->has_rxtstamp) { tcp_update_recv_tstamps(skb, tss); *cmsg_flags |= MPTCP_CMSG_TS; } copied += count; if (!(flags & MSG_PEEK)) { msk->bytes_consumed += count; if (count < data_len) { MPTCP_SKB_CB(skb)->offset += count; MPTCP_SKB_CB(skb)->map_seq += count; break; } /* avoid the indirect call, we know the destructor is sock_rfree */ skb->destructor = NULL; skb->sk = NULL; atomic_sub(skb->truesize, &sk->sk_rmem_alloc); sk_mem_uncharge(sk, skb->truesize); __skb_unlink(skb, &sk->sk_receive_queue); skb_attempt_defer_free(skb); } if (copied >= len) break; } mptcp_rcv_space_adjust(msk, copied); return copied; } /* receive buffer autotuning. See tcp_rcv_space_adjust for more information. * * Only difference: Use highest rtt estimate of the subflows in use. 
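 * Bytes copied to user space are accumulated in msk->rcvq_space.copied; * roughly once per RTT the total is compared with the previous * estimate and, if it grew, the msk and all subflow receive buffers * are grown as well.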
*/ static void mptcp_rcv_space_adjust(struct mptcp_sock *msk, int copied) { struct mptcp_subflow_context *subflow; struct sock *sk = (struct sock *)msk; u8 scaling_ratio = U8_MAX; u32 time, advmss = 1; u64 rtt_us, mstamp; msk_owned_by_me(msk); if (copied <= 0) return; if (!msk->rcvspace_init) mptcp_rcv_space_init(msk, msk->first); msk->rcvq_space.copied += copied; mstamp = div_u64(tcp_clock_ns(), NSEC_PER_USEC); time = tcp_stamp_us_delta(mstamp, msk->rcvq_space.time); rtt_us = msk->rcvq_space.rtt_us; if (rtt_us && time < (rtt_us >> 3)) return; rtt_us = 0; mptcp_for_each_subflow(msk, subflow) { const struct tcp_sock *tp; u64 sf_rtt_us; u32 sf_advmss; tp = tcp_sk(mptcp_subflow_tcp_sock(subflow)); sf_rtt_us = READ_ONCE(tp->rcv_rtt_est.rtt_us); sf_advmss = READ_ONCE(tp->advmss); rtt_us = max(sf_rtt_us, rtt_us); advmss = max(sf_advmss, advmss); scaling_ratio = min(tp->scaling_ratio, scaling_ratio); } msk->rcvq_space.rtt_us = rtt_us; msk->scaling_ratio = scaling_ratio; if (time < (rtt_us >> 3) || rtt_us == 0) return; if (msk->rcvq_space.copied <= msk->rcvq_space.space) goto new_measure; if (mptcp_rcvbuf_grow(sk, msk->rcvq_space.copied)) { /* Make subflows follow along. If we do not do this, we * get drops at subflow level if skbs can't be moved to * the mptcp rx queue fast enough (announced rcv_win can * exceed ssk->sk_rcvbuf). */ mptcp_for_each_subflow(msk, subflow) { struct sock *ssk; bool slow; ssk = mptcp_subflow_tcp_sock(subflow); slow = lock_sock_fast(ssk); /* subflows can be added before tcp_init_transfer() */ if (tcp_sk(ssk)->rcvq_space.space) tcp_rcvbuf_grow(ssk, msk->rcvq_space.copied); unlock_sock_fast(ssk, slow); } } new_measure: msk->rcvq_space.copied = 0; msk->rcvq_space.time = mstamp; } static bool __mptcp_move_skbs(struct sock *sk, struct list_head *skbs, u32 *delta) { struct sk_buff *skb = list_first_entry(skbs, struct sk_buff, list); struct mptcp_sock *msk = mptcp_sk(sk); bool moved = false; *delta = 0; while (1) { /* If the msk recvbuf is full stop, don't drop */ if (sk_rmem_alloc_get(sk) > sk->sk_rcvbuf) break; prefetch(skb->next); list_del(&skb->list); *delta += skb->truesize; moved |= __mptcp_move_skb(sk, skb); if (list_empty(skbs)) break; skb = list_first_entry(skbs, struct sk_buff, list); } __mptcp_ofo_queue(msk); if (moved) mptcp_check_data_fin((struct sock *)msk); return moved; } static bool mptcp_can_spool_backlog(struct sock *sk, struct list_head *skbs) { struct mptcp_sock *msk = mptcp_sk(sk); /* After CG initialization, subflows should never add skb before * gaining the CG themself. */ DEBUG_NET_WARN_ON_ONCE(msk->backlog_unaccounted && sk->sk_socket && mem_cgroup_from_sk(sk)); /* Don't spool the backlog if the rcvbuf is full. 
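 * When there is room, the whole backlog is spliced onto the * caller-provided list, so that it can be processed after dropping * the msk data lock.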
*/ if (list_empty(&msk->backlog_list) || sk_rmem_alloc_get(sk) > sk->sk_rcvbuf) return false; INIT_LIST_HEAD(skbs); list_splice_init(&msk->backlog_list, skbs); return true; } static void mptcp_backlog_spooled(struct sock *sk, u32 moved, struct list_head *skbs) { struct mptcp_sock *msk = mptcp_sk(sk); WRITE_ONCE(msk->backlog_len, msk->backlog_len - moved); list_splice(skbs, &msk->backlog_list); } static bool mptcp_move_skbs(struct sock *sk) { struct list_head skbs; bool enqueued = false; u32 moved; mptcp_data_lock(sk); while (mptcp_can_spool_backlog(sk, &skbs)) { mptcp_data_unlock(sk); enqueued |= __mptcp_move_skbs(sk, &skbs, &moved); mptcp_data_lock(sk); mptcp_backlog_spooled(sk, moved, &skbs); } mptcp_data_unlock(sk); return enqueued; } static unsigned int mptcp_inq_hint(const struct sock *sk) { const struct mptcp_sock *msk = mptcp_sk(sk); const struct sk_buff *skb; skb = skb_peek(&sk->sk_receive_queue); if (skb) { u64 hint_val = READ_ONCE(msk->ack_seq) - MPTCP_SKB_CB(skb)->map_seq; if (hint_val >= INT_MAX) return INT_MAX; return (unsigned int)hint_val; } if (sk->sk_state == TCP_CLOSE || (sk->sk_shutdown & RCV_SHUTDOWN)) return 1; return 0; } static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags, int *addr_len) { struct mptcp_sock *msk = mptcp_sk(sk); struct scm_timestamping_internal tss; int copied = 0, cmsg_flags = 0; int target; long timeo; /* MSG_ERRQUEUE is really a no-op till we support IP_RECVERR */ if (unlikely(flags & MSG_ERRQUEUE)) return inet_recv_error(sk, msg, len, addr_len); lock_sock(sk); if (unlikely(sk->sk_state == TCP_LISTEN)) { copied = -ENOTCONN; goto out_err; } mptcp_rps_record_subflows(msk); timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT); len = min_t(size_t, len, INT_MAX); target = sock_rcvlowat(sk, flags & MSG_WAITALL, len); if (unlikely(msk->recvmsg_inq)) cmsg_flags = MPTCP_CMSG_INQ; while (copied < len) { int err, bytes_read; bytes_read = __mptcp_recvmsg_mskq(sk, msg, len - copied, flags, copied, &tss, &cmsg_flags); if (unlikely(bytes_read < 0)) { if (!copied) copied = bytes_read; goto out_err; } copied += bytes_read; if (!list_empty(&msk->backlog_list) && mptcp_move_skbs(sk)) continue; /* only the MPTCP socket status is relevant here. The exit * conditions mirror closely tcp_recvmsg() */ if (copied >= target) break; if (copied) { if (sk->sk_err || sk->sk_state == TCP_CLOSE || (sk->sk_shutdown & RCV_SHUTDOWN) || !timeo || signal_pending(current)) break; } else { if (sk->sk_err) { copied = sock_error(sk); break; } if (sk->sk_shutdown & RCV_SHUTDOWN) break; if (sk->sk_state == TCP_CLOSE) { copied = -ENOTCONN; break; } if (!timeo) { copied = -EAGAIN; break; } if (signal_pending(current)) { copied = sock_intr_errno(timeo); break; } } pr_debug("block timeout %ld\n", timeo); mptcp_cleanup_rbuf(msk, copied); err = sk_wait_data(sk, &timeo, NULL); if (err < 0) { err = copied ? 
: err; goto out_err; } } mptcp_cleanup_rbuf(msk, copied); out_err: if (cmsg_flags && copied >= 0) { if (cmsg_flags & MPTCP_CMSG_TS) tcp_recv_timestamp(msg, sk, &tss); if (cmsg_flags & MPTCP_CMSG_INQ) { unsigned int inq = mptcp_inq_hint(sk); put_cmsg(msg, SOL_TCP, TCP_CM_INQ, sizeof(inq), &inq); } } pr_debug("msk=%p rx queue empty=%d copied=%d\n", msk, skb_queue_empty(&sk->sk_receive_queue), copied); release_sock(sk); return copied; } static void mptcp_retransmit_timer(struct timer_list *t) { struct sock *sk = timer_container_of(sk, t, mptcp_retransmit_timer); struct mptcp_sock *msk = mptcp_sk(sk); bh_lock_sock(sk); if (!sock_owned_by_user(sk)) { /* we need a process context to retransmit */ if (!test_and_set_bit(MPTCP_WORK_RTX, &msk->flags)) mptcp_schedule_work(sk); } else { /* delegate our work to tcp_release_cb() */ __set_bit(MPTCP_RETRANSMIT, &msk->cb_flags); } bh_unlock_sock(sk); sock_put(sk); } static void mptcp_tout_timer(struct timer_list *t) { struct inet_connection_sock *icsk = timer_container_of(icsk, t, mptcp_tout_timer); struct sock *sk = &icsk->icsk_inet.sk; mptcp_schedule_work(sk); sock_put(sk); } /* Find an idle subflow. Return NULL if there is unacked data at tcp * level. * * A backup subflow is returned only if that is the only kind available. */ struct sock *mptcp_subflow_get_retrans(struct mptcp_sock *msk) { struct sock *backup = NULL, *pick = NULL; struct mptcp_subflow_context *subflow; int min_stale_count = INT_MAX; mptcp_for_each_subflow(msk, subflow) { struct sock *ssk = mptcp_subflow_tcp_sock(subflow); if (!__mptcp_subflow_active(subflow)) continue; /* still data outstanding at TCP level? skip this */ if (!tcp_rtx_and_write_queues_empty(ssk)) { mptcp_pm_subflow_chk_stale(msk, ssk); min_stale_count = min_t(int, min_stale_count, subflow->stale_count); continue; } if (subflow->backup || subflow->request_bkup) { if (!backup) backup = ssk; continue; } if (!pick) pick = ssk; } if (pick) return pick; /* use backup only if there are no progresses anywhere */ return min_stale_count > 1 ? backup : NULL; } bool __mptcp_retransmit_pending_data(struct sock *sk) { struct mptcp_data_frag *cur, *rtx_head; struct mptcp_sock *msk = mptcp_sk(sk); if (__mptcp_check_fallback(msk)) return false; /* the closing socket has some data untransmitted and/or unacked: * some data in the mptcp rtx queue has not really xmitted yet. 
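 * Rather than tracking which DSS mappings actually reached the peer,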
* keep it simple and re-inject the whole mptcp level rtx queue */ mptcp_data_lock(sk); __mptcp_clean_una_wakeup(sk); rtx_head = mptcp_rtx_head(sk); if (!rtx_head) { mptcp_data_unlock(sk); return false; } msk->recovery_snd_nxt = msk->snd_nxt; msk->recovery = true; mptcp_data_unlock(sk); msk->first_pending = rtx_head; msk->snd_burst = 0; /* be sure to clear the "sent status" on all re-injected fragments */ list_for_each_entry(cur, &msk->rtx_queue, list) { if (!cur->already_sent) break; cur->already_sent = 0; } return true; } /* flags for __mptcp_close_ssk() */ #define MPTCP_CF_PUSH BIT(1) /* be sure to send a reset only if the caller asked for it, also * completely clean the subflow status when the subflow reaches * TCP_CLOSE state */ static void __mptcp_subflow_disconnect(struct sock *ssk, struct mptcp_subflow_context *subflow, bool fastclosing) { if (((1 << ssk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)) || fastclosing) { /* The MPTCP code never waits on the subflow sockets, the TCP-level * disconnect should never fail */ WARN_ON_ONCE(tcp_disconnect(ssk, 0)); mptcp_subflow_ctx_reset(subflow); } else { tcp_shutdown(ssk, SEND_SHUTDOWN); } } /* subflow sockets can be either outgoing (connect) or incoming * (accept). * * Outgoing subflows use in-kernel sockets. * Incoming subflows do not have their own 'struct socket' allocated, * so we need to use tcp_close() after detaching them from the mptcp * parent socket. */ static void __mptcp_close_ssk(struct sock *sk, struct sock *ssk, struct mptcp_subflow_context *subflow, unsigned int flags) { struct mptcp_sock *msk = mptcp_sk(sk); bool dispose_it, need_push = false; int fwd_remaining; /* Do not pass RX data to the msk, even if the subflow socket is not * going to be freed (i.e. even for the first subflow on graceful * subflow close). */ lock_sock_nested(ssk, SINGLE_DEPTH_NESTING); subflow->closing = 1; /* Borrow the fwd allocated page left-over; fwd memory for the subflow * could be negative at this point, but will reach zero soon - when * the data allocated from such fragment is freed. */ if (subflow->lent_mem_frag) { fwd_remaining = PAGE_SIZE - subflow->lent_mem_frag; sk_forward_alloc_add(sk, fwd_remaining); sk_forward_alloc_add(ssk, -fwd_remaining); subflow->lent_mem_frag = 0; } /* If the first subflow moved to a close state before accept, e.g. due * to an incoming reset or listener shutdown, the subflow socket is * already deleted by inet_child_forget() and the mptcp socket can't * survive either.
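 * In that case the msk is marked dead and its close timeout is moved * into the past, so that mptcp_close_tout_expired() is already true * when the worker runs and disposes of the socket.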
*/ if (msk->in_accept_queue && msk->first == ssk && (sock_flag(sk, SOCK_DEAD) || sock_flag(ssk, SOCK_DEAD))) { /* ensure later check in mptcp_worker() will dispose the msk */ sock_set_flag(sk, SOCK_DEAD); mptcp_set_close_tout(sk, tcp_jiffies32 - (mptcp_close_timeout(sk) + 1)); mptcp_subflow_drop_ctx(ssk); goto out_release; } dispose_it = msk->free_first || ssk != msk->first; if (dispose_it) list_del(&subflow->node); if (subflow->send_fastclose && ssk->sk_state != TCP_CLOSE) tcp_set_state(ssk, TCP_CLOSE); need_push = (flags & MPTCP_CF_PUSH) && __mptcp_retransmit_pending_data(sk); if (!dispose_it) { __mptcp_subflow_disconnect(ssk, subflow, msk->fastclosing); release_sock(ssk); goto out; } subflow->disposable = 1; /* if ssk hit tcp_done(), tcp_cleanup_ulp() cleared the related ops * the ssk has been already destroyed, we just need to release the * reference owned by msk; */ if (!inet_csk(ssk)->icsk_ulp_ops) { WARN_ON_ONCE(!sock_flag(ssk, SOCK_DEAD)); kfree_rcu(subflow, rcu); } else { /* otherwise tcp will dispose of the ssk and subflow ctx */ __tcp_close(ssk, 0); /* close acquired an extra ref */ __sock_put(ssk); } out_release: __mptcp_subflow_error_report(sk, ssk); release_sock(ssk); sock_put(ssk); if (ssk == msk->first) WRITE_ONCE(msk->first, NULL); out: __mptcp_sync_sndbuf(sk); if (need_push) __mptcp_push_pending(sk, 0); /* Catch every 'all subflows closed' scenario, including peers silently * closing them, e.g. due to timeout. * For established sockets, allow an additional timeout before closing, * as the protocol can still create more subflows. */ if (list_is_singular(&msk->conn_list) && msk->first && inet_sk_state_load(msk->first) == TCP_CLOSE) { if (sk->sk_state != TCP_ESTABLISHED || msk->in_accept_queue || sock_flag(sk, SOCK_DEAD)) { mptcp_set_state(sk, TCP_CLOSE); mptcp_close_wake_up(sk); } else { mptcp_start_tout_timer(sk); } } } void mptcp_close_ssk(struct sock *sk, struct sock *ssk, struct mptcp_subflow_context *subflow) { struct mptcp_sock *msk = mptcp_sk(sk); struct sk_buff *skb; /* The first subflow can already be closed and still in the list */ if (subflow->close_event_done) return; subflow->close_event_done = true; if (sk->sk_state == TCP_ESTABLISHED) mptcp_event(MPTCP_EVENT_SUB_CLOSED, mptcp_sk(sk), ssk, GFP_KERNEL); /* Remove any reference from the backlog to this ssk; backlog skbs consume * space in the msk receive queue, no need to touch sk->sk_rmem_alloc */ list_for_each_entry(skb, &msk->backlog_list, list) { if (skb->sk != ssk) continue; atomic_sub(skb->truesize, &skb->sk->sk_rmem_alloc); skb->sk = NULL; } /* subflow aborted before reaching the fully_established status * attempt the creation of the next subflow */ mptcp_pm_subflow_check_next(mptcp_sk(sk), subflow); __mptcp_close_ssk(sk, ssk, subflow, MPTCP_CF_PUSH); } static unsigned int mptcp_sync_mss(struct sock *sk, u32 pmtu) { return 0; } static void __mptcp_close_subflow(struct sock *sk) { struct mptcp_subflow_context *subflow, *tmp; struct mptcp_sock *msk = mptcp_sk(sk); might_sleep(); mptcp_for_each_subflow_safe(msk, subflow, tmp) { struct sock *ssk = mptcp_subflow_tcp_sock(subflow); int ssk_state = inet_sk_state_load(ssk); if (ssk_state != TCP_CLOSE && (ssk_state != TCP_CLOSE_WAIT || inet_sk_state_load(sk) != TCP_ESTABLISHED || __mptcp_check_fallback(msk))) continue; /* 'subflow_data_ready' will re-sched once rx queue is empty */ if (!skb_queue_empty_lockless(&ssk->sk_receive_queue)) continue; mptcp_close_ssk(sk, ssk, subflow); } } static bool mptcp_close_tout_expired(const struct sock *sk) { if 
(!inet_csk(sk)->icsk_mtup.probe_timestamp || sk->sk_state == TCP_CLOSE) return false; return time_after32(tcp_jiffies32, inet_csk(sk)->icsk_mtup.probe_timestamp + mptcp_close_timeout(sk)); } static void mptcp_check_fastclose(struct mptcp_sock *msk) { struct mptcp_subflow_context *subflow, *tmp; struct sock *sk = (struct sock *)msk; if (likely(!READ_ONCE(msk->rcv_fastclose))) return; mptcp_token_destroy(msk); mptcp_for_each_subflow_safe(msk, subflow, tmp) { struct sock *tcp_sk = mptcp_subflow_tcp_sock(subflow); bool slow; slow = lock_sock_fast(tcp_sk); if (tcp_sk->sk_state != TCP_CLOSE) { mptcp_send_active_reset_reason(tcp_sk); tcp_set_state(tcp_sk, TCP_CLOSE); } unlock_sock_fast(tcp_sk, slow); } /* Mirror the tcp_reset() error propagation */ switch (sk->sk_state) { case TCP_SYN_SENT: WRITE_ONCE(sk->sk_err, ECONNREFUSED); break; case TCP_CLOSE_WAIT: WRITE_ONCE(sk->sk_err, EPIPE); break; case TCP_CLOSE: return; default: WRITE_ONCE(sk->sk_err, ECONNRESET); } mptcp_set_state(sk, TCP_CLOSE); WRITE_ONCE(sk->sk_shutdown, SHUTDOWN_MASK); smp_mb__before_atomic(); /* SHUTDOWN must be visible first */ set_bit(MPTCP_WORK_CLOSE_SUBFLOW, &msk->flags); /* the calling mptcp_worker will properly destroy the socket */ if (sock_flag(sk, SOCK_DEAD)) return; sk->sk_state_change(sk); sk_error_report(sk); } static void __mptcp_retrans(struct sock *sk) { struct mptcp_sendmsg_info info = { .data_lock_held = true, }; struct mptcp_sock *msk = mptcp_sk(sk); struct mptcp_subflow_context *subflow; struct mptcp_data_frag *dfrag; struct sock *ssk; int ret, err; u16 len = 0; mptcp_clean_una_wakeup(sk); /* first check ssk: need to kick "stale" logic */ err = mptcp_sched_get_retrans(msk); dfrag = mptcp_rtx_head(sk); if (!dfrag) { if (mptcp_data_fin_enabled(msk)) { struct inet_connection_sock *icsk = inet_csk(sk); WRITE_ONCE(icsk->icsk_retransmits, icsk->icsk_retransmits + 1); mptcp_set_datafin_timeout(sk); mptcp_send_ack(msk); goto reset_timer; } if (!mptcp_send_head(sk)) goto clear_scheduled; goto reset_timer; } if (err) goto reset_timer; mptcp_for_each_subflow(msk, subflow) { if (READ_ONCE(subflow->scheduled)) { u16 copied = 0; mptcp_subflow_set_scheduled(subflow, false); ssk = mptcp_subflow_tcp_sock(subflow); lock_sock(ssk); /* limit retransmission to the bytes already sent on some subflows */ info.sent = 0; info.limit = READ_ONCE(msk->csum_enabled) ? dfrag->data_len : dfrag->already_sent; /* * make the whole retrans decision, xmit, disallow * fallback atomic, note that we can't retrans even * when an infinite fallback is in progress, i.e. new * subflows are disallowed. 
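 * The fallback_lock makes this section mutually exclusive with the * fallback transition and with new subflow creation, so the checks * below stay valid for the whole retransmit attempt.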
*/ spin_lock_bh(&msk->fallback_lock); if (__mptcp_check_fallback(msk) || !msk->allow_subflows) { spin_unlock_bh(&msk->fallback_lock); release_sock(ssk); goto clear_scheduled; } while (info.sent < info.limit) { ret = mptcp_sendmsg_frag(sk, ssk, dfrag, &info); if (ret <= 0) break; MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_RETRANSSEGS); copied += ret; info.sent += ret; } if (copied) { len = max(copied, len); tcp_push(ssk, 0, info.mss_now, tcp_sk(ssk)->nonagle, info.size_goal); msk->allow_infinite_fallback = false; } spin_unlock_bh(&msk->fallback_lock); release_sock(ssk); } } msk->bytes_retrans += len; dfrag->already_sent = max(dfrag->already_sent, len); reset_timer: mptcp_check_and_set_pending(sk); if (!mptcp_rtx_timer_pending(sk)) mptcp_reset_rtx_timer(sk); clear_scheduled: /* If no rtx data was available or in case of fallback, there * could be left-over scheduled subflows; clear them all * or later xmit could use bad ones */ mptcp_for_each_subflow(msk, subflow) if (READ_ONCE(subflow->scheduled)) mptcp_subflow_set_scheduled(subflow, false); } /* schedule the timeout timer for the relevant event: either close timeout * or mp_fail timeout. The close timeout takes precedence on the mp_fail one */ void mptcp_reset_tout_timer(struct mptcp_sock *msk, unsigned long fail_tout) { struct sock *sk = (struct sock *)msk; unsigned long timeout, close_timeout; if (!fail_tout && !inet_csk(sk)->icsk_mtup.probe_timestamp) return; close_timeout = (unsigned long)inet_csk(sk)->icsk_mtup.probe_timestamp - tcp_jiffies32 + jiffies + mptcp_close_timeout(sk); /* the close timeout takes precedence on the fail one, and here at least one of * them is active */ timeout = inet_csk(sk)->icsk_mtup.probe_timestamp ? close_timeout : fail_tout; sk_reset_timer(sk, &inet_csk(sk)->mptcp_tout_timer, timeout); } static void mptcp_mp_fail_no_response(struct mptcp_sock *msk) { struct sock *ssk = msk->first; bool slow; if (!ssk) return; pr_debug("MP_FAIL doesn't respond, reset the subflow\n"); slow = lock_sock_fast(ssk); mptcp_subflow_reset(ssk); WRITE_ONCE(mptcp_subflow_ctx(ssk)->fail_tout, 0); unlock_sock_fast(ssk, slow); } static void mptcp_backlog_purge(struct sock *sk) { struct mptcp_sock *msk = mptcp_sk(sk); struct sk_buff *tmp, *skb; LIST_HEAD(backlog); mptcp_data_lock(sk); list_splice_init(&msk->backlog_list, &backlog); msk->backlog_len = 0; mptcp_data_unlock(sk); list_for_each_entry_safe(skb, tmp, &backlog, list) { mptcp_borrow_fwdmem(sk, skb); kfree_skb_reason(skb, SKB_DROP_REASON_SOCKET_CLOSE); } sk_mem_reclaim(sk); } static void mptcp_do_fastclose(struct sock *sk) { struct mptcp_subflow_context *subflow, *tmp; struct mptcp_sock *msk = mptcp_sk(sk); mptcp_set_state(sk, TCP_CLOSE); mptcp_backlog_purge(sk); msk->fastclosing = 1; /* Explicitly send the fastclose reset as need */ if (__mptcp_check_fallback(msk)) return; mptcp_for_each_subflow_safe(msk, subflow, tmp) { struct sock *ssk = mptcp_subflow_tcp_sock(subflow); lock_sock(ssk); /* Some subflow socket states don't allow/need a reset.*/ if ((1 << ssk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE)) goto unlock; subflow->send_fastclose = 1; /* Initialize rcv_mss to TCP_MIN_MSS to avoid division by 0 * issue in __tcp_select_window(), see tcp_disconnect(). 
*/ inet_csk(ssk)->icsk_ack.rcv_mss = TCP_MIN_MSS; tcp_send_active_reset(ssk, ssk->sk_allocation, SK_RST_REASON_TCP_ABORT_ON_CLOSE); unlock: release_sock(ssk); } } static void mptcp_worker(struct work_struct *work) { struct mptcp_sock *msk = container_of(work, struct mptcp_sock, work); struct sock *sk = (struct sock *)msk; unsigned long fail_tout; int state; lock_sock(sk); state = sk->sk_state; if (unlikely((1 << state) & (TCPF_CLOSE | TCPF_LISTEN))) goto unlock; mptcp_check_fastclose(msk); mptcp_pm_worker(msk); mptcp_check_send_data_fin(sk); mptcp_check_data_fin_ack(sk); mptcp_check_data_fin(sk); if (test_and_clear_bit(MPTCP_WORK_CLOSE_SUBFLOW, &msk->flags)) __mptcp_close_subflow(sk); if (mptcp_close_tout_expired(sk)) { struct mptcp_subflow_context *subflow, *tmp; mptcp_do_fastclose(sk); mptcp_for_each_subflow_safe(msk, subflow, tmp) __mptcp_close_ssk(sk, subflow->tcp_sock, subflow, 0); mptcp_close_wake_up(sk); } if (sock_flag(sk, SOCK_DEAD) && sk->sk_state == TCP_CLOSE) { __mptcp_destroy_sock(sk); goto unlock; } if (test_and_clear_bit(MPTCP_WORK_RTX, &msk->flags)) __mptcp_retrans(sk); fail_tout = msk->first ? READ_ONCE(mptcp_subflow_ctx(msk->first)->fail_tout) : 0; if (fail_tout && time_after(jiffies, fail_tout)) mptcp_mp_fail_no_response(msk); unlock: release_sock(sk); sock_put(sk); } static void __mptcp_init_sock(struct sock *sk) { struct mptcp_sock *msk = mptcp_sk(sk); INIT_LIST_HEAD(&msk->conn_list); INIT_LIST_HEAD(&msk->join_list); INIT_LIST_HEAD(&msk->rtx_queue); INIT_LIST_HEAD(&msk->backlog_list); INIT_WORK(&msk->work, mptcp_worker); msk->out_of_order_queue = RB_ROOT; msk->first_pending = NULL; msk->timer_ival = TCP_RTO_MIN; msk->scaling_ratio = TCP_DEFAULT_SCALING_RATIO; msk->backlog_len = 0; WRITE_ONCE(msk->first, NULL); inet_csk(sk)->icsk_sync_mss = mptcp_sync_mss; WRITE_ONCE(msk->csum_enabled, mptcp_is_checksum_enabled(sock_net(sk))); msk->allow_infinite_fallback = true; msk->allow_subflows = true; msk->recovery = false; msk->subflow_id = 1; msk->last_data_sent = tcp_jiffies32; msk->last_data_recv = tcp_jiffies32; msk->last_ack_recv = tcp_jiffies32; mptcp_pm_data_init(msk); spin_lock_init(&msk->fallback_lock); /* re-use the csk retrans timer for MPTCP-level retrans */ timer_setup(&sk->mptcp_retransmit_timer, mptcp_retransmit_timer, 0); timer_setup(&msk->sk.mptcp_tout_timer, mptcp_tout_timer, 0); } static void mptcp_ca_reset(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); tcp_assign_congestion_control(sk); strscpy(mptcp_sk(sk)->ca_name, icsk->icsk_ca_ops->name, sizeof(mptcp_sk(sk)->ca_name)); /* no need to keep a reference to the ops, the name will suffice */ tcp_cleanup_congestion_control(sk); icsk->icsk_ca_ops = NULL; } static int mptcp_init_sock(struct sock *sk) { struct net *net = sock_net(sk); int ret; __mptcp_init_sock(sk); if (!mptcp_is_enabled(net)) return -ENOPROTOOPT; if (unlikely(!net->mib.mptcp_statistics) && !mptcp_mib_alloc(net)) return -ENOMEM; rcu_read_lock(); ret = mptcp_init_sched(mptcp_sk(sk), mptcp_sched_find(mptcp_get_scheduler(net))); rcu_read_unlock(); if (ret) return ret; set_bit(SOCK_CUSTOM_SOCKOPT, &sk->sk_socket->flags); /* fetch the ca name; do it outside __mptcp_init_sock(), so that clone will * propagate the correct value */ mptcp_ca_reset(sk); sk_sockets_allocated_inc(sk); sk->sk_rcvbuf = READ_ONCE(net->ipv4.sysctl_tcp_rmem[1]); sk->sk_sndbuf = READ_ONCE(net->ipv4.sysctl_tcp_wmem[1]); return 0; } static void __mptcp_clear_xmit(struct sock *sk) { struct mptcp_sock *msk = mptcp_sk(sk); struct mptcp_data_frag *dtmp, *dfrag; 
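/* forget the pending send head and free every dfrag still sitting in * the rtx queue, whether or not it has been transmitted */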
msk->first_pending = NULL; list_for_each_entry_safe(dfrag, dtmp, &msk->rtx_queue, list) dfrag_clear(sk, dfrag); } void mptcp_cancel_work(struct sock *sk) { struct mptcp_sock *msk = mptcp_sk(sk); if (cancel_work_sync(&msk->work)) __sock_put(sk); } void mptcp_subflow_shutdown(struct sock *sk, struct sock *ssk, int how) { lock_sock(ssk); switch (ssk->sk_state) { case TCP_LISTEN: if (!(how & RCV_SHUTDOWN)) break; fallthrough; case TCP_SYN_SENT: WARN_ON_ONCE(tcp_disconnect(ssk, O_NONBLOCK)); break; default: if (__mptcp_check_fallback(mptcp_sk(sk))) { pr_debug("Fallback\n"); ssk->sk_shutdown |= how; tcp_shutdown(ssk, how); /* simulate the data_fin ack reception to let the state * machine move forward */ WRITE_ONCE(mptcp_sk(sk)->snd_una, mptcp_sk(sk)->snd_nxt); mptcp_schedule_work(sk); } else { pr_debug("Sending DATA_FIN on subflow %p\n", ssk); tcp_send_ack(ssk); if (!mptcp_rtx_timer_pending(sk)) mptcp_reset_rtx_timer(sk); } break; } release_sock(ssk); } void mptcp_set_state(struct sock *sk, int state) { int oldstate = sk->sk_state; switch (state) { case TCP_ESTABLISHED: if (oldstate != TCP_ESTABLISHED) MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_CURRESTAB); break; case TCP_CLOSE_WAIT: /* Unlike TCP, MPTCP sk would not have the TCP_SYN_RECV state: * MPTCP "accepted" sockets will be created later on. So no * transition from TCP_SYN_RECV to TCP_CLOSE_WAIT. */ break; default: if (oldstate == TCP_ESTABLISHED || oldstate == TCP_CLOSE_WAIT) MPTCP_DEC_STATS(sock_net(sk), MPTCP_MIB_CURRESTAB); } inet_sk_state_store(sk, state); } static const unsigned char new_state[16] = { /* current state: new state: action: */ [0 /* (Invalid) */] = TCP_CLOSE, [TCP_ESTABLISHED] = TCP_FIN_WAIT1 | TCP_ACTION_FIN, [TCP_SYN_SENT] = TCP_CLOSE, [TCP_SYN_RECV] = TCP_FIN_WAIT1 | TCP_ACTION_FIN, [TCP_FIN_WAIT1] = TCP_FIN_WAIT1, [TCP_FIN_WAIT2] = TCP_FIN_WAIT2, [TCP_TIME_WAIT] = TCP_CLOSE, /* should not happen ! */ [TCP_CLOSE] = TCP_CLOSE, [TCP_CLOSE_WAIT] = TCP_LAST_ACK | TCP_ACTION_FIN, [TCP_LAST_ACK] = TCP_LAST_ACK, [TCP_LISTEN] = TCP_CLOSE, [TCP_CLOSING] = TCP_CLOSING, [TCP_NEW_SYN_RECV] = TCP_CLOSE, /* should not happen ! 
*/ }; static int mptcp_close_state(struct sock *sk) { int next = (int)new_state[sk->sk_state]; int ns = next & TCP_STATE_MASK; mptcp_set_state(sk, ns); return next & TCP_ACTION_FIN; } static void mptcp_check_send_data_fin(struct sock *sk) { struct mptcp_subflow_context *subflow; struct mptcp_sock *msk = mptcp_sk(sk); pr_debug("msk=%p snd_data_fin_enable=%d pending=%d snd_nxt=%llu write_seq=%llu\n", msk, msk->snd_data_fin_enable, !!mptcp_send_head(sk), msk->snd_nxt, msk->write_seq); /* we still need to enqueue subflows or not really shutting down, * skip this */ if (!msk->snd_data_fin_enable || msk->snd_nxt + 1 != msk->write_seq || mptcp_send_head(sk)) return; WRITE_ONCE(msk->snd_nxt, msk->write_seq); mptcp_for_each_subflow(msk, subflow) { struct sock *tcp_sk = mptcp_subflow_tcp_sock(subflow); mptcp_subflow_shutdown(sk, tcp_sk, SEND_SHUTDOWN); } } static void __mptcp_wr_shutdown(struct sock *sk) { struct mptcp_sock *msk = mptcp_sk(sk); pr_debug("msk=%p snd_data_fin_enable=%d shutdown=%x state=%d pending=%d\n", msk, msk->snd_data_fin_enable, sk->sk_shutdown, sk->sk_state, !!mptcp_send_head(sk)); /* will be ignored by fallback sockets */ WRITE_ONCE(msk->write_seq, msk->write_seq + 1); WRITE_ONCE(msk->snd_data_fin_enable, 1); mptcp_check_send_data_fin(sk); } static void __mptcp_destroy_sock(struct sock *sk) { struct mptcp_sock *msk = mptcp_sk(sk); pr_debug("msk=%p\n", msk); might_sleep(); mptcp_stop_rtx_timer(sk); sk_stop_timer(sk, &inet_csk(sk)->mptcp_tout_timer); msk->pm.status = 0; mptcp_release_sched(msk); sk->sk_prot->destroy(sk); sk_stream_kill_queues(sk); xfrm_sk_free_policy(sk); sock_put(sk); } void __mptcp_unaccepted_force_close(struct sock *sk) { sock_set_flag(sk, SOCK_DEAD); mptcp_do_fastclose(sk); __mptcp_destroy_sock(sk); } static __poll_t mptcp_check_readable(struct sock *sk) { return mptcp_epollin_ready(sk) ? 
EPOLLIN | EPOLLRDNORM : 0; } static void mptcp_check_listen_stop(struct sock *sk) { struct sock *ssk; if (inet_sk_state_load(sk) != TCP_LISTEN) return; sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); ssk = mptcp_sk(sk)->first; if (WARN_ON_ONCE(!ssk || inet_sk_state_load(ssk) != TCP_LISTEN)) return; lock_sock_nested(ssk, SINGLE_DEPTH_NESTING); tcp_set_state(ssk, TCP_CLOSE); mptcp_subflow_queue_clean(sk, ssk); inet_csk_listen_stop(ssk); mptcp_event_pm_listener(ssk, MPTCP_EVENT_LISTENER_CLOSED); release_sock(ssk); } bool __mptcp_close(struct sock *sk, long timeout) { struct mptcp_subflow_context *subflow; struct mptcp_sock *msk = mptcp_sk(sk); bool do_cancel_work = false; int subflows_alive = 0; WRITE_ONCE(sk->sk_shutdown, SHUTDOWN_MASK); if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE)) { mptcp_check_listen_stop(sk); mptcp_set_state(sk, TCP_CLOSE); goto cleanup; } if (mptcp_data_avail(msk) || timeout < 0) { /* If the msk has read data, or the caller explicitly ask it, * do the MPTCP equivalent of TCP reset, aka MPTCP fastclose */ mptcp_do_fastclose(sk); timeout = 0; } else if (mptcp_close_state(sk)) { __mptcp_wr_shutdown(sk); } sk_stream_wait_close(sk, timeout); cleanup: /* orphan all the subflows */ mptcp_for_each_subflow(msk, subflow) { struct sock *ssk = mptcp_subflow_tcp_sock(subflow); bool slow = lock_sock_fast_nested(ssk); subflows_alive += ssk->sk_state != TCP_CLOSE; /* since the close timeout takes precedence on the fail one, * cancel the latter */ if (ssk == msk->first) subflow->fail_tout = 0; /* detach from the parent socket, but allow data_ready to * push incoming data into the mptcp stack, to properly ack it */ ssk->sk_socket = NULL; ssk->sk_wq = NULL; unlock_sock_fast(ssk, slow); } sock_orphan(sk); /* all the subflows are closed, only timeout can change the msk * state, let's not keep resources busy for no reasons */ if (subflows_alive == 0) mptcp_set_state(sk, TCP_CLOSE); sock_hold(sk); pr_debug("msk=%p state=%d\n", sk, sk->sk_state); mptcp_pm_connection_closed(msk); if (sk->sk_state == TCP_CLOSE) { __mptcp_destroy_sock(sk); do_cancel_work = true; } else { mptcp_start_tout_timer(sk); } return do_cancel_work; } static void mptcp_close(struct sock *sk, long timeout) { bool do_cancel_work; lock_sock(sk); do_cancel_work = __mptcp_close(sk, timeout); release_sock(sk); if (do_cancel_work) mptcp_cancel_work(sk); sock_put(sk); } static void mptcp_copy_inaddrs(struct sock *msk, const struct sock *ssk) { #if IS_ENABLED(CONFIG_MPTCP_IPV6) const struct ipv6_pinfo *ssk6 = inet6_sk(ssk); struct ipv6_pinfo *msk6 = inet6_sk(msk); msk->sk_v6_daddr = ssk->sk_v6_daddr; msk->sk_v6_rcv_saddr = ssk->sk_v6_rcv_saddr; if (msk6 && ssk6) { msk6->saddr = ssk6->saddr; msk6->flow_label = ssk6->flow_label; } #endif inet_sk(msk)->inet_num = inet_sk(ssk)->inet_num; inet_sk(msk)->inet_dport = inet_sk(ssk)->inet_dport; inet_sk(msk)->inet_sport = inet_sk(ssk)->inet_sport; inet_sk(msk)->inet_daddr = inet_sk(ssk)->inet_daddr; inet_sk(msk)->inet_saddr = inet_sk(ssk)->inet_saddr; inet_sk(msk)->inet_rcv_saddr = inet_sk(ssk)->inet_rcv_saddr; } static void mptcp_destroy_common(struct mptcp_sock *msk) { struct mptcp_subflow_context *subflow, *tmp; struct sock *sk = (struct sock *)msk; __mptcp_clear_xmit(sk); mptcp_backlog_purge(sk); /* join list will be eventually flushed (with rst) at sock lock release time */ mptcp_for_each_subflow_safe(msk, subflow, tmp) __mptcp_close_ssk(sk, mptcp_subflow_tcp_sock(subflow), subflow, 0); __skb_queue_purge(&sk->sk_receive_queue); skb_rbtree_purge(&msk->out_of_order_queue); /* 
move all the rx fwd alloc into the sk_mem_reclaim_final in * inet_sock_destruct() will dispose it */ mptcp_token_destroy(msk); mptcp_pm_destroy(msk); } static int mptcp_disconnect(struct sock *sk, int flags) { struct mptcp_sock *msk = mptcp_sk(sk); /* We are on the fastopen error path. We can't call straight into the * subflows cleanup code due to lock nesting (we are already under * msk->firstsocket lock). */ if (msk->fastopening) return -EBUSY; mptcp_check_listen_stop(sk); mptcp_set_state(sk, TCP_CLOSE); mptcp_stop_rtx_timer(sk); mptcp_stop_tout_timer(sk); mptcp_pm_connection_closed(msk); /* msk->subflow is still intact, the following will not free the first * subflow */ mptcp_do_fastclose(sk); mptcp_destroy_common(msk); /* The first subflow is already in TCP_CLOSE status, the following * can't overlap with a fallback anymore */ spin_lock_bh(&msk->fallback_lock); msk->allow_subflows = true; msk->allow_infinite_fallback = true; WRITE_ONCE(msk->flags, 0); spin_unlock_bh(&msk->fallback_lock); msk->cb_flags = 0; msk->recovery = false; WRITE_ONCE(msk->can_ack, false); WRITE_ONCE(msk->fully_established, false); WRITE_ONCE(msk->rcv_data_fin, false); WRITE_ONCE(msk->snd_data_fin_enable, false); WRITE_ONCE(msk->rcv_fastclose, false); WRITE_ONCE(msk->use_64bit_ack, false); WRITE_ONCE(msk->csum_enabled, mptcp_is_checksum_enabled(sock_net(sk))); mptcp_pm_data_reset(msk); mptcp_ca_reset(sk); msk->bytes_consumed = 0; msk->bytes_acked = 0; msk->bytes_received = 0; msk->bytes_sent = 0; msk->bytes_retrans = 0; msk->rcvspace_init = 0; msk->fastclosing = 0; /* for fallback's sake */ WRITE_ONCE(msk->ack_seq, 0); WRITE_ONCE(sk->sk_shutdown, 0); sk_error_report(sk); return 0; } #if IS_ENABLED(CONFIG_MPTCP_IPV6) static struct ipv6_pinfo *mptcp_inet6_sk(const struct sock *sk) { struct mptcp6_sock *msk6 = container_of(mptcp_sk(sk), struct mptcp6_sock, msk); return &msk6->np; } static void mptcp_copy_ip6_options(struct sock *newsk, const struct sock *sk) { const struct ipv6_pinfo *np = inet6_sk(sk); struct ipv6_txoptions *opt; struct ipv6_pinfo *newnp; newnp = inet6_sk(newsk); rcu_read_lock(); opt = rcu_dereference(np->opt); if (opt) { opt = ipv6_dup_options(newsk, opt); if (!opt) net_warn_ratelimited("%s: Failed to copy ip6 options\n", __func__); } RCU_INIT_POINTER(newnp->opt, opt); rcu_read_unlock(); } #endif static void mptcp_copy_ip_options(struct sock *newsk, const struct sock *sk) { struct ip_options_rcu *inet_opt, *newopt = NULL; const struct inet_sock *inet = inet_sk(sk); struct inet_sock *newinet; newinet = inet_sk(newsk); rcu_read_lock(); inet_opt = rcu_dereference(inet->inet_opt); if (inet_opt) { newopt = sock_kmemdup(newsk, inet_opt, sizeof(*inet_opt) + inet_opt->opt.optlen, GFP_ATOMIC); if (!newopt) net_warn_ratelimited("%s: Failed to copy ip options\n", __func__); } RCU_INIT_POINTER(newinet->inet_opt, newopt); rcu_read_unlock(); } struct sock *mptcp_sk_clone_init(const struct sock *sk, const struct mptcp_options_received *mp_opt, struct sock *ssk, struct request_sock *req) { struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req); struct sock *nsk = sk_clone_lock(sk, GFP_ATOMIC); struct mptcp_subflow_context *subflow; struct mptcp_sock *msk; if (!nsk) return NULL; #if IS_ENABLED(CONFIG_MPTCP_IPV6) if (nsk->sk_family == AF_INET6) inet_sk(nsk)->pinet6 = mptcp_inet6_sk(nsk); #endif __mptcp_init_sock(nsk); #if IS_ENABLED(CONFIG_MPTCP_IPV6) if (nsk->sk_family == AF_INET6) mptcp_copy_ip6_options(nsk, sk); else #endif mptcp_copy_ip_options(nsk, sk); msk = mptcp_sk(nsk); 
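/* inherit the MPC handshake state from the request socket: keys, * token, initial data sequence numbers and the peer's receive window */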
WRITE_ONCE(msk->local_key, subflow_req->local_key); WRITE_ONCE(msk->token, subflow_req->token); msk->in_accept_queue = 1; WRITE_ONCE(msk->fully_established, false); if (mp_opt->suboptions & OPTION_MPTCP_CSUMREQD) WRITE_ONCE(msk->csum_enabled, true); WRITE_ONCE(msk->write_seq, subflow_req->idsn + 1); WRITE_ONCE(msk->snd_nxt, msk->write_seq); WRITE_ONCE(msk->snd_una, msk->write_seq); WRITE_ONCE(msk->wnd_end, msk->snd_nxt + tcp_sk(ssk)->snd_wnd); msk->setsockopt_seq = mptcp_sk(sk)->setsockopt_seq; mptcp_init_sched(msk, mptcp_sk(sk)->sched); /* passive msk is created after the first/MPC subflow */ msk->subflow_id = 2; sock_reset_flag(nsk, SOCK_RCU_FREE); security_inet_csk_clone(nsk, req); /* this can't race with mptcp_close(), as the msk is * not yet exposted to user-space */ mptcp_set_state(nsk, TCP_ESTABLISHED); /* The msk maintain a ref to each subflow in the connections list */ WRITE_ONCE(msk->first, ssk); subflow = mptcp_subflow_ctx(ssk); list_add(&subflow->node, &msk->conn_list); sock_hold(ssk); /* new mpc subflow takes ownership of the newly * created mptcp socket */ mptcp_token_accept(subflow_req, msk); /* set msk addresses early to ensure mptcp_pm_get_local_id() * uses the correct data */ mptcp_copy_inaddrs(nsk, ssk); __mptcp_propagate_sndbuf(nsk, ssk); mptcp_rcv_space_init(msk, ssk); if (mp_opt->suboptions & OPTION_MPTCP_MPC_ACK) __mptcp_subflow_fully_established(msk, subflow, mp_opt); bh_unlock_sock(nsk); /* note: the newly allocated socket refcount is 2 now */ return nsk; } void mptcp_rcv_space_init(struct mptcp_sock *msk, const struct sock *ssk) { const struct tcp_sock *tp = tcp_sk(ssk); msk->rcvspace_init = 1; msk->rcvq_space.copied = 0; msk->rcvq_space.rtt_us = 0; msk->rcvq_space.time = tp->tcp_mstamp; /* initial rcv_space offering made to peer */ msk->rcvq_space.space = min_t(u32, tp->rcv_wnd, TCP_INIT_CWND * tp->advmss); if (msk->rcvq_space.space == 0) msk->rcvq_space.space = TCP_INIT_CWND * TCP_MSS_DEFAULT; } static void mptcp_destroy(struct sock *sk) { struct mptcp_sock *msk = mptcp_sk(sk); /* allow the following to close even the initial subflow */ msk->free_first = 1; mptcp_destroy_common(msk); sk_sockets_allocated_dec(sk); } void __mptcp_data_acked(struct sock *sk) { if (!sock_owned_by_user(sk)) __mptcp_clean_una(sk); else __set_bit(MPTCP_CLEAN_UNA, &mptcp_sk(sk)->cb_flags); } void __mptcp_check_push(struct sock *sk, struct sock *ssk) { if (!sock_owned_by_user(sk)) __mptcp_subflow_push_pending(sk, ssk, false); else __set_bit(MPTCP_PUSH_PENDING, &mptcp_sk(sk)->cb_flags); } #define MPTCP_FLAGS_PROCESS_CTX_NEED (BIT(MPTCP_PUSH_PENDING) | \ BIT(MPTCP_RETRANSMIT) | \ BIT(MPTCP_FLUSH_JOIN_LIST)) /* processes deferred events and flush wmem */ static void mptcp_release_cb(struct sock *sk) __must_hold(&sk->sk_lock.slock) { struct mptcp_sock *msk = mptcp_sk(sk); for (;;) { unsigned long flags = (msk->cb_flags & MPTCP_FLAGS_PROCESS_CTX_NEED); struct list_head join_list, skbs; bool spool_bl; u32 moved; spool_bl = mptcp_can_spool_backlog(sk, &skbs); if (!flags && !spool_bl) break; INIT_LIST_HEAD(&join_list); list_splice_init(&msk->join_list, &join_list); /* the following actions acquire the subflow socket lock * * 1) can't be invoked in atomic scope * 2) must avoid ABBA deadlock with msk socket spinlock: the RX * datapath acquires the msk socket spinlock while helding * the subflow socket lock */ msk->cb_flags &= ~flags; spin_unlock_bh(&sk->sk_lock.slock); if (flags & BIT(MPTCP_FLUSH_JOIN_LIST)) __mptcp_flush_join_list(sk, &join_list); if (flags & BIT(MPTCP_PUSH_PENDING)) 
__mptcp_push_pending(sk, 0); if (flags & BIT(MPTCP_RETRANSMIT)) __mptcp_retrans(sk); if (spool_bl && __mptcp_move_skbs(sk, &skbs, &moved)) { /* notify ack seq update */ mptcp_cleanup_rbuf(msk, 0); sk->sk_data_ready(sk); } cond_resched(); spin_lock_bh(&sk->sk_lock.slock); if (spool_bl) mptcp_backlog_spooled(sk, moved, &skbs); } if (__test_and_clear_bit(MPTCP_CLEAN_UNA, &msk->cb_flags)) __mptcp_clean_una_wakeup(sk); if (unlikely(msk->cb_flags)) { /* be sure to sync the msk state before taking actions * depending on sk_state (MPTCP_ERROR_REPORT) * On sk release avoid actions depending on the first subflow */ if (__test_and_clear_bit(MPTCP_SYNC_STATE, &msk->cb_flags) && msk->first) __mptcp_sync_state(sk, msk->pending_state); if (__test_and_clear_bit(MPTCP_ERROR_REPORT, &msk->cb_flags)) __mptcp_error_report(sk); if (__test_and_clear_bit(MPTCP_SYNC_SNDBUF, &msk->cb_flags)) __mptcp_sync_sndbuf(sk); } } /* MP_JOIN client subflow must wait for 4th ack before sending any data: * TCP can't schedule delack timer before the subflow is fully established. * MPTCP uses the delack timer to do 3rd ack retransmissions */ static void schedule_3rdack_retransmission(struct sock *ssk) { struct inet_connection_sock *icsk = inet_csk(ssk); struct tcp_sock *tp = tcp_sk(ssk); unsigned long timeout; if (READ_ONCE(mptcp_subflow_ctx(ssk)->fully_established)) return; /* reschedule with a timeout above RTT, as we must look only for drop */ if (tp->srtt_us) timeout = usecs_to_jiffies(tp->srtt_us >> (3 - 1)); else timeout = TCP_TIMEOUT_INIT; timeout += jiffies; WARN_ON_ONCE(icsk->icsk_ack.pending & ICSK_ACK_TIMER); smp_store_release(&icsk->icsk_ack.pending, icsk->icsk_ack.pending | ICSK_ACK_SCHED | ICSK_ACK_TIMER); sk_reset_timer(ssk, &icsk->icsk_delack_timer, timeout); } void mptcp_subflow_process_delegated(struct sock *ssk, long status) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); struct sock *sk = subflow->conn; if (status & BIT(MPTCP_DELEGATE_SEND)) { mptcp_data_lock(sk); if (!sock_owned_by_user(sk)) __mptcp_subflow_push_pending(sk, ssk, true); else __set_bit(MPTCP_PUSH_PENDING, &mptcp_sk(sk)->cb_flags); mptcp_data_unlock(sk); } if (status & BIT(MPTCP_DELEGATE_SNDBUF)) { mptcp_data_lock(sk); if (!sock_owned_by_user(sk)) __mptcp_sync_sndbuf(sk); else __set_bit(MPTCP_SYNC_SNDBUF, &mptcp_sk(sk)->cb_flags); mptcp_data_unlock(sk); } if (status & BIT(MPTCP_DELEGATE_ACK)) schedule_3rdack_retransmission(ssk); } static int mptcp_hash(struct sock *sk) { /* should never be called, * we hash the TCP subflows not the MPTCP socket */ WARN_ON_ONCE(1); return 0; } static void mptcp_unhash(struct sock *sk) { /* called from sk_common_release(), but nothing to do here */ } static int mptcp_get_port(struct sock *sk, unsigned short snum) { struct mptcp_sock *msk = mptcp_sk(sk); pr_debug("msk=%p, ssk=%p\n", msk, msk->first); if (WARN_ON_ONCE(!msk->first)) return -EINVAL; return inet_csk_get_port(msk->first, snum); } void mptcp_finish_connect(struct sock *ssk) { struct mptcp_subflow_context *subflow; struct mptcp_sock *msk; struct sock *sk; subflow = mptcp_subflow_ctx(ssk); sk = subflow->conn; msk = mptcp_sk(sk); pr_debug("msk=%p, token=%u\n", sk, subflow->token); subflow->map_seq = subflow->iasn; subflow->map_subflow_seq = 1; /* the socket is not connected yet, no msk/subflow ops can access/race * accessing the field below */ WRITE_ONCE(msk->local_key, subflow->local_key); mptcp_pm_new_connection(msk, ssk, 0); } void mptcp_sock_graft(struct sock *sk, struct socket *parent) { write_lock_bh(&sk->sk_callback_lock); 
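/* publish the parent wait queue and the socket backpointer under the * callback lock */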
rcu_assign_pointer(sk->sk_wq, &parent->wq); sk_set_socket(sk, parent); write_unlock_bh(&sk->sk_callback_lock); } /* Can be called without holding the msk socket lock; use the callback lock * to avoid {READ_,WRITE_}ONCE annotations on sk_socket. */ static void mptcp_sock_check_graft(struct sock *sk, struct sock *ssk) { struct socket *sock; write_lock_bh(&sk->sk_callback_lock); sock = sk->sk_socket; write_unlock_bh(&sk->sk_callback_lock); if (sock) { mptcp_sock_graft(ssk, sock); __mptcp_inherit_cgrp_data(sk, ssk); __mptcp_inherit_memcg(sk, ssk, GFP_ATOMIC); } } bool mptcp_finish_join(struct sock *ssk) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); struct mptcp_sock *msk = mptcp_sk(subflow->conn); struct sock *parent = (void *)msk; bool ret = true; pr_debug("msk=%p, subflow=%p\n", msk, subflow); /* mptcp socket already closing? */ if (!mptcp_is_fully_established(parent)) { subflow->reset_reason = MPTCP_RST_EMPTCP; return false; } /* Active subflow, already present inside the conn_list; is grafted * either by __mptcp_subflow_connect() or accept. */ if (!list_empty(&subflow->node)) { spin_lock_bh(&msk->fallback_lock); if (!msk->allow_subflows) { spin_unlock_bh(&msk->fallback_lock); return false; } mptcp_subflow_joined(msk, ssk); spin_unlock_bh(&msk->fallback_lock); mptcp_propagate_sndbuf(parent, ssk); return true; } if (!mptcp_pm_allow_new_subflow(msk)) { MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_JOINREJECTED); goto err_prohibited; } /* If we can't acquire msk socket lock here, let the release callback * handle it */ mptcp_data_lock(parent); if (!sock_owned_by_user(parent)) { ret = __mptcp_finish_join(msk, ssk); if (ret) { sock_hold(ssk); list_add_tail(&subflow->node, &msk->conn_list); mptcp_sock_check_graft(parent, ssk); } } else { sock_hold(ssk); list_add_tail(&subflow->node, &msk->join_list); __set_bit(MPTCP_FLUSH_JOIN_LIST, &msk->cb_flags); /* In case of later failures, __mptcp_flush_join_list() will * properly orphan the ssk via mptcp_close_ssk(). */ mptcp_sock_check_graft(parent, ssk); } mptcp_data_unlock(parent); if (!ret) { err_prohibited: subflow->reset_reason = MPTCP_RST_EPROHIBIT; return false; } return true; } static void mptcp_shutdown(struct sock *sk, int how) { pr_debug("sk=%p, how=%d\n", sk, how); if ((how & SEND_SHUTDOWN) && mptcp_close_state(sk)) __mptcp_wr_shutdown(sk); } static int mptcp_ioctl_outq(const struct mptcp_sock *msk, u64 v) { const struct sock *sk = (void *)msk; u64 delta; if (sk->sk_state == TCP_LISTEN) return -EINVAL; if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) return 0; delta = msk->write_seq - v; if (__mptcp_check_fallback(msk) && msk->first) { struct tcp_sock *tp = tcp_sk(msk->first); /* the first subflow is disconnected after close - see * __mptcp_close_ssk(). tcp_disconnect() moves the write_seq * so ignore that status, too. 
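 * For fallback sockets the not-yet-acked bytes live at the TCP level, * so the first subflow write_seq/snd_una gap is added to the MPTCP * level delta computed above.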
*/ if (!((1 << msk->first->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV | TCPF_CLOSE))) delta += READ_ONCE(tp->write_seq) - tp->snd_una; } if (delta > INT_MAX) delta = INT_MAX; return (int)delta; } static int mptcp_ioctl(struct sock *sk, int cmd, int *karg) { struct mptcp_sock *msk = mptcp_sk(sk); bool slow; switch (cmd) { case SIOCINQ: if (sk->sk_state == TCP_LISTEN) return -EINVAL; lock_sock(sk); if (mptcp_move_skbs(sk)) mptcp_cleanup_rbuf(msk, 0); *karg = mptcp_inq_hint(sk); release_sock(sk); break; case SIOCOUTQ: slow = lock_sock_fast(sk); *karg = mptcp_ioctl_outq(msk, READ_ONCE(msk->snd_una)); unlock_sock_fast(sk, slow); break; case SIOCOUTQNSD: slow = lock_sock_fast(sk); *karg = mptcp_ioctl_outq(msk, msk->snd_nxt); unlock_sock_fast(sk, slow); break; default: return -ENOIOCTLCMD; } return 0; } static int mptcp_connect(struct sock *sk, struct sockaddr_unsized *uaddr, int addr_len) { struct mptcp_subflow_context *subflow; struct mptcp_sock *msk = mptcp_sk(sk); int err = -EINVAL; struct sock *ssk; ssk = __mptcp_nmpc_sk(msk); if (IS_ERR(ssk)) return PTR_ERR(ssk); mptcp_set_state(sk, TCP_SYN_SENT); subflow = mptcp_subflow_ctx(ssk); #ifdef CONFIG_TCP_MD5SIG /* no MPTCP if MD5SIG is enabled on this socket or we may run out of * TCP option space. */ if (rcu_access_pointer(tcp_sk(ssk)->md5sig_info)) mptcp_early_fallback(msk, subflow, MPTCP_MIB_MD5SIGFALLBACK); #endif if (subflow->request_mptcp) { if (mptcp_active_should_disable(sk)) mptcp_early_fallback(msk, subflow, MPTCP_MIB_MPCAPABLEACTIVEDISABLED); else if (mptcp_token_new_connect(ssk) < 0) mptcp_early_fallback(msk, subflow, MPTCP_MIB_TOKENFALLBACKINIT); } WRITE_ONCE(msk->write_seq, subflow->idsn); WRITE_ONCE(msk->snd_nxt, subflow->idsn); WRITE_ONCE(msk->snd_una, subflow->idsn); if (likely(!__mptcp_check_fallback(msk))) MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPCAPABLEACTIVE); /* if reaching here via the fastopen/sendmsg path, the caller already * acquired the subflow socket lock, too. 
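 * msk->fastopening is set only on that path, so it doubles as the * 'subflow socket already locked' indicator here.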
*/ if (!msk->fastopening) lock_sock(ssk); /* the following mirrors closely a very small chunk of code from * __inet_stream_connect() */ if (ssk->sk_state != TCP_CLOSE) goto out; if (BPF_CGROUP_PRE_CONNECT_ENABLED(ssk)) { err = ssk->sk_prot->pre_connect(ssk, uaddr, addr_len); if (err) goto out; } err = ssk->sk_prot->connect(ssk, uaddr, addr_len); if (err < 0) goto out; inet_assign_bit(DEFER_CONNECT, sk, inet_test_bit(DEFER_CONNECT, ssk)); out: if (!msk->fastopening) release_sock(ssk); /* on successful connect, the msk state will be moved to established by * subflow_finish_connect() */ if (unlikely(err)) { /* avoid leaving a dangling token in an unconnected socket */ mptcp_token_destroy(msk); mptcp_set_state(sk, TCP_CLOSE); return err; } mptcp_copy_inaddrs(sk, ssk); return 0; } static struct proto mptcp_prot = { .name = "MPTCP", .owner = THIS_MODULE, .init = mptcp_init_sock, .connect = mptcp_connect, .disconnect = mptcp_disconnect, .close = mptcp_close, .setsockopt = mptcp_setsockopt, .getsockopt = mptcp_getsockopt, .shutdown = mptcp_shutdown, .destroy = mptcp_destroy, .sendmsg = mptcp_sendmsg, .ioctl = mptcp_ioctl, .recvmsg = mptcp_recvmsg, .release_cb = mptcp_release_cb, .hash = mptcp_hash, .unhash = mptcp_unhash, .get_port = mptcp_get_port, .stream_memory_free = mptcp_stream_memory_free, .sockets_allocated = &mptcp_sockets_allocated, .memory_allocated = &net_aligned_data.tcp_memory_allocated, .per_cpu_fw_alloc = &tcp_memory_per_cpu_fw_alloc, .memory_pressure = &tcp_memory_pressure, .sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_tcp_wmem), .sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_tcp_rmem), .sysctl_mem = sysctl_tcp_mem, .obj_size = sizeof(struct mptcp_sock), .slab_flags = SLAB_TYPESAFE_BY_RCU, .no_autobind = true, }; static int mptcp_bind(struct socket *sock, struct sockaddr_unsized *uaddr, int addr_len) { struct mptcp_sock *msk = mptcp_sk(sock->sk); struct sock *ssk, *sk = sock->sk; int err = -EINVAL; lock_sock(sk); ssk = __mptcp_nmpc_sk(msk); if (IS_ERR(ssk)) { err = PTR_ERR(ssk); goto unlock; } if (sk->sk_family == AF_INET) err = inet_bind_sk(ssk, uaddr, addr_len); #if IS_ENABLED(CONFIG_MPTCP_IPV6) else if (sk->sk_family == AF_INET6) err = inet6_bind_sk(ssk, uaddr, addr_len); #endif if (!err) mptcp_copy_inaddrs(sk, ssk); unlock: release_sock(sk); return err; } static int mptcp_listen(struct socket *sock, int backlog) { struct mptcp_sock *msk = mptcp_sk(sock->sk); struct sock *sk = sock->sk; struct sock *ssk; int err; pr_debug("msk=%p\n", msk); lock_sock(sk); err = -EINVAL; if (sock->state != SS_UNCONNECTED || sock->type != SOCK_STREAM) goto unlock; ssk = __mptcp_nmpc_sk(msk); if (IS_ERR(ssk)) { err = PTR_ERR(ssk); goto unlock; } mptcp_set_state(sk, TCP_LISTEN); sock_set_flag(sk, SOCK_RCU_FREE); lock_sock(ssk); err = __inet_listen_sk(ssk, backlog); release_sock(ssk); mptcp_set_state(sk, inet_sk_state_load(ssk)); if (!err) { sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); mptcp_copy_inaddrs(sk, ssk); mptcp_event_pm_listener(ssk, MPTCP_EVENT_LISTENER_CREATED); } unlock: release_sock(sk); return err; } static void mptcp_graft_subflows(struct sock *sk) { struct mptcp_subflow_context *subflow; struct mptcp_sock *msk = mptcp_sk(sk); if (mem_cgroup_sockets_enabled) { LIST_HEAD(join_list); /* Subflows joining after __inet_accept() will get the * mem CG properly initialized at mptcp_finish_join() time, * but subflows pending in join_list need explicit * initialization before flushing `backlog_unaccounted` * or MPTCP can later unexpectedly observe unaccounted memory. 
*/ mptcp_data_lock(sk); list_splice_init(&msk->join_list, &join_list); mptcp_data_unlock(sk); __mptcp_flush_join_list(sk, &join_list); } mptcp_for_each_subflow(msk, subflow) { struct sock *ssk = mptcp_subflow_tcp_sock(subflow); lock_sock(ssk); /* Set ssk->sk_socket of accept()ed flows to mptcp socket. * This is needed so NOSPACE flag can be set from tcp stack. */ if (!ssk->sk_socket) mptcp_sock_graft(ssk, sk->sk_socket); if (!mem_cgroup_sk_enabled(sk)) goto unlock; __mptcp_inherit_cgrp_data(sk, ssk); __mptcp_inherit_memcg(sk, ssk, GFP_KERNEL); unlock: release_sock(ssk); } if (mem_cgroup_sk_enabled(sk)) { gfp_t gfp = GFP_KERNEL | __GFP_NOFAIL; int amt; /* Account the backlog memory; prior accept() is aware of * fwd and rmem only. */ mptcp_data_lock(sk); amt = sk_mem_pages(sk->sk_forward_alloc + msk->backlog_unaccounted + atomic_read(&sk->sk_rmem_alloc)) - sk_mem_pages(sk->sk_forward_alloc + atomic_read(&sk->sk_rmem_alloc)); msk->backlog_unaccounted = 0; mptcp_data_unlock(sk); if (amt) mem_cgroup_sk_charge(sk, amt, gfp); } } static int mptcp_stream_accept(struct socket *sock, struct socket *newsock, struct proto_accept_arg *arg) { struct mptcp_sock *msk = mptcp_sk(sock->sk); struct sock *ssk, *newsk; pr_debug("msk=%p\n", msk); /* Buggy applications can call accept on socket states other then LISTEN * but no need to allocate the first subflow just to error out. */ ssk = READ_ONCE(msk->first); if (!ssk) return -EINVAL; pr_debug("ssk=%p, listener=%p\n", ssk, mptcp_subflow_ctx(ssk)); newsk = inet_csk_accept(ssk, arg); if (!newsk) return arg->err; pr_debug("newsk=%p, subflow is mptcp=%d\n", newsk, sk_is_mptcp(newsk)); if (sk_is_mptcp(newsk)) { struct mptcp_subflow_context *subflow; struct sock *new_mptcp_sock; subflow = mptcp_subflow_ctx(newsk); new_mptcp_sock = subflow->conn; /* is_mptcp should be false if subflow->conn is missing, see * subflow_syn_recv_sock() */ if (WARN_ON_ONCE(!new_mptcp_sock)) { tcp_sk(newsk)->is_mptcp = 0; goto tcpfallback; } newsk = new_mptcp_sock; MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_MPCAPABLEPASSIVEACK); newsk->sk_kern_sock = arg->kern; lock_sock(newsk); __inet_accept(sock, newsock, newsk); set_bit(SOCK_CUSTOM_SOCKOPT, &newsock->flags); msk = mptcp_sk(newsk); msk->in_accept_queue = 0; mptcp_graft_subflows(newsk); mptcp_rps_record_subflows(msk); /* Do late cleanup for the first subflow as necessary. Also * deal with bad peers not doing a complete shutdown. */ if (unlikely(inet_sk_state_load(msk->first) == TCP_CLOSE)) { if (unlikely(list_is_singular(&msk->conn_list))) mptcp_set_state(newsk, TCP_CLOSE); mptcp_close_ssk(newsk, msk->first, mptcp_subflow_ctx(msk->first)); } } else { tcpfallback: newsk->sk_kern_sock = arg->kern; lock_sock(newsk); __inet_accept(sock, newsock, newsk); /* we are being invoked after accepting a non-mp-capable * flow: sk is a tcp_sk, not an mptcp one. * * Hand the socket over to tcp so all further socket ops * bypass mptcp. 
*/ WRITE_ONCE(newsock->sk->sk_socket->ops, mptcp_fallback_tcp_ops(newsock->sk)); } release_sock(newsk); return 0; } static __poll_t mptcp_check_writeable(struct mptcp_sock *msk) { struct sock *sk = (struct sock *)msk; if (__mptcp_stream_is_writeable(sk, 1)) return EPOLLOUT | EPOLLWRNORM; set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); smp_mb__after_atomic(); /* NOSPACE is changed by mptcp_write_space() */ if (__mptcp_stream_is_writeable(sk, 1)) return EPOLLOUT | EPOLLWRNORM; return 0; } static __poll_t mptcp_poll(struct file *file, struct socket *sock, struct poll_table_struct *wait) { struct sock *sk = sock->sk; struct mptcp_sock *msk; __poll_t mask = 0; u8 shutdown; int state; msk = mptcp_sk(sk); sock_poll_wait(file, sock, wait); state = inet_sk_state_load(sk); pr_debug("msk=%p state=%d flags=%lx\n", msk, state, msk->flags); if (state == TCP_LISTEN) { struct sock *ssk = READ_ONCE(msk->first); if (WARN_ON_ONCE(!ssk)) return 0; return inet_csk_listen_poll(ssk); } shutdown = READ_ONCE(sk->sk_shutdown); if (shutdown == SHUTDOWN_MASK || state == TCP_CLOSE) mask |= EPOLLHUP; if (shutdown & RCV_SHUTDOWN) mask |= EPOLLIN | EPOLLRDNORM | EPOLLRDHUP; if (state != TCP_SYN_SENT && state != TCP_SYN_RECV) { mask |= mptcp_check_readable(sk); if (shutdown & SEND_SHUTDOWN) mask |= EPOLLOUT | EPOLLWRNORM; else mask |= mptcp_check_writeable(msk); } else if (state == TCP_SYN_SENT && inet_test_bit(DEFER_CONNECT, sk)) { /* cf tcp_poll() note about TFO */ mask |= EPOLLOUT | EPOLLWRNORM; } /* This barrier is coupled with smp_wmb() in __mptcp_error_report() */ smp_rmb(); if (READ_ONCE(sk->sk_err)) mask |= EPOLLERR; return mask; } static const struct proto_ops mptcp_stream_ops = { .family = PF_INET, .owner = THIS_MODULE, .release = inet_release, .bind = mptcp_bind, .connect = inet_stream_connect, .socketpair = sock_no_socketpair, .accept = mptcp_stream_accept, .getname = inet_getname, .poll = mptcp_poll, .ioctl = inet_ioctl, .gettstamp = sock_gettstamp, .listen = mptcp_listen, .shutdown = inet_shutdown, .setsockopt = sock_common_setsockopt, .getsockopt = sock_common_getsockopt, .sendmsg = inet_sendmsg, .recvmsg = inet_recvmsg, .mmap = sock_no_mmap, .set_rcvlowat = mptcp_set_rcvlowat, }; static struct inet_protosw mptcp_protosw = { .type = SOCK_STREAM, .protocol = IPPROTO_MPTCP, .prot = &mptcp_prot, .ops = &mptcp_stream_ops, .flags = INET_PROTOSW_ICSK, }; static int mptcp_napi_poll(struct napi_struct *napi, int budget) { struct mptcp_delegated_action *delegated; struct mptcp_subflow_context *subflow; int work_done = 0; delegated = container_of(napi, struct mptcp_delegated_action, napi); while ((subflow = mptcp_subflow_delegated_next(delegated)) != NULL) { struct sock *ssk = mptcp_subflow_tcp_sock(subflow); bh_lock_sock_nested(ssk); if (!sock_owned_by_user(ssk)) { mptcp_subflow_process_delegated(ssk, xchg(&subflow->delegated_status, 0)); } else { /* tcp_release_cb_override already processed * the action or will do at next release_sock(). * In both case must dequeue the subflow here - on the same * CPU that scheduled it. 
*/ smp_wmb(); clear_bit(MPTCP_DELEGATE_SCHEDULED, &subflow->delegated_status); } bh_unlock_sock(ssk); sock_put(ssk); if (++work_done == budget) return budget; } /* always provide a 0 'work_done' argument, so that napi_complete_done * will not try accessing the NULL napi->dev ptr */ napi_complete_done(napi, 0); return work_done; } void __init mptcp_proto_init(void) { struct mptcp_delegated_action *delegated; int cpu; mptcp_prot.h.hashinfo = tcp_prot.h.hashinfo; if (percpu_counter_init(&mptcp_sockets_allocated, 0, GFP_KERNEL)) panic("Failed to allocate MPTCP pcpu counter\n"); mptcp_napi_dev = alloc_netdev_dummy(0); if (!mptcp_napi_dev) panic("Failed to allocate MPTCP dummy netdev\n"); for_each_possible_cpu(cpu) { delegated = per_cpu_ptr(&mptcp_delegated_actions, cpu); INIT_LIST_HEAD(&delegated->head); netif_napi_add_tx(mptcp_napi_dev, &delegated->napi, mptcp_napi_poll); napi_enable(&delegated->napi); } mptcp_subflow_init(); mptcp_pm_init(); mptcp_sched_init(); mptcp_token_init(); if (proto_register(&mptcp_prot, 1) != 0) panic("Failed to register MPTCP proto.\n"); inet_register_protosw(&mptcp_protosw); BUILD_BUG_ON(sizeof(struct mptcp_skb_cb) > sizeof_field(struct sk_buff, cb)); } #if IS_ENABLED(CONFIG_MPTCP_IPV6) static const struct proto_ops mptcp_v6_stream_ops = { .family = PF_INET6, .owner = THIS_MODULE, .release = inet6_release, .bind = mptcp_bind, .connect = inet_stream_connect, .socketpair = sock_no_socketpair, .accept = mptcp_stream_accept, .getname = inet6_getname, .poll = mptcp_poll, .ioctl = inet6_ioctl, .gettstamp = sock_gettstamp, .listen = mptcp_listen, .shutdown = inet_shutdown, .setsockopt = sock_common_setsockopt, .getsockopt = sock_common_getsockopt, .sendmsg = inet6_sendmsg, .recvmsg = inet6_recvmsg, .mmap = sock_no_mmap, #ifdef CONFIG_COMPAT .compat_ioctl = inet6_compat_ioctl, #endif .set_rcvlowat = mptcp_set_rcvlowat, }; static struct proto mptcp_v6_prot; static struct inet_protosw mptcp_v6_protosw = { .type = SOCK_STREAM, .protocol = IPPROTO_MPTCP, .prot = &mptcp_v6_prot, .ops = &mptcp_v6_stream_ops, .flags = INET_PROTOSW_ICSK, }; int __init mptcp_proto_v6_init(void) { int err; mptcp_v6_prot = mptcp_prot; strscpy(mptcp_v6_prot.name, "MPTCPv6", sizeof(mptcp_v6_prot.name)); mptcp_v6_prot.slab = NULL; mptcp_v6_prot.obj_size = sizeof(struct mptcp6_sock); mptcp_v6_prot.ipv6_pinfo_offset = offsetof(struct mptcp6_sock, np); err = proto_register(&mptcp_v6_prot, 1); if (err) return err; err = inet6_register_protosw(&mptcp_v6_protosw); if (err) proto_unregister(&mptcp_v6_prot); return err; } #endif
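For reference, a minimal userspace sketch of driving the socket API implemented above: it creates an MPTCP socket with IPPROTO_MPTCP, connects, and queries the SIOCOUTQ/SIOCINQ ioctls that mptcp_ioctl() serves. The server address 192.0.2.1:8080 and the payload are placeholders, and the fallback definition of IPPROTO_MPTCP (262) is only needed when the userspace headers are too old to provide it.

#include <stdio.h>
#include <unistd.h>
#include <sys/socket.h>
#include <sys/ioctl.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <linux/sockios.h>

#ifndef IPPROTO_MPTCP
#define IPPROTO_MPTCP 262	/* provided by recent <netinet/in.h> */
#endif

int main(void)
{
	struct sockaddr_in addr = {
		.sin_family = AF_INET,
		.sin_port = htons(8080),	/* placeholder port */
	};
	int fd, unsent = 0, unread = 0;

	inet_pton(AF_INET, "192.0.2.1", &addr.sin_addr);	/* placeholder address */

	/* IPPROTO_MPTCP selects the mptcp_prot / mptcp_stream_ops registered above */
	fd = socket(AF_INET, SOCK_STREAM, IPPROTO_MPTCP);
	if (fd < 0 || connect(fd, (struct sockaddr *)&addr, sizeof(addr)) < 0) {
		perror("mptcp connect");
		return 1;
	}

	write(fd, "hello", 5);

	/* served by mptcp_ioctl(): bytes not yet acked vs. bytes not yet read */
	ioctl(fd, SIOCOUTQ, &unsent);
	ioctl(fd, SIOCINQ, &unread);
	printf("unsent=%d unread=%d\n", unsent, unread);

	close(fd);
	return 0;
}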
// SPDX-License-Identifier: GPL-2.0 /* Copyright (c) 2011-2015 PLUMgrid, http://plumgrid.com * Copyright (c) 2016 Facebook */ #include <linux/kernel.h> #include <linux/types.h> #include <linux/slab.h> #include <linux/bpf.h> #include <linux/bpf_verifier.h> #include
<linux/bpf_perf_event.h> #include <linux/btf.h> #include <linux/filter.h> #include <linux/uaccess.h> #include <linux/ctype.h> #include <linux/kprobes.h> #include <linux/spinlock.h> #include <linux/syscalls.h> #include <linux/error-injection.h> #include <linux/btf_ids.h> #include <linux/bpf_lsm.h> #include <linux/fprobe.h> #include <linux/bsearch.h> #include <linux/sort.h> #include <linux/key.h> #include <linux/namei.h> #include <net/bpf_sk_storage.h> #include <uapi/linux/bpf.h> #include <uapi/linux/btf.h> #include <asm/tlb.h> #include "trace_probe.h" #include "trace.h" #define CREATE_TRACE_POINTS #include "bpf_trace.h" #define bpf_event_rcu_dereference(p) \ rcu_dereference_protected(p, lockdep_is_held(&bpf_event_mutex)) #define MAX_UPROBE_MULTI_CNT (1U << 20) #define MAX_KPROBE_MULTI_CNT (1U << 20) #ifdef CONFIG_MODULES struct bpf_trace_module { struct module *module; struct list_head list; }; static LIST_HEAD(bpf_trace_modules); static DEFINE_MUTEX(bpf_module_mutex); static struct bpf_raw_event_map *bpf_get_raw_tracepoint_module(const char *name) { struct bpf_raw_event_map *btp, *ret = NULL; struct bpf_trace_module *btm; unsigned int i; mutex_lock(&bpf_module_mutex); list_for_each_entry(btm, &bpf_trace_modules, list) { for (i = 0; i < btm->module->num_bpf_raw_events; ++i) { btp = &btm->module->bpf_raw_events[i]; if (!strcmp(btp->tp->name, name)) { if (try_module_get(btm->module)) ret = btp; goto out; } } } out: mutex_unlock(&bpf_module_mutex); return ret; } #else static struct bpf_raw_event_map *bpf_get_raw_tracepoint_module(const char *name) { return NULL; } #endif /* CONFIG_MODULES */ u64 bpf_get_stackid(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5); u64 bpf_get_stack(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5); static int bpf_btf_printf_prepare(struct btf_ptr *ptr, u32 btf_ptr_size, u64 flags, const struct btf **btf, s32 *btf_id); static u64 bpf_kprobe_multi_cookie(struct bpf_run_ctx *ctx); static u64 bpf_kprobe_multi_entry_ip(struct bpf_run_ctx *ctx); static u64 bpf_uprobe_multi_cookie(struct bpf_run_ctx *ctx); static u64 bpf_uprobe_multi_entry_ip(struct bpf_run_ctx *ctx); /** * trace_call_bpf - invoke BPF program * @call: tracepoint event * @ctx: opaque context pointer * * kprobe handlers execute BPF programs via this helper. * Can be used from static tracepoints in the future. * * Return: BPF programs always return an integer which is interpreted by * kprobe handler as: * 0 - return from kprobe (event is filtered out) * 1 - store kprobe event into ring buffer * Other values are reserved and currently alias to 1 */ unsigned int trace_call_bpf(struct trace_event_call *call, void *ctx) { unsigned int ret; cant_sleep(); if (unlikely(__this_cpu_inc_return(bpf_prog_active) != 1)) { /* * since some bpf program is already running on this cpu, * don't call into another bpf program (same or different) * and don't send kprobe event into ring-buffer, * so return zero here */ rcu_read_lock(); bpf_prog_inc_misses_counters(rcu_dereference(call->prog_array)); rcu_read_unlock(); ret = 0; goto out; } /* * Instead of moving rcu_read_lock/rcu_dereference/rcu_read_unlock * to all call sites, we did a bpf_prog_array_valid() there to check * whether call->prog_array is empty or not, which is * a heuristic to speed up execution. * * If bpf_prog_array_valid() fetched prog_array was * non-NULL, we go into trace_call_bpf() and do the actual * proper rcu_dereference() under RCU lock. * If it turns out that prog_array is NULL then, we bail out. 
* For the opposite, if the bpf_prog_array_valid() fetched pointer * was NULL, you'll skip the prog_array with the risk of missing * out of events when it was updated in between this and the * rcu_dereference() which is accepted risk. */ rcu_read_lock(); ret = bpf_prog_run_array(rcu_dereference(call->prog_array), ctx, bpf_prog_run); rcu_read_unlock(); out: __this_cpu_dec(bpf_prog_active); return ret; } #ifdef CONFIG_BPF_KPROBE_OVERRIDE BPF_CALL_2(bpf_override_return, struct pt_regs *, regs, unsigned long, rc) { regs_set_return_value(regs, rc); override_function_with_return(regs); return 0; } static const struct bpf_func_proto bpf_override_return_proto = { .func = bpf_override_return, .gpl_only = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, }; #endif static __always_inline int bpf_probe_read_user_common(void *dst, u32 size, const void __user *unsafe_ptr) { int ret; ret = copy_from_user_nofault(dst, unsafe_ptr, size); if (unlikely(ret < 0)) memset(dst, 0, size); return ret; } BPF_CALL_3(bpf_probe_read_user, void *, dst, u32, size, const void __user *, unsafe_ptr) { return bpf_probe_read_user_common(dst, size, unsafe_ptr); } const struct bpf_func_proto bpf_probe_read_user_proto = { .func = bpf_probe_read_user, .gpl_only = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_UNINIT_MEM, .arg2_type = ARG_CONST_SIZE_OR_ZERO, .arg3_type = ARG_ANYTHING, }; static __always_inline int bpf_probe_read_user_str_common(void *dst, u32 size, const void __user *unsafe_ptr) { int ret; /* * NB: We rely on strncpy_from_user() not copying junk past the NUL * terminator into `dst`. * * strncpy_from_user() does long-sized strides in the fast path. If the * strncpy does not mask out the bytes after the NUL in `unsafe_ptr`, * then there could be junk after the NUL in `dst`. If user takes `dst` * and keys a hash map with it, then semantically identical strings can * occupy multiple entries in the map. */ ret = strncpy_from_user_nofault(dst, unsafe_ptr, size); if (unlikely(ret < 0)) memset(dst, 0, size); return ret; } BPF_CALL_3(bpf_probe_read_user_str, void *, dst, u32, size, const void __user *, unsafe_ptr) { return bpf_probe_read_user_str_common(dst, size, unsafe_ptr); } const struct bpf_func_proto bpf_probe_read_user_str_proto = { .func = bpf_probe_read_user_str, .gpl_only = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_UNINIT_MEM, .arg2_type = ARG_CONST_SIZE_OR_ZERO, .arg3_type = ARG_ANYTHING, }; BPF_CALL_3(bpf_probe_read_kernel, void *, dst, u32, size, const void *, unsafe_ptr) { return bpf_probe_read_kernel_common(dst, size, unsafe_ptr); } const struct bpf_func_proto bpf_probe_read_kernel_proto = { .func = bpf_probe_read_kernel, .gpl_only = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_UNINIT_MEM, .arg2_type = ARG_CONST_SIZE_OR_ZERO, .arg3_type = ARG_ANYTHING, }; static __always_inline int bpf_probe_read_kernel_str_common(void *dst, u32 size, const void *unsafe_ptr) { int ret; /* * The strncpy_from_kernel_nofault() call will likely not fill the * entire buffer, but that's okay in this circumstance as we're probing * arbitrary memory anyway similar to bpf_probe_read_*() and might * as well probe the stack. Thus, memory is explicitly cleared * only in error case, so that improper users ignoring return * code altogether don't copy garbage; otherwise length of string * is returned that can be used for bpf_perf_event_output() et al. 
*/ ret = strncpy_from_kernel_nofault(dst, unsafe_ptr, size); if (unlikely(ret < 0)) memset(dst, 0, size); return ret; } BPF_CALL_3(bpf_probe_read_kernel_str, void *, dst, u32, size, const void *, unsafe_ptr) { return bpf_probe_read_kernel_str_common(dst, size, unsafe_ptr); } const struct bpf_func_proto bpf_probe_read_kernel_str_proto = { .func = bpf_probe_read_kernel_str, .gpl_only = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_UNINIT_MEM, .arg2_type = ARG_CONST_SIZE_OR_ZERO, .arg3_type = ARG_ANYTHING, }; #ifdef CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE BPF_CALL_3(bpf_probe_read_compat, void *, dst, u32, size, const void *, unsafe_ptr) { if ((unsigned long)unsafe_ptr < TASK_SIZE) { return bpf_probe_read_user_common(dst, size, (__force void __user *)unsafe_ptr); } return bpf_probe_read_kernel_common(dst, size, unsafe_ptr); } static const struct bpf_func_proto bpf_probe_read_compat_proto = { .func = bpf_probe_read_compat, .gpl_only = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_UNINIT_MEM, .arg2_type = ARG_CONST_SIZE_OR_ZERO, .arg3_type = ARG_ANYTHING, }; BPF_CALL_3(bpf_probe_read_compat_str, void *, dst, u32, size, const void *, unsafe_ptr) { if ((unsigned long)unsafe_ptr < TASK_SIZE) { return bpf_probe_read_user_str_common(dst, size, (__force void __user *)unsafe_ptr); } return bpf_probe_read_kernel_str_common(dst, size, unsafe_ptr); } static const struct bpf_func_proto bpf_probe_read_compat_str_proto = { .func = bpf_probe_read_compat_str, .gpl_only = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_UNINIT_MEM, .arg2_type = ARG_CONST_SIZE_OR_ZERO, .arg3_type = ARG_ANYTHING, }; #endif /* CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE */ BPF_CALL_3(bpf_probe_write_user, void __user *, unsafe_ptr, const void *, src, u32, size) { /* * Ensure we're in user context which is safe for the helper to * run. This helper has no business in a kthread. * * access_ok() should prevent writing to non-user memory, but in * some situations (nommu, temporary switch, etc) access_ok() does * not provide enough validation, hence the check on KERNEL_DS. * * nmi_uaccess_okay() ensures the probe is not run in an interim * state, when the task or mm are switched. This is specifically * required to prevent the use of temporary mm. 
*/ if (unlikely(in_interrupt() || current->flags & (PF_KTHREAD | PF_EXITING))) return -EPERM; if (unlikely(!nmi_uaccess_okay())) return -EPERM; return copy_to_user_nofault(unsafe_ptr, src, size); } static const struct bpf_func_proto bpf_probe_write_user_proto = { .func = bpf_probe_write_user, .gpl_only = true, .ret_type = RET_INTEGER, .arg1_type = ARG_ANYTHING, .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE, }; #define MAX_TRACE_PRINTK_VARARGS 3 #define BPF_TRACE_PRINTK_SIZE 1024 BPF_CALL_5(bpf_trace_printk, char *, fmt, u32, fmt_size, u64, arg1, u64, arg2, u64, arg3) { u64 args[MAX_TRACE_PRINTK_VARARGS] = { arg1, arg2, arg3 }; struct bpf_bprintf_data data = { .get_bin_args = true, .get_buf = true, }; int ret; ret = bpf_bprintf_prepare(fmt, fmt_size, args, MAX_TRACE_PRINTK_VARARGS, &data); if (ret < 0) return ret; ret = bstr_printf(data.buf, MAX_BPRINTF_BUF, fmt, data.bin_args); trace_bpf_trace_printk(data.buf); bpf_bprintf_cleanup(&data); return ret; } static const struct bpf_func_proto bpf_trace_printk_proto = { .func = bpf_trace_printk, .gpl_only = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg2_type = ARG_CONST_SIZE, }; static void __set_printk_clr_event(struct work_struct *work) { /* * This program might be calling bpf_trace_printk, * so enable the associated bpf_trace/bpf_trace_printk event. * Repeat this each time as it is possible a user has * disabled bpf_trace_printk events. By loading a program * calling bpf_trace_printk() however the user has expressed * the intent to see such events. */ if (trace_set_clr_event("bpf_trace", "bpf_trace_printk", 1)) pr_warn_ratelimited("could not enable bpf_trace_printk events"); } static DECLARE_WORK(set_printk_work, __set_printk_clr_event); const struct bpf_func_proto *bpf_get_trace_printk_proto(void) { schedule_work(&set_printk_work); return &bpf_trace_printk_proto; } BPF_CALL_4(bpf_trace_vprintk, char *, fmt, u32, fmt_size, const void *, args, u32, data_len) { struct bpf_bprintf_data data = { .get_bin_args = true, .get_buf = true, }; int ret, num_args; if (data_len & 7 || data_len > MAX_BPRINTF_VARARGS * 8 || (data_len && !args)) return -EINVAL; num_args = data_len / 8; ret = bpf_bprintf_prepare(fmt, fmt_size, args, num_args, &data); if (ret < 0) return ret; ret = bstr_printf(data.buf, MAX_BPRINTF_BUF, fmt, data.bin_args); trace_bpf_trace_printk(data.buf); bpf_bprintf_cleanup(&data); return ret; } static const struct bpf_func_proto bpf_trace_vprintk_proto = { .func = bpf_trace_vprintk, .gpl_only = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg2_type = ARG_CONST_SIZE, .arg3_type = ARG_PTR_TO_MEM | PTR_MAYBE_NULL | MEM_RDONLY, .arg4_type = ARG_CONST_SIZE_OR_ZERO, }; const struct bpf_func_proto *bpf_get_trace_vprintk_proto(void) { schedule_work(&set_printk_work); return &bpf_trace_vprintk_proto; } BPF_CALL_5(bpf_seq_printf, struct seq_file *, m, char *, fmt, u32, fmt_size, const void *, args, u32, data_len) { struct bpf_bprintf_data data = { .get_bin_args = true, }; int err, num_args; if (data_len & 7 || data_len > MAX_BPRINTF_VARARGS * 8 || (data_len && !args)) return -EINVAL; num_args = data_len / 8; err = bpf_bprintf_prepare(fmt, fmt_size, args, num_args, &data); if (err < 0) return err; seq_bprintf(m, fmt, data.bin_args); bpf_bprintf_cleanup(&data); return seq_has_overflowed(m) ? 
-EOVERFLOW : 0; } BTF_ID_LIST_SINGLE(btf_seq_file_ids, struct, seq_file) static const struct bpf_func_proto bpf_seq_printf_proto = { .func = bpf_seq_printf, .gpl_only = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_BTF_ID, .arg1_btf_id = &btf_seq_file_ids[0], .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE, .arg4_type = ARG_PTR_TO_MEM | PTR_MAYBE_NULL | MEM_RDONLY, .arg5_type = ARG_CONST_SIZE_OR_ZERO, }; BPF_CALL_3(bpf_seq_write, struct seq_file *, m, const void *, data, u32, len) { return seq_write(m, data, len) ? -EOVERFLOW : 0; } static const struct bpf_func_proto bpf_seq_write_proto = { .func = bpf_seq_write, .gpl_only = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_BTF_ID, .arg1_btf_id = &btf_seq_file_ids[0], .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE_OR_ZERO, }; BPF_CALL_4(bpf_seq_printf_btf, struct seq_file *, m, struct btf_ptr *, ptr, u32, btf_ptr_size, u64, flags) { const struct btf *btf; s32 btf_id; int ret; ret = bpf_btf_printf_prepare(ptr, btf_ptr_size, flags, &btf, &btf_id); if (ret) return ret; return btf_type_seq_show_flags(btf, btf_id, ptr->ptr, m, flags); } static const struct bpf_func_proto bpf_seq_printf_btf_proto = { .func = bpf_seq_printf_btf, .gpl_only = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_BTF_ID, .arg1_btf_id = &btf_seq_file_ids[0], .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE_OR_ZERO, .arg4_type = ARG_ANYTHING, }; static __always_inline int get_map_perf_counter(struct bpf_map *map, u64 flags, u64 *value, u64 *enabled, u64 *running) { struct bpf_array *array = container_of(map, struct bpf_array, map); unsigned int cpu = smp_processor_id(); u64 index = flags & BPF_F_INDEX_MASK; struct bpf_event_entry *ee; if (unlikely(flags & ~(BPF_F_INDEX_MASK))) return -EINVAL; if (index == BPF_F_CURRENT_CPU) index = cpu; if (unlikely(index >= array->map.max_entries)) return -E2BIG; ee = READ_ONCE(array->ptrs[index]); if (!ee) return -ENOENT; return perf_event_read_local(ee->event, value, enabled, running); } BPF_CALL_2(bpf_perf_event_read, struct bpf_map *, map, u64, flags) { u64 value = 0; int err; err = get_map_perf_counter(map, flags, &value, NULL, NULL); /* * this api is ugly since we miss [-22..-2] range of valid * counter values, but that's uapi */ if (err) return err; return value; } const struct bpf_func_proto bpf_perf_event_read_proto = { .func = bpf_perf_event_read, .gpl_only = true, .ret_type = RET_INTEGER, .arg1_type = ARG_CONST_MAP_PTR, .arg2_type = ARG_ANYTHING, }; BPF_CALL_4(bpf_perf_event_read_value, struct bpf_map *, map, u64, flags, struct bpf_perf_event_value *, buf, u32, size) { int err = -EINVAL; if (unlikely(size != sizeof(struct bpf_perf_event_value))) goto clear; err = get_map_perf_counter(map, flags, &buf->counter, &buf->enabled, &buf->running); if (unlikely(err)) goto clear; return 0; clear: memset(buf, 0, size); return err; } static const struct bpf_func_proto bpf_perf_event_read_value_proto = { .func = bpf_perf_event_read_value, .gpl_only = true, .ret_type = RET_INTEGER, .arg1_type = ARG_CONST_MAP_PTR, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_PTR_TO_UNINIT_MEM, .arg4_type = ARG_CONST_SIZE, }; const struct bpf_func_proto *bpf_get_perf_event_read_value_proto(void) { return &bpf_perf_event_read_value_proto; } static __always_inline u64 __bpf_perf_event_output(struct pt_regs *regs, struct bpf_map *map, u64 flags, struct perf_raw_record *raw, struct perf_sample_data *sd) { struct bpf_array *array = container_of(map, struct bpf_array, map); 
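/* Look up the perf event stashed in the array slot selected by flags & BPF_F_INDEX_MASK (BPF_F_CURRENT_CPU resolves to this CPU), verify it is a PERF_COUNT_SW_BPF_OUTPUT event bound to this CPU, then emit the raw record via perf_event_output(). */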
unsigned int cpu = smp_processor_id(); u64 index = flags & BPF_F_INDEX_MASK; struct bpf_event_entry *ee; struct perf_event *event; if (index == BPF_F_CURRENT_CPU) index = cpu; if (unlikely(index >= array->map.max_entries)) return -E2BIG; ee = READ_ONCE(array->ptrs[index]); if (!ee) return -ENOENT; event = ee->event; if (unlikely(event->attr.type != PERF_TYPE_SOFTWARE || event->attr.config != PERF_COUNT_SW_BPF_OUTPUT)) return -EINVAL; if (unlikely(event->oncpu != cpu)) return -EOPNOTSUPP; perf_sample_save_raw_data(sd, event, raw); return perf_event_output(event, sd, regs); } /* * Support executing tracepoints in normal, irq, and nmi context that each call * bpf_perf_event_output */ struct bpf_trace_sample_data { struct perf_sample_data sds[3]; }; static DEFINE_PER_CPU(struct bpf_trace_sample_data, bpf_trace_sds); static DEFINE_PER_CPU(int, bpf_trace_nest_level); BPF_CALL_5(bpf_perf_event_output, struct pt_regs *, regs, struct bpf_map *, map, u64, flags, void *, data, u64, size) { struct bpf_trace_sample_data *sds; struct perf_raw_record raw = { .frag = { .size = size, .data = data, }, }; struct perf_sample_data *sd; int nest_level, err; preempt_disable(); sds = this_cpu_ptr(&bpf_trace_sds); nest_level = this_cpu_inc_return(bpf_trace_nest_level); if (WARN_ON_ONCE(nest_level > ARRAY_SIZE(sds->sds))) { err = -EBUSY; goto out; } sd = &sds->sds[nest_level - 1]; if (unlikely(flags & ~(BPF_F_INDEX_MASK))) { err = -EINVAL; goto out; } perf_sample_data_init(sd, 0, 0); err = __bpf_perf_event_output(regs, map, flags, &raw, sd); out: this_cpu_dec(bpf_trace_nest_level); preempt_enable(); return err; } static const struct bpf_func_proto bpf_perf_event_output_proto = { .func = bpf_perf_event_output, .gpl_only = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_CONST_MAP_PTR, .arg3_type = ARG_ANYTHING, .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg5_type = ARG_CONST_SIZE_OR_ZERO, }; static DEFINE_PER_CPU(int, bpf_event_output_nest_level); struct bpf_nested_pt_regs { struct pt_regs regs[3]; }; static DEFINE_PER_CPU(struct bpf_nested_pt_regs, bpf_pt_regs); static DEFINE_PER_CPU(struct bpf_trace_sample_data, bpf_misc_sds); u64 bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size, void *ctx, u64 ctx_size, bpf_ctx_copy_t ctx_copy) { struct perf_raw_frag frag = { .copy = ctx_copy, .size = ctx_size, .data = ctx, }; struct perf_raw_record raw = { .frag = { { .next = ctx_size ? 
&frag : NULL, }, .size = meta_size, .data = meta, }, }; struct perf_sample_data *sd; struct pt_regs *regs; int nest_level; u64 ret; preempt_disable(); nest_level = this_cpu_inc_return(bpf_event_output_nest_level); if (WARN_ON_ONCE(nest_level > ARRAY_SIZE(bpf_misc_sds.sds))) { ret = -EBUSY; goto out; } sd = this_cpu_ptr(&bpf_misc_sds.sds[nest_level - 1]); regs = this_cpu_ptr(&bpf_pt_regs.regs[nest_level - 1]); perf_fetch_caller_regs(regs); perf_sample_data_init(sd, 0, 0); ret = __bpf_perf_event_output(regs, map, flags, &raw, sd); out: this_cpu_dec(bpf_event_output_nest_level); preempt_enable(); return ret; } BPF_CALL_0(bpf_get_current_task) { return (long) current; } const struct bpf_func_proto bpf_get_current_task_proto = { .func = bpf_get_current_task, .gpl_only = true, .ret_type = RET_INTEGER, }; BPF_CALL_0(bpf_get_current_task_btf) { return (unsigned long) current; } const struct bpf_func_proto bpf_get_current_task_btf_proto = { .func = bpf_get_current_task_btf, .gpl_only = true, .ret_type = RET_PTR_TO_BTF_ID_TRUSTED, .ret_btf_id = &btf_tracing_ids[BTF_TRACING_TYPE_TASK], }; BPF_CALL_1(bpf_task_pt_regs, struct task_struct *, task) { return (unsigned long) task_pt_regs(task); } BTF_ID_LIST_SINGLE(bpf_task_pt_regs_ids, struct, pt_regs) const struct bpf_func_proto bpf_task_pt_regs_proto = { .func = bpf_task_pt_regs, .gpl_only = true, .arg1_type = ARG_PTR_TO_BTF_ID, .arg1_btf_id = &btf_tracing_ids[BTF_TRACING_TYPE_TASK], .ret_type = RET_PTR_TO_BTF_ID, .ret_btf_id = &bpf_task_pt_regs_ids[0], }; struct send_signal_irq_work { struct irq_work irq_work; struct task_struct *task; u32 sig; enum pid_type type; bool has_siginfo; struct kernel_siginfo info; }; static DEFINE_PER_CPU(struct send_signal_irq_work, send_signal_work); static void do_bpf_send_signal(struct irq_work *entry) { struct send_signal_irq_work *work; struct kernel_siginfo *siginfo; work = container_of(entry, struct send_signal_irq_work, irq_work); siginfo = work->has_siginfo ? &work->info : SEND_SIG_PRIV; group_send_sig_info(work->sig, siginfo, work->task, work->type); put_task_struct(work->task); } static int bpf_send_signal_common(u32 sig, enum pid_type type, struct task_struct *task, u64 value) { struct send_signal_irq_work *work = NULL; struct kernel_siginfo info; struct kernel_siginfo *siginfo; if (!task) { task = current; siginfo = SEND_SIG_PRIV; } else { clear_siginfo(&info); info.si_signo = sig; info.si_errno = 0; info.si_code = SI_KERNEL; info.si_pid = 0; info.si_uid = 0; info.si_value.sival_ptr = (void *)(unsigned long)value; siginfo = &info; } /* Similar to bpf_probe_write_user, task needs to be * in a sound condition and kernel memory access be * permitted in order to send signal to the current * task. */ if (unlikely(task->flags & (PF_KTHREAD | PF_EXITING))) return -EPERM; if (unlikely(!nmi_uaccess_okay())) return -EPERM; /* Task should not be pid=1 to avoid kernel panic. */ if (unlikely(is_global_init(task))) return -EPERM; if (preempt_count() != 0 || irqs_disabled()) { /* Do an early check on signal validity. Otherwise, * the error is lost in deferred irq_work. */ if (unlikely(!valid_signal(sig))) return -EINVAL; work = this_cpu_ptr(&send_signal_work); if (irq_work_is_busy(&work->irq_work)) return -EBUSY; /* Add the current task, which is the target of sending signal, * to the irq_work. The current task may change when queued * irq works get executed. 
*/ work->task = get_task_struct(task); work->has_siginfo = siginfo == &info; if (work->has_siginfo) copy_siginfo(&work->info, &info); work->sig = sig; work->type = type; irq_work_queue(&work->irq_work); return 0; } return group_send_sig_info(sig, siginfo, task, type); } BPF_CALL_1(bpf_send_signal, u32, sig) { return bpf_send_signal_common(sig, PIDTYPE_TGID, NULL, 0); } const struct bpf_func_proto bpf_send_signal_proto = { .func = bpf_send_signal, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_ANYTHING, }; BPF_CALL_1(bpf_send_signal_thread, u32, sig) { return bpf_send_signal_common(sig, PIDTYPE_PID, NULL, 0); } const struct bpf_func_proto bpf_send_signal_thread_proto = { .func = bpf_send_signal_thread, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_ANYTHING, }; BPF_CALL_3(bpf_d_path, const struct path *, path, char *, buf, u32, sz) { struct path copy; long len; char *p; if (!sz) return 0; /* * The path pointer is verified as trusted and safe to use, * but let's double check it's valid anyway to workaround * potentially broken verifier. */ len = copy_from_kernel_nofault(&copy, path, sizeof(*path)); if (len < 0) return len; p = d_path(&copy, buf, sz); if (IS_ERR(p)) { len = PTR_ERR(p); } else { len = buf + sz - p; memmove(buf, p, len); } return len; } BTF_SET_START(btf_allowlist_d_path) #ifdef CONFIG_SECURITY BTF_ID(func, security_file_permission) BTF_ID(func, security_inode_getattr) BTF_ID(func, security_file_open) #endif #ifdef CONFIG_SECURITY_PATH BTF_ID(func, security_path_truncate) #endif BTF_ID(func, vfs_truncate) BTF_ID(func, vfs_fallocate) BTF_ID(func, dentry_open) BTF_ID(func, vfs_getattr) BTF_ID(func, filp_close) BTF_SET_END(btf_allowlist_d_path) static bool bpf_d_path_allowed(const struct bpf_prog *prog) { if (prog->type == BPF_PROG_TYPE_TRACING && prog->expected_attach_type == BPF_TRACE_ITER) return true; if (prog->type == BPF_PROG_TYPE_LSM) return bpf_lsm_is_sleepable_hook(prog->aux->attach_btf_id); return btf_id_set_contains(&btf_allowlist_d_path, prog->aux->attach_btf_id); } BTF_ID_LIST_SINGLE(bpf_d_path_btf_ids, struct, path) static const struct bpf_func_proto bpf_d_path_proto = { .func = bpf_d_path, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_BTF_ID, .arg1_btf_id = &bpf_d_path_btf_ids[0], .arg2_type = ARG_PTR_TO_MEM | MEM_WRITE, .arg3_type = ARG_CONST_SIZE_OR_ZERO, .allowed = bpf_d_path_allowed, }; #define BTF_F_ALL (BTF_F_COMPACT | BTF_F_NONAME | \ BTF_F_PTR_RAW | BTF_F_ZERO) static int bpf_btf_printf_prepare(struct btf_ptr *ptr, u32 btf_ptr_size, u64 flags, const struct btf **btf, s32 *btf_id) { const struct btf_type *t; if (unlikely(flags & ~(BTF_F_ALL))) return -EINVAL; if (btf_ptr_size != sizeof(struct btf_ptr)) return -EINVAL; *btf = bpf_get_btf_vmlinux(); if (IS_ERR_OR_NULL(*btf)) return IS_ERR(*btf) ? 
PTR_ERR(*btf) : -EINVAL; if (ptr->type_id > 0) *btf_id = ptr->type_id; else return -EINVAL; if (*btf_id > 0) t = btf_type_by_id(*btf, *btf_id); if (*btf_id <= 0 || !t) return -ENOENT; return 0; } BPF_CALL_5(bpf_snprintf_btf, char *, str, u32, str_size, struct btf_ptr *, ptr, u32, btf_ptr_size, u64, flags) { const struct btf *btf; s32 btf_id; int ret; ret = bpf_btf_printf_prepare(ptr, btf_ptr_size, flags, &btf, &btf_id); if (ret) return ret; return btf_type_snprintf_show(btf, btf_id, ptr->ptr, str, str_size, flags); } const struct bpf_func_proto bpf_snprintf_btf_proto = { .func = bpf_snprintf_btf, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_MEM, .arg2_type = ARG_CONST_SIZE, .arg3_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg4_type = ARG_CONST_SIZE, .arg5_type = ARG_ANYTHING, }; BPF_CALL_1(bpf_get_func_ip_tracing, void *, ctx) { /* This helper call is inlined by verifier. */ return ((u64 *)ctx)[-2]; } static const struct bpf_func_proto bpf_get_func_ip_proto_tracing = { .func = bpf_get_func_ip_tracing, .gpl_only = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, }; static inline unsigned long get_entry_ip(unsigned long fentry_ip) { #ifdef CONFIG_X86_KERNEL_IBT if (is_endbr((void *)(fentry_ip - ENDBR_INSN_SIZE))) fentry_ip -= ENDBR_INSN_SIZE; #endif return fentry_ip; } BPF_CALL_1(bpf_get_func_ip_kprobe, struct pt_regs *, regs) { struct bpf_trace_run_ctx *run_ctx __maybe_unused; struct kprobe *kp; #ifdef CONFIG_UPROBES run_ctx = container_of(current->bpf_ctx, struct bpf_trace_run_ctx, run_ctx); if (run_ctx->is_uprobe) return ((struct uprobe_dispatch_data *)current->utask->vaddr)->bp_addr; #endif kp = kprobe_running(); if (!kp || !(kp->flags & KPROBE_FLAG_ON_FUNC_ENTRY)) return 0; return get_entry_ip((uintptr_t)kp->addr); } static const struct bpf_func_proto bpf_get_func_ip_proto_kprobe = { .func = bpf_get_func_ip_kprobe, .gpl_only = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, }; BPF_CALL_1(bpf_get_func_ip_kprobe_multi, struct pt_regs *, regs) { return bpf_kprobe_multi_entry_ip(current->bpf_ctx); } static const struct bpf_func_proto bpf_get_func_ip_proto_kprobe_multi = { .func = bpf_get_func_ip_kprobe_multi, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, }; BPF_CALL_1(bpf_get_attach_cookie_kprobe_multi, struct pt_regs *, regs) { return bpf_kprobe_multi_cookie(current->bpf_ctx); } static const struct bpf_func_proto bpf_get_attach_cookie_proto_kmulti = { .func = bpf_get_attach_cookie_kprobe_multi, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, }; BPF_CALL_1(bpf_get_func_ip_uprobe_multi, struct pt_regs *, regs) { return bpf_uprobe_multi_entry_ip(current->bpf_ctx); } static const struct bpf_func_proto bpf_get_func_ip_proto_uprobe_multi = { .func = bpf_get_func_ip_uprobe_multi, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, }; BPF_CALL_1(bpf_get_attach_cookie_uprobe_multi, struct pt_regs *, regs) { return bpf_uprobe_multi_cookie(current->bpf_ctx); } static const struct bpf_func_proto bpf_get_attach_cookie_proto_umulti = { .func = bpf_get_attach_cookie_uprobe_multi, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, }; BPF_CALL_1(bpf_get_attach_cookie_trace, void *, ctx) { struct bpf_trace_run_ctx *run_ctx; run_ctx = container_of(current->bpf_ctx, struct bpf_trace_run_ctx, run_ctx); return run_ctx->bpf_cookie; } static const struct bpf_func_proto bpf_get_attach_cookie_proto_trace = { .func = bpf_get_attach_cookie_trace, .gpl_only = false, .ret_type = 
RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, }; BPF_CALL_1(bpf_get_attach_cookie_pe, struct bpf_perf_event_data_kern *, ctx) { return ctx->event->bpf_cookie; } static const struct bpf_func_proto bpf_get_attach_cookie_proto_pe = { .func = bpf_get_attach_cookie_pe, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, }; BPF_CALL_1(bpf_get_attach_cookie_tracing, void *, ctx) { struct bpf_trace_run_ctx *run_ctx; run_ctx = container_of(current->bpf_ctx, struct bpf_trace_run_ctx, run_ctx); return run_ctx->bpf_cookie; } static const struct bpf_func_proto bpf_get_attach_cookie_proto_tracing = { .func = bpf_get_attach_cookie_tracing, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, }; BPF_CALL_3(bpf_get_branch_snapshot, void *, buf, u32, size, u64, flags) { static const u32 br_entry_size = sizeof(struct perf_branch_entry); u32 entry_cnt = size / br_entry_size; entry_cnt = static_call(perf_snapshot_branch_stack)(buf, entry_cnt); if (unlikely(flags)) return -EINVAL; if (!entry_cnt) return -ENOENT; return entry_cnt * br_entry_size; } const struct bpf_func_proto bpf_get_branch_snapshot_proto = { .func = bpf_get_branch_snapshot, .gpl_only = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_UNINIT_MEM, .arg2_type = ARG_CONST_SIZE_OR_ZERO, }; BPF_CALL_3(get_func_arg, void *, ctx, u32, n, u64 *, value) { /* This helper call is inlined by verifier. */ u64 nr_args = ((u64 *)ctx)[-1]; if ((u64) n >= nr_args) return -EINVAL; *value = ((u64 *)ctx)[n]; return 0; } static const struct bpf_func_proto bpf_get_func_arg_proto = { .func = get_func_arg, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_PTR_TO_FIXED_SIZE_MEM | MEM_UNINIT | MEM_WRITE | MEM_ALIGNED, .arg3_size = sizeof(u64), }; BPF_CALL_2(get_func_ret, void *, ctx, u64 *, value) { /* This helper call is inlined by verifier. */ u64 nr_args = ((u64 *)ctx)[-1]; *value = ((u64 *)ctx)[nr_args]; return 0; } static const struct bpf_func_proto bpf_get_func_ret_proto = { .func = get_func_ret, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_PTR_TO_FIXED_SIZE_MEM | MEM_UNINIT | MEM_WRITE | MEM_ALIGNED, .arg2_size = sizeof(u64), }; BPF_CALL_1(get_func_arg_cnt, void *, ctx) { /* This helper call is inlined by verifier. */ return ((u64 *)ctx)[-1]; } static const struct bpf_func_proto bpf_get_func_arg_cnt_proto = { .func = get_func_arg_cnt, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, }; static const struct bpf_func_proto * bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { const struct bpf_func_proto *func_proto; switch (func_id) { case BPF_FUNC_get_smp_processor_id: return &bpf_get_smp_processor_id_proto; #ifdef CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE case BPF_FUNC_probe_read: return security_locked_down(LOCKDOWN_BPF_READ_KERNEL) < 0 ? NULL : &bpf_probe_read_compat_proto; case BPF_FUNC_probe_read_str: return security_locked_down(LOCKDOWN_BPF_READ_KERNEL) < 0 ? NULL : &bpf_probe_read_compat_str_proto; #endif case BPF_FUNC_get_func_ip: return &bpf_get_func_ip_proto_tracing; default: break; } func_proto = bpf_base_func_proto(func_id, prog); if (func_proto) return func_proto; if (!bpf_token_capable(prog->aux->token, CAP_SYS_ADMIN)) return NULL; switch (func_id) { case BPF_FUNC_probe_write_user: return security_locked_down(LOCKDOWN_BPF_WRITE_USER) < 0 ? 
NULL : &bpf_probe_write_user_proto; default: return NULL; } } static bool is_kprobe_multi(const struct bpf_prog *prog) { return prog->expected_attach_type == BPF_TRACE_KPROBE_MULTI || prog->expected_attach_type == BPF_TRACE_KPROBE_SESSION; } static inline bool is_kprobe_session(const struct bpf_prog *prog) { return prog->expected_attach_type == BPF_TRACE_KPROBE_SESSION; } static inline bool is_uprobe_multi(const struct bpf_prog *prog) { return prog->expected_attach_type == BPF_TRACE_UPROBE_MULTI || prog->expected_attach_type == BPF_TRACE_UPROBE_SESSION; } static inline bool is_uprobe_session(const struct bpf_prog *prog) { return prog->expected_attach_type == BPF_TRACE_UPROBE_SESSION; } static const struct bpf_func_proto * kprobe_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { case BPF_FUNC_perf_event_output: return &bpf_perf_event_output_proto; case BPF_FUNC_get_stackid: return &bpf_get_stackid_proto; case BPF_FUNC_get_stack: return prog->sleepable ? &bpf_get_stack_sleepable_proto : &bpf_get_stack_proto; #ifdef CONFIG_BPF_KPROBE_OVERRIDE case BPF_FUNC_override_return: return &bpf_override_return_proto; #endif case BPF_FUNC_get_func_ip: if (is_kprobe_multi(prog)) return &bpf_get_func_ip_proto_kprobe_multi; if (is_uprobe_multi(prog)) return &bpf_get_func_ip_proto_uprobe_multi; return &bpf_get_func_ip_proto_kprobe; case BPF_FUNC_get_attach_cookie: if (is_kprobe_multi(prog)) return &bpf_get_attach_cookie_proto_kmulti; if (is_uprobe_multi(prog)) return &bpf_get_attach_cookie_proto_umulti; return &bpf_get_attach_cookie_proto_trace; default: return bpf_tracing_func_proto(func_id, prog); } } /* bpf+kprobe programs can access fields of 'struct pt_regs' */ static bool kprobe_prog_is_valid_access(int off, int size, enum bpf_access_type type, const struct bpf_prog *prog, struct bpf_insn_access_aux *info) { if (off < 0 || off >= sizeof(struct pt_regs)) return false; if (off % size != 0) return false; /* * Assertion for 32 bit to make sure last 8 byte access * (BPF_DW) to the last 4 byte member is disallowed. */ if (off + size > sizeof(struct pt_regs)) return false; if (type == BPF_WRITE) prog->aux->kprobe_write_ctx = true; return true; } const struct bpf_verifier_ops kprobe_verifier_ops = { .get_func_proto = kprobe_prog_func_proto, .is_valid_access = kprobe_prog_is_valid_access, }; const struct bpf_prog_ops kprobe_prog_ops = { }; BPF_CALL_5(bpf_perf_event_output_tp, void *, tp_buff, struct bpf_map *, map, u64, flags, void *, data, u64, size) { struct pt_regs *regs = *(struct pt_regs **)tp_buff; /* * r1 points to perf tracepoint buffer where first 8 bytes are hidden * from bpf program and contain a pointer to 'struct pt_regs'. Fetch it * from there and call the same bpf_perf_event_output() helper inline. */ return ____bpf_perf_event_output(regs, map, flags, data, size); } static const struct bpf_func_proto bpf_perf_event_output_proto_tp = { .func = bpf_perf_event_output_tp, .gpl_only = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_CONST_MAP_PTR, .arg3_type = ARG_ANYTHING, .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg5_type = ARG_CONST_SIZE_OR_ZERO, }; BPF_CALL_3(bpf_get_stackid_tp, void *, tp_buff, struct bpf_map *, map, u64, flags) { struct pt_regs *regs = *(struct pt_regs **)tp_buff; /* * Same comment as in bpf_perf_event_output_tp(), only that this time * the other helper's function body cannot be inlined due to being * external, thus we need to call raw helper function. 
*/ return bpf_get_stackid((unsigned long) regs, (unsigned long) map, flags, 0, 0); } static const struct bpf_func_proto bpf_get_stackid_proto_tp = { .func = bpf_get_stackid_tp, .gpl_only = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_CONST_MAP_PTR, .arg3_type = ARG_ANYTHING, }; BPF_CALL_4(bpf_get_stack_tp, void *, tp_buff, void *, buf, u32, size, u64, flags) { struct pt_regs *regs = *(struct pt_regs **)tp_buff; return bpf_get_stack((unsigned long) regs, (unsigned long) buf, (unsigned long) size, flags, 0); } static const struct bpf_func_proto bpf_get_stack_proto_tp = { .func = bpf_get_stack_tp, .gpl_only = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_PTR_TO_UNINIT_MEM, .arg3_type = ARG_CONST_SIZE_OR_ZERO, .arg4_type = ARG_ANYTHING, }; static const struct bpf_func_proto * tp_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { case BPF_FUNC_perf_event_output: return &bpf_perf_event_output_proto_tp; case BPF_FUNC_get_stackid: return &bpf_get_stackid_proto_tp; case BPF_FUNC_get_stack: return &bpf_get_stack_proto_tp; case BPF_FUNC_get_attach_cookie: return &bpf_get_attach_cookie_proto_trace; default: return bpf_tracing_func_proto(func_id, prog); } } static bool tp_prog_is_valid_access(int off, int size, enum bpf_access_type type, const struct bpf_prog *prog, struct bpf_insn_access_aux *info) { if (off < sizeof(void *) || off >= PERF_MAX_TRACE_SIZE) return false; if (type != BPF_READ) return false; if (off % size != 0) return false; BUILD_BUG_ON(PERF_MAX_TRACE_SIZE % sizeof(__u64)); return true; } const struct bpf_verifier_ops tracepoint_verifier_ops = { .get_func_proto = tp_prog_func_proto, .is_valid_access = tp_prog_is_valid_access, }; const struct bpf_prog_ops tracepoint_prog_ops = { }; BPF_CALL_3(bpf_perf_prog_read_value, struct bpf_perf_event_data_kern *, ctx, struct bpf_perf_event_value *, buf, u32, size) { int err = -EINVAL; if (unlikely(size != sizeof(struct bpf_perf_event_value))) goto clear; err = perf_event_read_local(ctx->event, &buf->counter, &buf->enabled, &buf->running); if (unlikely(err)) goto clear; return 0; clear: memset(buf, 0, size); return err; } static const struct bpf_func_proto bpf_perf_prog_read_value_proto = { .func = bpf_perf_prog_read_value, .gpl_only = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_PTR_TO_UNINIT_MEM, .arg3_type = ARG_CONST_SIZE, }; BPF_CALL_4(bpf_read_branch_records, struct bpf_perf_event_data_kern *, ctx, void *, buf, u32, size, u64, flags) { static const u32 br_entry_size = sizeof(struct perf_branch_entry); struct perf_branch_stack *br_stack = ctx->data->br_stack; u32 to_copy; if (unlikely(flags & ~BPF_F_GET_BRANCH_RECORDS_SIZE)) return -EINVAL; if (unlikely(!(ctx->data->sample_flags & PERF_SAMPLE_BRANCH_STACK))) return -ENOENT; if (unlikely(!br_stack)) return -ENOENT; if (flags & BPF_F_GET_BRANCH_RECORDS_SIZE) return br_stack->nr * br_entry_size; if (!buf || (size % br_entry_size != 0)) return -EINVAL; to_copy = min_t(u32, br_stack->nr * br_entry_size, size); memcpy(buf, br_stack->entries, to_copy); return to_copy; } static const struct bpf_func_proto bpf_read_branch_records_proto = { .func = bpf_read_branch_records, .gpl_only = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_PTR_TO_MEM_OR_NULL, .arg3_type = ARG_CONST_SIZE_OR_ZERO, .arg4_type = ARG_ANYTHING, }; static const struct bpf_func_proto * pe_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch 
(func_id) { case BPF_FUNC_perf_event_output: return &bpf_perf_event_output_proto_tp; case BPF_FUNC_get_stackid: return &bpf_get_stackid_proto_pe; case BPF_FUNC_get_stack: return &bpf_get_stack_proto_pe; case BPF_FUNC_perf_prog_read_value: return &bpf_perf_prog_read_value_proto; case BPF_FUNC_read_branch_records: return &bpf_read_branch_records_proto; case BPF_FUNC_get_attach_cookie: return &bpf_get_attach_cookie_proto_pe; default: return bpf_tracing_func_proto(func_id, prog); } } /* * bpf_raw_tp_regs are separate from bpf_pt_regs used from skb/xdp * to avoid potential recursive reuse issue when/if tracepoints are added * inside bpf_*_event_output, bpf_get_stackid and/or bpf_get_stack. * * Since raw tracepoints run despite bpf_prog_active, support concurrent usage * in normal, irq, and nmi context. */ struct bpf_raw_tp_regs { struct pt_regs regs[3]; }; static DEFINE_PER_CPU(struct bpf_raw_tp_regs, bpf_raw_tp_regs); static DEFINE_PER_CPU(int, bpf_raw_tp_nest_level); static struct pt_regs *get_bpf_raw_tp_regs(void) { struct bpf_raw_tp_regs *tp_regs = this_cpu_ptr(&bpf_raw_tp_regs); int nest_level = this_cpu_inc_return(bpf_raw_tp_nest_level); if (nest_level > ARRAY_SIZE(tp_regs->regs)) { this_cpu_dec(bpf_raw_tp_nest_level); return ERR_PTR(-EBUSY); } return &tp_regs->regs[nest_level - 1]; } static void put_bpf_raw_tp_regs(void) { this_cpu_dec(bpf_raw_tp_nest_level); } BPF_CALL_5(bpf_perf_event_output_raw_tp, struct bpf_raw_tracepoint_args *, args, struct bpf_map *, map, u64, flags, void *, data, u64, size) { struct pt_regs *regs = get_bpf_raw_tp_regs(); int ret; if (IS_ERR(regs)) return PTR_ERR(regs); perf_fetch_caller_regs(regs); ret = ____bpf_perf_event_output(regs, map, flags, data, size); put_bpf_raw_tp_regs(); return ret; } static const struct bpf_func_proto bpf_perf_event_output_proto_raw_tp = { .func = bpf_perf_event_output_raw_tp, .gpl_only = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_CONST_MAP_PTR, .arg3_type = ARG_ANYTHING, .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg5_type = ARG_CONST_SIZE_OR_ZERO, }; extern const struct bpf_func_proto bpf_skb_output_proto; extern const struct bpf_func_proto bpf_xdp_output_proto; extern const struct bpf_func_proto bpf_xdp_get_buff_len_trace_proto; BPF_CALL_3(bpf_get_stackid_raw_tp, struct bpf_raw_tracepoint_args *, args, struct bpf_map *, map, u64, flags) { struct pt_regs *regs = get_bpf_raw_tp_regs(); int ret; if (IS_ERR(regs)) return PTR_ERR(regs); perf_fetch_caller_regs(regs); /* similar to bpf_perf_event_output_tp, but pt_regs fetched differently */ ret = bpf_get_stackid((unsigned long) regs, (unsigned long) map, flags, 0, 0); put_bpf_raw_tp_regs(); return ret; } static const struct bpf_func_proto bpf_get_stackid_proto_raw_tp = { .func = bpf_get_stackid_raw_tp, .gpl_only = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_CONST_MAP_PTR, .arg3_type = ARG_ANYTHING, }; BPF_CALL_4(bpf_get_stack_raw_tp, struct bpf_raw_tracepoint_args *, args, void *, buf, u32, size, u64, flags) { struct pt_regs *regs = get_bpf_raw_tp_regs(); int ret; if (IS_ERR(regs)) return PTR_ERR(regs); perf_fetch_caller_regs(regs); ret = bpf_get_stack((unsigned long) regs, (unsigned long) buf, (unsigned long) size, flags, 0); put_bpf_raw_tp_regs(); return ret; } static const struct bpf_func_proto bpf_get_stack_proto_raw_tp = { .func = bpf_get_stack_raw_tp, .gpl_only = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE_OR_ZERO, .arg4_type = 
ARG_ANYTHING, }; static const struct bpf_func_proto * raw_tp_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { case BPF_FUNC_perf_event_output: return &bpf_perf_event_output_proto_raw_tp; case BPF_FUNC_get_stackid: return &bpf_get_stackid_proto_raw_tp; case BPF_FUNC_get_stack: return &bpf_get_stack_proto_raw_tp; case BPF_FUNC_get_attach_cookie: return &bpf_get_attach_cookie_proto_tracing; default: return bpf_tracing_func_proto(func_id, prog); } } const struct bpf_func_proto * tracing_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { const struct bpf_func_proto *fn; switch (func_id) { #ifdef CONFIG_NET case BPF_FUNC_skb_output: return &bpf_skb_output_proto; case BPF_FUNC_xdp_output: return &bpf_xdp_output_proto; case BPF_FUNC_skc_to_tcp6_sock: return &bpf_skc_to_tcp6_sock_proto; case BPF_FUNC_skc_to_tcp_sock: return &bpf_skc_to_tcp_sock_proto; case BPF_FUNC_skc_to_tcp_timewait_sock: return &bpf_skc_to_tcp_timewait_sock_proto; case BPF_FUNC_skc_to_tcp_request_sock: return &bpf_skc_to_tcp_request_sock_proto; case BPF_FUNC_skc_to_udp6_sock: return &bpf_skc_to_udp6_sock_proto; case BPF_FUNC_skc_to_unix_sock: return &bpf_skc_to_unix_sock_proto; case BPF_FUNC_skc_to_mptcp_sock: return &bpf_skc_to_mptcp_sock_proto; case BPF_FUNC_sk_storage_get: return &bpf_sk_storage_get_tracing_proto; case BPF_FUNC_sk_storage_delete: return &bpf_sk_storage_delete_tracing_proto; case BPF_FUNC_sock_from_file: return &bpf_sock_from_file_proto; case BPF_FUNC_get_socket_cookie: return &bpf_get_socket_ptr_cookie_proto; case BPF_FUNC_xdp_get_buff_len: return &bpf_xdp_get_buff_len_trace_proto; #endif case BPF_FUNC_seq_printf: return prog->expected_attach_type == BPF_TRACE_ITER ? &bpf_seq_printf_proto : NULL; case BPF_FUNC_seq_write: return prog->expected_attach_type == BPF_TRACE_ITER ? &bpf_seq_write_proto : NULL; case BPF_FUNC_seq_printf_btf: return prog->expected_attach_type == BPF_TRACE_ITER ? &bpf_seq_printf_btf_proto : NULL; case BPF_FUNC_d_path: return &bpf_d_path_proto; case BPF_FUNC_get_func_arg: return bpf_prog_has_trampoline(prog) ? &bpf_get_func_arg_proto : NULL; case BPF_FUNC_get_func_ret: return bpf_prog_has_trampoline(prog) ? &bpf_get_func_ret_proto : NULL; case BPF_FUNC_get_func_arg_cnt: return bpf_prog_has_trampoline(prog) ? &bpf_get_func_arg_cnt_proto : NULL; case BPF_FUNC_get_attach_cookie: if (prog->type == BPF_PROG_TYPE_TRACING && prog->expected_attach_type == BPF_TRACE_RAW_TP) return &bpf_get_attach_cookie_proto_tracing; return bpf_prog_has_trampoline(prog) ? 
&bpf_get_attach_cookie_proto_tracing : NULL; default: fn = raw_tp_prog_func_proto(func_id, prog); if (!fn && prog->expected_attach_type == BPF_TRACE_ITER) fn = bpf_iter_get_func_proto(func_id, prog); return fn; } } static bool raw_tp_prog_is_valid_access(int off, int size, enum bpf_access_type type, const struct bpf_prog *prog, struct bpf_insn_access_aux *info) { return bpf_tracing_ctx_access(off, size, type); } static bool tracing_prog_is_valid_access(int off, int size, enum bpf_access_type type, const struct bpf_prog *prog, struct bpf_insn_access_aux *info) { return bpf_tracing_btf_ctx_access(off, size, type, prog, info); } int __weak bpf_prog_test_run_tracing(struct bpf_prog *prog, const union bpf_attr *kattr, union bpf_attr __user *uattr) { return -ENOTSUPP; } const struct bpf_verifier_ops raw_tracepoint_verifier_ops = { .get_func_proto = raw_tp_prog_func_proto, .is_valid_access = raw_tp_prog_is_valid_access, }; const struct bpf_prog_ops raw_tracepoint_prog_ops = { #ifdef CONFIG_NET .test_run = bpf_prog_test_run_raw_tp, #endif }; const struct bpf_verifier_ops tracing_verifier_ops = { .get_func_proto = tracing_prog_func_proto, .is_valid_access = tracing_prog_is_valid_access, }; const struct bpf_prog_ops tracing_prog_ops = { .test_run = bpf_prog_test_run_tracing, }; static bool raw_tp_writable_prog_is_valid_access(int off, int size, enum bpf_access_type type, const struct bpf_prog *prog, struct bpf_insn_access_aux *info) { if (off == 0) { if (size != sizeof(u64) || type != BPF_READ) return false; info->reg_type = PTR_TO_TP_BUFFER; } return raw_tp_prog_is_valid_access(off, size, type, prog, info); } const struct bpf_verifier_ops raw_tracepoint_writable_verifier_ops = { .get_func_proto = raw_tp_prog_func_proto, .is_valid_access = raw_tp_writable_prog_is_valid_access, }; const struct bpf_prog_ops raw_tracepoint_writable_prog_ops = { }; static bool pe_prog_is_valid_access(int off, int size, enum bpf_access_type type, const struct bpf_prog *prog, struct bpf_insn_access_aux *info) { const int size_u64 = sizeof(u64); if (off < 0 || off >= sizeof(struct bpf_perf_event_data)) return false; if (type != BPF_READ) return false; if (off % size != 0) { if (sizeof(unsigned long) != 4) return false; if (size != 8) return false; if (off % size != 4) return false; } switch (off) { case bpf_ctx_range(struct bpf_perf_event_data, sample_period): bpf_ctx_record_field_size(info, size_u64); if (!bpf_ctx_narrow_access_ok(off, size, size_u64)) return false; break; case bpf_ctx_range(struct bpf_perf_event_data, addr): bpf_ctx_record_field_size(info, size_u64); if (!bpf_ctx_narrow_access_ok(off, size, size_u64)) return false; break; default: if (size != sizeof(long)) return false; } return true; } static u32 pe_prog_convert_ctx_access(enum bpf_access_type type, const struct bpf_insn *si, struct bpf_insn *insn_buf, struct bpf_prog *prog, u32 *target_size) { struct bpf_insn *insn = insn_buf; switch (si->off) { case offsetof(struct bpf_perf_event_data, sample_period): *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_perf_event_data_kern, data), si->dst_reg, si->src_reg, offsetof(struct bpf_perf_event_data_kern, data)); *insn++ = BPF_LDX_MEM(BPF_DW, si->dst_reg, si->dst_reg, bpf_target_off(struct perf_sample_data, period, 8, target_size)); break; case offsetof(struct bpf_perf_event_data, addr): *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_perf_event_data_kern, data), si->dst_reg, si->src_reg, offsetof(struct bpf_perf_event_data_kern, data)); *insn++ = BPF_LDX_MEM(BPF_DW, si->dst_reg, si->dst_reg, 
bpf_target_off(struct perf_sample_data, addr, 8, target_size)); break; default: *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_perf_event_data_kern, regs), si->dst_reg, si->src_reg, offsetof(struct bpf_perf_event_data_kern, regs)); *insn++ = BPF_LDX_MEM(BPF_SIZEOF(long), si->dst_reg, si->dst_reg, si->off); break; } return insn - insn_buf; } const struct bpf_verifier_ops perf_event_verifier_ops = { .get_func_proto = pe_prog_func_proto, .is_valid_access = pe_prog_is_valid_access, .convert_ctx_access = pe_prog_convert_ctx_access, }; const struct bpf_prog_ops perf_event_prog_ops = { }; static DEFINE_MUTEX(bpf_event_mutex); #define BPF_TRACE_MAX_PROGS 64 int perf_event_attach_bpf_prog(struct perf_event *event, struct bpf_prog *prog, u64 bpf_cookie) { struct bpf_prog_array *old_array; struct bpf_prog_array *new_array; int ret = -EEXIST; /* * Kprobe override only works if they are on the function entry, * and only if they are on the opt-in list. */ if (prog->kprobe_override && (!trace_kprobe_on_func_entry(event->tp_event) || !trace_kprobe_error_injectable(event->tp_event))) return -EINVAL; mutex_lock(&bpf_event_mutex); if (event->prog) goto unlock; old_array = bpf_event_rcu_dereference(event->tp_event->prog_array); if (old_array && bpf_prog_array_length(old_array) >= BPF_TRACE_MAX_PROGS) { ret = -E2BIG; goto unlock; } ret = bpf_prog_array_copy(old_array, NULL, prog, bpf_cookie, &new_array); if (ret < 0) goto unlock; /* set the new array to event->tp_event and set event->prog */ event->prog = prog; event->bpf_cookie = bpf_cookie; rcu_assign_pointer(event->tp_event->prog_array, new_array); bpf_prog_array_free_sleepable(old_array); unlock: mutex_unlock(&bpf_event_mutex); return ret; } void perf_event_detach_bpf_prog(struct perf_event *event) { struct bpf_prog_array *old_array; struct bpf_prog_array *new_array; struct bpf_prog *prog = NULL; int ret; mutex_lock(&bpf_event_mutex); if (!event->prog) goto unlock; old_array = bpf_event_rcu_dereference(event->tp_event->prog_array); if (!old_array) goto put; ret = bpf_prog_array_copy(old_array, event->prog, NULL, 0, &new_array); if (ret < 0) { bpf_prog_array_delete_safe(old_array, event->prog); } else { rcu_assign_pointer(event->tp_event->prog_array, new_array); bpf_prog_array_free_sleepable(old_array); } put: prog = event->prog; event->prog = NULL; unlock: mutex_unlock(&bpf_event_mutex); if (prog) { /* * It could be that the bpf_prog is not sleepable (and will be freed * via normal RCU), but is called from a point that supports sleepable * programs and uses tasks-trace-RCU. */ synchronize_rcu_tasks_trace(); bpf_prog_put(prog); } } int perf_event_query_prog_array(struct perf_event *event, void __user *info) { struct perf_event_query_bpf __user *uquery = info; struct perf_event_query_bpf query = {}; struct bpf_prog_array *progs; u32 *ids, prog_cnt, ids_len; int ret; if (!perfmon_capable()) return -EPERM; if (event->attr.type != PERF_TYPE_TRACEPOINT) return -EINVAL; if (copy_from_user(&query, uquery, sizeof(query))) return -EFAULT; ids_len = query.ids_len; if (ids_len > BPF_TRACE_MAX_PROGS) return -E2BIG; ids = kcalloc(ids_len, sizeof(u32), GFP_USER | __GFP_NOWARN); if (!ids) return -ENOMEM; /* * The above kcalloc returns ZERO_SIZE_PTR when ids_len = 0, which * is required when user only wants to check for uquery->prog_cnt. * There is no need to check for it since the case is handled * gracefully in bpf_prog_array_copy_info. 
*/ mutex_lock(&bpf_event_mutex); progs = bpf_event_rcu_dereference(event->tp_event->prog_array); ret = bpf_prog_array_copy_info(progs, ids, ids_len, &prog_cnt); mutex_unlock(&bpf_event_mutex); if (copy_to_user(&uquery->prog_cnt, &prog_cnt, sizeof(prog_cnt)) || copy_to_user(uquery->ids, ids, ids_len * sizeof(u32))) ret = -EFAULT; kfree(ids); return ret; } extern struct bpf_raw_event_map __start__bpf_raw_tp[]; extern struct bpf_raw_event_map __stop__bpf_raw_tp[]; struct bpf_raw_event_map *bpf_get_raw_tracepoint(const char *name) { struct bpf_raw_event_map *btp = __start__bpf_raw_tp; for (; btp < __stop__bpf_raw_tp; btp++) { if (!strcmp(btp->tp->name, name)) return btp; } return bpf_get_raw_tracepoint_module(name); } void bpf_put_raw_tracepoint(struct bpf_raw_event_map *btp) { struct module *mod; guard(rcu)(); mod = __module_address((unsigned long)btp); module_put(mod); } static __always_inline void __bpf_trace_run(struct bpf_raw_tp_link *link, u64 *args) { struct bpf_prog *prog = link->link.prog; struct bpf_run_ctx *old_run_ctx; struct bpf_trace_run_ctx run_ctx; cant_sleep(); if (unlikely(this_cpu_inc_return(*(prog->active)) != 1)) { bpf_prog_inc_misses_counter(prog); goto out; } run_ctx.bpf_cookie = link->cookie; old_run_ctx = bpf_set_run_ctx(&run_ctx.run_ctx); rcu_read_lock(); (void) bpf_prog_run(prog, args); rcu_read_unlock(); bpf_reset_run_ctx(old_run_ctx); out: this_cpu_dec(*(prog->active)); } #define UNPACK(...) __VA_ARGS__ #define REPEAT_1(FN, DL, X, ...) FN(X) #define REPEAT_2(FN, DL, X, ...) FN(X) UNPACK DL REPEAT_1(FN, DL, __VA_ARGS__) #define REPEAT_3(FN, DL, X, ...) FN(X) UNPACK DL REPEAT_2(FN, DL, __VA_ARGS__) #define REPEAT_4(FN, DL, X, ...) FN(X) UNPACK DL REPEAT_3(FN, DL, __VA_ARGS__) #define REPEAT_5(FN, DL, X, ...) FN(X) UNPACK DL REPEAT_4(FN, DL, __VA_ARGS__) #define REPEAT_6(FN, DL, X, ...) FN(X) UNPACK DL REPEAT_5(FN, DL, __VA_ARGS__) #define REPEAT_7(FN, DL, X, ...) FN(X) UNPACK DL REPEAT_6(FN, DL, __VA_ARGS__) #define REPEAT_8(FN, DL, X, ...) FN(X) UNPACK DL REPEAT_7(FN, DL, __VA_ARGS__) #define REPEAT_9(FN, DL, X, ...) FN(X) UNPACK DL REPEAT_8(FN, DL, __VA_ARGS__) #define REPEAT_10(FN, DL, X, ...) FN(X) UNPACK DL REPEAT_9(FN, DL, __VA_ARGS__) #define REPEAT_11(FN, DL, X, ...) FN(X) UNPACK DL REPEAT_10(FN, DL, __VA_ARGS__) #define REPEAT_12(FN, DL, X, ...) FN(X) UNPACK DL REPEAT_11(FN, DL, __VA_ARGS__) #define REPEAT(X, FN, DL, ...) 
REPEAT_##X(FN, DL, __VA_ARGS__) #define SARG(X) u64 arg##X #define COPY(X) args[X] = arg##X #define __DL_COM (,) #define __DL_SEM (;) #define __SEQ_0_11 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11 #define BPF_TRACE_DEFN_x(x) \ void bpf_trace_run##x(struct bpf_raw_tp_link *link, \ REPEAT(x, SARG, __DL_COM, __SEQ_0_11)) \ { \ u64 args[x]; \ REPEAT(x, COPY, __DL_SEM, __SEQ_0_11); \ __bpf_trace_run(link, args); \ } \ EXPORT_SYMBOL_GPL(bpf_trace_run##x) BPF_TRACE_DEFN_x(1); BPF_TRACE_DEFN_x(2); BPF_TRACE_DEFN_x(3); BPF_TRACE_DEFN_x(4); BPF_TRACE_DEFN_x(5); BPF_TRACE_DEFN_x(6); BPF_TRACE_DEFN_x(7); BPF_TRACE_DEFN_x(8); BPF_TRACE_DEFN_x(9); BPF_TRACE_DEFN_x(10); BPF_TRACE_DEFN_x(11); BPF_TRACE_DEFN_x(12); int bpf_probe_register(struct bpf_raw_event_map *btp, struct bpf_raw_tp_link *link) { struct tracepoint *tp = btp->tp; struct bpf_prog *prog = link->link.prog; /* * check that program doesn't access arguments beyond what's * available in this tracepoint */ if (prog->aux->max_ctx_offset > btp->num_args * sizeof(u64)) return -EINVAL; if (prog->aux->max_tp_access > btp->writable_size) return -EINVAL; return tracepoint_probe_register_may_exist(tp, (void *)btp->bpf_func, link); } int bpf_probe_unregister(struct bpf_raw_event_map *btp, struct bpf_raw_tp_link *link) { return tracepoint_probe_unregister(btp->tp, (void *)btp->bpf_func, link); } int bpf_get_perf_event_info(const struct perf_event *event, u32 *prog_id, u32 *fd_type, const char **buf, u64 *probe_offset, u64 *probe_addr, unsigned long *missed) { bool is_tracepoint, is_syscall_tp; struct bpf_prog *prog; int flags, err = 0; prog = event->prog; if (!prog) return -ENOENT; /* not supporting BPF_PROG_TYPE_PERF_EVENT yet */ if (prog->type == BPF_PROG_TYPE_PERF_EVENT) return -EOPNOTSUPP; *prog_id = prog->aux->id; flags = event->tp_event->flags; is_tracepoint = flags & TRACE_EVENT_FL_TRACEPOINT; is_syscall_tp = is_syscall_trace_event(event->tp_event); if (is_tracepoint || is_syscall_tp) { *buf = is_tracepoint ? 
event->tp_event->tp->name : event->tp_event->name; /* We allow NULL pointer for tracepoint */ if (fd_type) *fd_type = BPF_FD_TYPE_TRACEPOINT; if (probe_offset) *probe_offset = 0x0; if (probe_addr) *probe_addr = 0x0; } else { /* kprobe/uprobe */ err = -EOPNOTSUPP; #ifdef CONFIG_KPROBE_EVENTS if (flags & TRACE_EVENT_FL_KPROBE) err = bpf_get_kprobe_info(event, fd_type, buf, probe_offset, probe_addr, missed, event->attr.type == PERF_TYPE_TRACEPOINT); #endif #ifdef CONFIG_UPROBE_EVENTS if (flags & TRACE_EVENT_FL_UPROBE) err = bpf_get_uprobe_info(event, fd_type, buf, probe_offset, probe_addr, event->attr.type == PERF_TYPE_TRACEPOINT); #endif } return err; } static int __init send_signal_irq_work_init(void) { int cpu; struct send_signal_irq_work *work; for_each_possible_cpu(cpu) { work = per_cpu_ptr(&send_signal_work, cpu); init_irq_work(&work->irq_work, do_bpf_send_signal); } return 0; } subsys_initcall(send_signal_irq_work_init); #ifdef CONFIG_MODULES static int bpf_event_notify(struct notifier_block *nb, unsigned long op, void *module) { struct bpf_trace_module *btm, *tmp; struct module *mod = module; int ret = 0; if (mod->num_bpf_raw_events == 0 || (op != MODULE_STATE_COMING && op != MODULE_STATE_GOING)) goto out; mutex_lock(&bpf_module_mutex); switch (op) { case MODULE_STATE_COMING: btm = kzalloc(sizeof(*btm), GFP_KERNEL); if (btm) { btm->module = module; list_add(&btm->list, &bpf_trace_modules); } else { ret = -ENOMEM; } break; case MODULE_STATE_GOING: list_for_each_entry_safe(btm, tmp, &bpf_trace_modules, list) { if (btm->module == module) { list_del(&btm->list); kfree(btm); break; } } break; } mutex_unlock(&bpf_module_mutex); out: return notifier_from_errno(ret); } static struct notifier_block bpf_module_nb = { .notifier_call = bpf_event_notify, }; static int __init bpf_event_init(void) { register_module_notifier(&bpf_module_nb); return 0; } fs_initcall(bpf_event_init); #endif /* CONFIG_MODULES */ struct bpf_session_run_ctx { struct bpf_run_ctx run_ctx; bool is_return; void *data; }; #ifdef CONFIG_FPROBE struct bpf_kprobe_multi_link { struct bpf_link link; struct fprobe fp; unsigned long *addrs; u64 *cookies; u32 cnt; u32 mods_cnt; struct module **mods; }; struct bpf_kprobe_multi_run_ctx { struct bpf_session_run_ctx session_ctx; struct bpf_kprobe_multi_link *link; unsigned long entry_ip; }; struct user_syms { const char **syms; char *buf; }; #ifndef CONFIG_HAVE_FTRACE_REGS_HAVING_PT_REGS static DEFINE_PER_CPU(struct pt_regs, bpf_kprobe_multi_pt_regs); #define bpf_kprobe_multi_pt_regs_ptr() this_cpu_ptr(&bpf_kprobe_multi_pt_regs) #else #define bpf_kprobe_multi_pt_regs_ptr() (NULL) #endif static unsigned long ftrace_get_entry_ip(unsigned long fentry_ip) { unsigned long ip = ftrace_get_symaddr(fentry_ip); return ip ? 
: fentry_ip; } static int copy_user_syms(struct user_syms *us, unsigned long __user *usyms, u32 cnt) { unsigned long __user usymbol; const char **syms = NULL; char *buf = NULL, *p; int err = -ENOMEM; unsigned int i; syms = kvmalloc_array(cnt, sizeof(*syms), GFP_KERNEL); if (!syms) goto error; buf = kvmalloc_array(cnt, KSYM_NAME_LEN, GFP_KERNEL); if (!buf) goto error; for (p = buf, i = 0; i < cnt; i++) { if (__get_user(usymbol, usyms + i)) { err = -EFAULT; goto error; } err = strncpy_from_user(p, (const char __user *) usymbol, KSYM_NAME_LEN); if (err == KSYM_NAME_LEN) err = -E2BIG; if (err < 0) goto error; syms[i] = p; p += err + 1; } us->syms = syms; us->buf = buf; return 0; error: if (err) { kvfree(syms); kvfree(buf); } return err; } static void kprobe_multi_put_modules(struct module **mods, u32 cnt) { u32 i; for (i = 0; i < cnt; i++) module_put(mods[i]); } static void free_user_syms(struct user_syms *us) { kvfree(us->syms); kvfree(us->buf); } static void bpf_kprobe_multi_link_release(struct bpf_link *link) { struct bpf_kprobe_multi_link *kmulti_link; kmulti_link = container_of(link, struct bpf_kprobe_multi_link, link); unregister_fprobe(&kmulti_link->fp); kprobe_multi_put_modules(kmulti_link->mods, kmulti_link->mods_cnt); } static void bpf_kprobe_multi_link_dealloc(struct bpf_link *link) { struct bpf_kprobe_multi_link *kmulti_link; kmulti_link = container_of(link, struct bpf_kprobe_multi_link, link); kvfree(kmulti_link->addrs); kvfree(kmulti_link->cookies); kfree(kmulti_link->mods); kfree(kmulti_link); } static int bpf_kprobe_multi_link_fill_link_info(const struct bpf_link *link, struct bpf_link_info *info) { u64 __user *ucookies = u64_to_user_ptr(info->kprobe_multi.cookies); u64 __user *uaddrs = u64_to_user_ptr(info->kprobe_multi.addrs); struct bpf_kprobe_multi_link *kmulti_link; u32 ucount = info->kprobe_multi.count; int err = 0, i; if (!uaddrs ^ !ucount) return -EINVAL; if (ucookies && !ucount) return -EINVAL; kmulti_link = container_of(link, struct bpf_kprobe_multi_link, link); info->kprobe_multi.count = kmulti_link->cnt; info->kprobe_multi.flags = kmulti_link->link.flags; info->kprobe_multi.missed = kmulti_link->fp.nmissed; if (!uaddrs) return 0; if (ucount < kmulti_link->cnt) err = -ENOSPC; else ucount = kmulti_link->cnt; if (ucookies) { if (kmulti_link->cookies) { if (copy_to_user(ucookies, kmulti_link->cookies, ucount * sizeof(u64))) return -EFAULT; } else { for (i = 0; i < ucount; i++) { if (put_user(0, ucookies + i)) return -EFAULT; } } } if (kallsyms_show_value(current_cred())) { if (copy_to_user(uaddrs, kmulti_link->addrs, ucount * sizeof(u64))) return -EFAULT; } else { for (i = 0; i < ucount; i++) { if (put_user(0, uaddrs + i)) return -EFAULT; } } return err; } #ifdef CONFIG_PROC_FS static void bpf_kprobe_multi_show_fdinfo(const struct bpf_link *link, struct seq_file *seq) { struct bpf_kprobe_multi_link *kmulti_link; kmulti_link = container_of(link, struct bpf_kprobe_multi_link, link); seq_printf(seq, "kprobe_cnt:\t%u\n" "missed:\t%lu\n", kmulti_link->cnt, kmulti_link->fp.nmissed); seq_printf(seq, "%s\t %s\n", "cookie", "func"); for (int i = 0; i < kmulti_link->cnt; i++) { seq_printf(seq, "%llu\t %pS\n", kmulti_link->cookies[i], (void *)kmulti_link->addrs[i]); } } #endif static const struct bpf_link_ops bpf_kprobe_multi_link_lops = { .release = bpf_kprobe_multi_link_release, .dealloc_deferred = bpf_kprobe_multi_link_dealloc, .fill_link_info = bpf_kprobe_multi_link_fill_link_info, #ifdef CONFIG_PROC_FS .show_fdinfo = bpf_kprobe_multi_show_fdinfo, #endif }; static void 
bpf_kprobe_multi_cookie_swap(void *a, void *b, int size, const void *priv) { const struct bpf_kprobe_multi_link *link = priv; unsigned long *addr_a = a, *addr_b = b; u64 *cookie_a, *cookie_b; cookie_a = link->cookies + (addr_a - link->addrs); cookie_b = link->cookies + (addr_b - link->addrs); /* swap addr_a/addr_b and cookie_a/cookie_b values */ swap(*addr_a, *addr_b); swap(*cookie_a, *cookie_b); } static int bpf_kprobe_multi_addrs_cmp(const void *a, const void *b) { const unsigned long *addr_a = a, *addr_b = b; if (*addr_a == *addr_b) return 0; return *addr_a < *addr_b ? -1 : 1; } static int bpf_kprobe_multi_cookie_cmp(const void *a, const void *b, const void *priv) { return bpf_kprobe_multi_addrs_cmp(a, b); } static u64 bpf_kprobe_multi_cookie(struct bpf_run_ctx *ctx) { struct bpf_kprobe_multi_run_ctx *run_ctx; struct bpf_kprobe_multi_link *link; u64 *cookie, entry_ip; unsigned long *addr; if (WARN_ON_ONCE(!ctx)) return 0; run_ctx = container_of(current->bpf_ctx, struct bpf_kprobe_multi_run_ctx, session_ctx.run_ctx); link = run_ctx->link; if (!link->cookies) return 0; entry_ip = run_ctx->entry_ip; addr = bsearch(&entry_ip, link->addrs, link->cnt, sizeof(entry_ip), bpf_kprobe_multi_addrs_cmp); if (!addr) return 0; cookie = link->cookies + (addr - link->addrs); return *cookie; } static u64 bpf_kprobe_multi_entry_ip(struct bpf_run_ctx *ctx) { struct bpf_kprobe_multi_run_ctx *run_ctx; run_ctx = container_of(current->bpf_ctx, struct bpf_kprobe_multi_run_ctx, session_ctx.run_ctx); return run_ctx->entry_ip; } static __always_inline int kprobe_multi_link_prog_run(struct bpf_kprobe_multi_link *link, unsigned long entry_ip, struct ftrace_regs *fregs, bool is_return, void *data) { struct bpf_kprobe_multi_run_ctx run_ctx = { .session_ctx = { .is_return = is_return, .data = data, }, .link = link, .entry_ip = entry_ip, }; struct bpf_run_ctx *old_run_ctx; struct pt_regs *regs; int err; /* * graph tracer framework ensures we won't migrate, so there is no need * to use migrate_disable for bpf_prog_run again. The check here just for * __this_cpu_inc_return. */ cant_sleep(); if (unlikely(__this_cpu_inc_return(bpf_prog_active) != 1)) { bpf_prog_inc_misses_counter(link->link.prog); err = 1; goto out; } rcu_read_lock(); regs = ftrace_partial_regs(fregs, bpf_kprobe_multi_pt_regs_ptr()); old_run_ctx = bpf_set_run_ctx(&run_ctx.session_ctx.run_ctx); err = bpf_prog_run(link->link.prog, regs); bpf_reset_run_ctx(old_run_ctx); rcu_read_unlock(); out: __this_cpu_dec(bpf_prog_active); return err; } static int kprobe_multi_link_handler(struct fprobe *fp, unsigned long fentry_ip, unsigned long ret_ip, struct ftrace_regs *fregs, void *data) { struct bpf_kprobe_multi_link *link; int err; link = container_of(fp, struct bpf_kprobe_multi_link, fp); err = kprobe_multi_link_prog_run(link, ftrace_get_entry_ip(fentry_ip), fregs, false, data); return is_kprobe_session(link->link.prog) ? 
err : 0; } static void kprobe_multi_link_exit_handler(struct fprobe *fp, unsigned long fentry_ip, unsigned long ret_ip, struct ftrace_regs *fregs, void *data) { struct bpf_kprobe_multi_link *link; link = container_of(fp, struct bpf_kprobe_multi_link, fp); kprobe_multi_link_prog_run(link, ftrace_get_entry_ip(fentry_ip), fregs, true, data); } static int symbols_cmp_r(const void *a, const void *b, const void *priv) { const char **str_a = (const char **) a; const char **str_b = (const char **) b; return strcmp(*str_a, *str_b); } struct multi_symbols_sort { const char **funcs; u64 *cookies; }; static void symbols_swap_r(void *a, void *b, int size, const void *priv) { const struct multi_symbols_sort *data = priv; const char **name_a = a, **name_b = b; swap(*name_a, *name_b); /* If defined, swap also related cookies. */ if (data->cookies) { u64 *cookie_a, *cookie_b; cookie_a = data->cookies + (name_a - data->funcs); cookie_b = data->cookies + (name_b - data->funcs); swap(*cookie_a, *cookie_b); } } struct modules_array { struct module **mods; int mods_cnt; int mods_cap; }; static int add_module(struct modules_array *arr, struct module *mod) { struct module **mods; if (arr->mods_cnt == arr->mods_cap) { arr->mods_cap = max(16, arr->mods_cap * 3 / 2); mods = krealloc_array(arr->mods, arr->mods_cap, sizeof(*mods), GFP_KERNEL); if (!mods) return -ENOMEM; arr->mods = mods; } arr->mods[arr->mods_cnt] = mod; arr->mods_cnt++; return 0; } static bool has_module(struct modules_array *arr, struct module *mod) { int i; for (i = arr->mods_cnt - 1; i >= 0; i--) { if (arr->mods[i] == mod) return true; } return false; } static int get_modules_for_addrs(struct module ***mods, unsigned long *addrs, u32 addrs_cnt) { struct modules_array arr = {}; u32 i, err = 0; for (i = 0; i < addrs_cnt; i++) { bool skip_add = false; struct module *mod; scoped_guard(rcu) { mod = __module_address(addrs[i]); /* Either no module or it's already stored */ if (!mod || has_module(&arr, mod)) { skip_add = true; break; /* scoped_guard */ } if (!try_module_get(mod)) err = -EINVAL; } if (skip_add) continue; if (err) break; err = add_module(&arr, mod); if (err) { module_put(mod); break; } } /* We return either err < 0 in case of error, ... */ if (err) { kprobe_multi_put_modules(arr.mods, arr.mods_cnt); kfree(arr.mods); return err; } /* or number of modules found if everything is ok. */ *mods = arr.mods; return arr.mods_cnt; } static int addrs_check_error_injection_list(unsigned long *addrs, u32 cnt) { u32 i; for (i = 0; i < cnt; i++) { if (!within_error_injection_list(addrs[i])) return -EINVAL; } return 0; } int bpf_kprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) { struct bpf_kprobe_multi_link *link = NULL; struct bpf_link_primer link_primer; void __user *ucookies; unsigned long *addrs; u32 flags, cnt, size; void __user *uaddrs; u64 *cookies = NULL; void __user *usyms; int err; /* no support for 32bit archs yet */ if (sizeof(u64) != sizeof(void *)) return -EOPNOTSUPP; if (attr->link_create.flags) return -EINVAL; if (!is_kprobe_multi(prog)) return -EINVAL; /* Writing to context is not allowed for kprobes. 
*/ if (prog->aux->kprobe_write_ctx) return -EINVAL; flags = attr->link_create.kprobe_multi.flags; if (flags & ~BPF_F_KPROBE_MULTI_RETURN) return -EINVAL; uaddrs = u64_to_user_ptr(attr->link_create.kprobe_multi.addrs); usyms = u64_to_user_ptr(attr->link_create.kprobe_multi.syms); if (!!uaddrs == !!usyms) return -EINVAL; cnt = attr->link_create.kprobe_multi.cnt; if (!cnt) return -EINVAL; if (cnt > MAX_KPROBE_MULTI_CNT) return -E2BIG; size = cnt * sizeof(*addrs); addrs = kvmalloc_array(cnt, sizeof(*addrs), GFP_KERNEL); if (!addrs) return -ENOMEM; ucookies = u64_to_user_ptr(attr->link_create.kprobe_multi.cookies); if (ucookies) { cookies = kvmalloc_array(cnt, sizeof(*addrs), GFP_KERNEL); if (!cookies) { err = -ENOMEM; goto error; } if (copy_from_user(cookies, ucookies, size)) { err = -EFAULT; goto error; } } if (uaddrs) { if (copy_from_user(addrs, uaddrs, size)) { err = -EFAULT; goto error; } } else { struct multi_symbols_sort data = { .cookies = cookies, }; struct user_syms us; err = copy_user_syms(&us, usyms, cnt); if (err) goto error; if (cookies) data.funcs = us.syms; sort_r(us.syms, cnt, sizeof(*us.syms), symbols_cmp_r, symbols_swap_r, &data); err = ftrace_lookup_symbols(us.syms, cnt, addrs); free_user_syms(&us); if (err) goto error; } if (prog->kprobe_override && addrs_check_error_injection_list(addrs, cnt)) { err = -EINVAL; goto error; } link = kzalloc(sizeof(*link), GFP_KERNEL); if (!link) { err = -ENOMEM; goto error; } bpf_link_init(&link->link, BPF_LINK_TYPE_KPROBE_MULTI, &bpf_kprobe_multi_link_lops, prog, attr->link_create.attach_type); err = bpf_link_prime(&link->link, &link_primer); if (err) goto error; if (!(flags & BPF_F_KPROBE_MULTI_RETURN)) link->fp.entry_handler = kprobe_multi_link_handler; if ((flags & BPF_F_KPROBE_MULTI_RETURN) || is_kprobe_session(prog)) link->fp.exit_handler = kprobe_multi_link_exit_handler; if (is_kprobe_session(prog)) link->fp.entry_data_size = sizeof(u64); link->addrs = addrs; link->cookies = cookies; link->cnt = cnt; link->link.flags = flags; if (cookies) { /* * Sorting addresses will trigger sorting cookies as well * (check bpf_kprobe_multi_cookie_swap). This way we can * find cookie based on the address in bpf_get_attach_cookie * helper. 
*/ sort_r(addrs, cnt, sizeof(*addrs), bpf_kprobe_multi_cookie_cmp, bpf_kprobe_multi_cookie_swap, link); } err = get_modules_for_addrs(&link->mods, addrs, cnt); if (err < 0) { bpf_link_cleanup(&link_primer); return err; } link->mods_cnt = err; err = register_fprobe_ips(&link->fp, addrs, cnt); if (err) { kprobe_multi_put_modules(link->mods, link->mods_cnt); bpf_link_cleanup(&link_primer); return err; } return bpf_link_settle(&link_primer); error: kfree(link); kvfree(addrs); kvfree(cookies); return err; } #else /* !CONFIG_FPROBE */ int bpf_kprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) { return -EOPNOTSUPP; } static u64 bpf_kprobe_multi_cookie(struct bpf_run_ctx *ctx) { return 0; } static u64 bpf_kprobe_multi_entry_ip(struct bpf_run_ctx *ctx) { return 0; } #endif #ifdef CONFIG_UPROBES struct bpf_uprobe_multi_link; struct bpf_uprobe { struct bpf_uprobe_multi_link *link; loff_t offset; unsigned long ref_ctr_offset; u64 cookie; struct uprobe *uprobe; struct uprobe_consumer consumer; bool session; }; struct bpf_uprobe_multi_link { struct path path; struct bpf_link link; u32 cnt; struct bpf_uprobe *uprobes; struct task_struct *task; }; struct bpf_uprobe_multi_run_ctx { struct bpf_session_run_ctx session_ctx; unsigned long entry_ip; struct bpf_uprobe *uprobe; }; static void bpf_uprobe_unregister(struct bpf_uprobe *uprobes, u32 cnt) { u32 i; for (i = 0; i < cnt; i++) uprobe_unregister_nosync(uprobes[i].uprobe, &uprobes[i].consumer); if (cnt) uprobe_unregister_sync(); } static void bpf_uprobe_multi_link_release(struct bpf_link *link) { struct bpf_uprobe_multi_link *umulti_link; umulti_link = container_of(link, struct bpf_uprobe_multi_link, link); bpf_uprobe_unregister(umulti_link->uprobes, umulti_link->cnt); if (umulti_link->task) put_task_struct(umulti_link->task); path_put(&umulti_link->path); } static void bpf_uprobe_multi_link_dealloc(struct bpf_link *link) { struct bpf_uprobe_multi_link *umulti_link; umulti_link = container_of(link, struct bpf_uprobe_multi_link, link); kvfree(umulti_link->uprobes); kfree(umulti_link); } static int bpf_uprobe_multi_link_fill_link_info(const struct bpf_link *link, struct bpf_link_info *info) { u64 __user *uref_ctr_offsets = u64_to_user_ptr(info->uprobe_multi.ref_ctr_offsets); u64 __user *ucookies = u64_to_user_ptr(info->uprobe_multi.cookies); u64 __user *uoffsets = u64_to_user_ptr(info->uprobe_multi.offsets); u64 __user *upath = u64_to_user_ptr(info->uprobe_multi.path); u32 upath_size = info->uprobe_multi.path_size; struct bpf_uprobe_multi_link *umulti_link; u32 ucount = info->uprobe_multi.count; int err = 0, i; char *p, *buf; long left = 0; if (!upath ^ !upath_size) return -EINVAL; if ((uoffsets || uref_ctr_offsets || ucookies) && !ucount) return -EINVAL; umulti_link = container_of(link, struct bpf_uprobe_multi_link, link); info->uprobe_multi.count = umulti_link->cnt; info->uprobe_multi.flags = umulti_link->link.flags; info->uprobe_multi.pid = umulti_link->task ? task_pid_nr_ns(umulti_link->task, task_active_pid_ns(current)) : 0; upath_size = upath_size ? 
min_t(u32, upath_size, PATH_MAX) : PATH_MAX; buf = kmalloc(upath_size, GFP_KERNEL); if (!buf) return -ENOMEM; p = d_path(&umulti_link->path, buf, upath_size); if (IS_ERR(p)) { kfree(buf); return PTR_ERR(p); } upath_size = buf + upath_size - p; if (upath) left = copy_to_user(upath, p, upath_size); kfree(buf); if (left) return -EFAULT; info->uprobe_multi.path_size = upath_size; if (!uoffsets && !ucookies && !uref_ctr_offsets) return 0; if (ucount < umulti_link->cnt) err = -ENOSPC; else ucount = umulti_link->cnt; for (i = 0; i < ucount; i++) { if (uoffsets && put_user(umulti_link->uprobes[i].offset, uoffsets + i)) return -EFAULT; if (uref_ctr_offsets && put_user(umulti_link->uprobes[i].ref_ctr_offset, uref_ctr_offsets + i)) return -EFAULT; if (ucookies && put_user(umulti_link->uprobes[i].cookie, ucookies + i)) return -EFAULT; } return err; } #ifdef CONFIG_PROC_FS static void bpf_uprobe_multi_show_fdinfo(const struct bpf_link *link, struct seq_file *seq) { struct bpf_uprobe_multi_link *umulti_link; char *p, *buf; pid_t pid; umulti_link = container_of(link, struct bpf_uprobe_multi_link, link); buf = kmalloc(PATH_MAX, GFP_KERNEL); if (!buf) return; p = d_path(&umulti_link->path, buf, PATH_MAX); if (IS_ERR(p)) { kfree(buf); return; } pid = umulti_link->task ? task_pid_nr_ns(umulti_link->task, task_active_pid_ns(current)) : 0; seq_printf(seq, "uprobe_cnt:\t%u\n" "pid:\t%u\n" "path:\t%s\n", umulti_link->cnt, pid, p); seq_printf(seq, "%s\t %s\t %s\n", "cookie", "offset", "ref_ctr_offset"); for (int i = 0; i < umulti_link->cnt; i++) { seq_printf(seq, "%llu\t %#llx\t %#lx\n", umulti_link->uprobes[i].cookie, umulti_link->uprobes[i].offset, umulti_link->uprobes[i].ref_ctr_offset); } kfree(buf); } #endif static const struct bpf_link_ops bpf_uprobe_multi_link_lops = { .release = bpf_uprobe_multi_link_release, .dealloc_deferred = bpf_uprobe_multi_link_dealloc, .fill_link_info = bpf_uprobe_multi_link_fill_link_info, #ifdef CONFIG_PROC_FS .show_fdinfo = bpf_uprobe_multi_show_fdinfo, #endif }; static int uprobe_prog_run(struct bpf_uprobe *uprobe, unsigned long entry_ip, struct pt_regs *regs, bool is_return, void *data) { struct bpf_uprobe_multi_link *link = uprobe->link; struct bpf_uprobe_multi_run_ctx run_ctx = { .session_ctx = { .is_return = is_return, .data = data, }, .entry_ip = entry_ip, .uprobe = uprobe, }; struct bpf_prog *prog = link->link.prog; bool sleepable = prog->sleepable; struct bpf_run_ctx *old_run_ctx; int err; if (link->task && !same_thread_group(current, link->task)) return 0; if (sleepable) rcu_read_lock_trace(); else rcu_read_lock(); migrate_disable(); old_run_ctx = bpf_set_run_ctx(&run_ctx.session_ctx.run_ctx); err = bpf_prog_run(link->link.prog, regs); bpf_reset_run_ctx(old_run_ctx); migrate_enable(); if (sleepable) rcu_read_unlock_trace(); else rcu_read_unlock(); return err; } static bool uprobe_multi_link_filter(struct uprobe_consumer *con, struct mm_struct *mm) { struct bpf_uprobe *uprobe; uprobe = container_of(con, struct bpf_uprobe, consumer); return uprobe->link->task->mm == mm; } static int uprobe_multi_link_handler(struct uprobe_consumer *con, struct pt_regs *regs, __u64 *data) { struct bpf_uprobe *uprobe; int ret; uprobe = container_of(con, struct bpf_uprobe, consumer); ret = uprobe_prog_run(uprobe, instruction_pointer(regs), regs, false, data); if (uprobe->session) return ret ? 
UPROBE_HANDLER_IGNORE : 0; return 0; } static int uprobe_multi_link_ret_handler(struct uprobe_consumer *con, unsigned long func, struct pt_regs *regs, __u64 *data) { struct bpf_uprobe *uprobe; uprobe = container_of(con, struct bpf_uprobe, consumer); uprobe_prog_run(uprobe, func, regs, true, data); return 0; } static u64 bpf_uprobe_multi_entry_ip(struct bpf_run_ctx *ctx) { struct bpf_uprobe_multi_run_ctx *run_ctx; run_ctx = container_of(current->bpf_ctx, struct bpf_uprobe_multi_run_ctx, session_ctx.run_ctx); return run_ctx->entry_ip; } static u64 bpf_uprobe_multi_cookie(struct bpf_run_ctx *ctx) { struct bpf_uprobe_multi_run_ctx *run_ctx; run_ctx = container_of(current->bpf_ctx, struct bpf_uprobe_multi_run_ctx, session_ctx.run_ctx); return run_ctx->uprobe->cookie; } int bpf_uprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) { struct bpf_uprobe_multi_link *link = NULL; unsigned long __user *uref_ctr_offsets; struct bpf_link_primer link_primer; struct bpf_uprobe *uprobes = NULL; struct task_struct *task = NULL; unsigned long __user *uoffsets; u64 __user *ucookies; void __user *upath; u32 flags, cnt, i; struct path path; char *name; pid_t pid; int err; /* no support for 32bit archs yet */ if (sizeof(u64) != sizeof(void *)) return -EOPNOTSUPP; if (attr->link_create.flags) return -EINVAL; if (!is_uprobe_multi(prog)) return -EINVAL; flags = attr->link_create.uprobe_multi.flags; if (flags & ~BPF_F_UPROBE_MULTI_RETURN) return -EINVAL; /* * path, offsets and cnt are mandatory, * ref_ctr_offsets and cookies are optional */ upath = u64_to_user_ptr(attr->link_create.uprobe_multi.path); uoffsets = u64_to_user_ptr(attr->link_create.uprobe_multi.offsets); cnt = attr->link_create.uprobe_multi.cnt; pid = attr->link_create.uprobe_multi.pid; if (!upath || !uoffsets || !cnt || pid < 0) return -EINVAL; if (cnt > MAX_UPROBE_MULTI_CNT) return -E2BIG; uref_ctr_offsets = u64_to_user_ptr(attr->link_create.uprobe_multi.ref_ctr_offsets); ucookies = u64_to_user_ptr(attr->link_create.uprobe_multi.cookies); name = strndup_user(upath, PATH_MAX); if (IS_ERR(name)) { err = PTR_ERR(name); return err; } err = kern_path(name, LOOKUP_FOLLOW, &path); kfree(name); if (err) return err; if (!d_is_reg(path.dentry)) { err = -EBADF; goto error_path_put; } if (pid) { rcu_read_lock(); task = get_pid_task(find_vpid(pid), PIDTYPE_TGID); rcu_read_unlock(); if (!task) { err = -ESRCH; goto error_path_put; } } err = -ENOMEM; link = kzalloc(sizeof(*link), GFP_KERNEL); uprobes = kvcalloc(cnt, sizeof(*uprobes), GFP_KERNEL); if (!uprobes || !link) goto error_free; for (i = 0; i < cnt; i++) { if (__get_user(uprobes[i].offset, uoffsets + i)) { err = -EFAULT; goto error_free; } if (uprobes[i].offset < 0) { err = -EINVAL; goto error_free; } if (uref_ctr_offsets && __get_user(uprobes[i].ref_ctr_offset, uref_ctr_offsets + i)) { err = -EFAULT; goto error_free; } if (ucookies && __get_user(uprobes[i].cookie, ucookies + i)) { err = -EFAULT; goto error_free; } uprobes[i].link = link; if (!(flags & BPF_F_UPROBE_MULTI_RETURN)) uprobes[i].consumer.handler = uprobe_multi_link_handler; if (flags & BPF_F_UPROBE_MULTI_RETURN || is_uprobe_session(prog)) uprobes[i].consumer.ret_handler = uprobe_multi_link_ret_handler; if (is_uprobe_session(prog)) uprobes[i].session = true; if (pid) uprobes[i].consumer.filter = uprobe_multi_link_filter; } link->cnt = cnt; link->uprobes = uprobes; link->path = path; link->task = task; link->link.flags = flags; bpf_link_init(&link->link, BPF_LINK_TYPE_UPROBE_MULTI, &bpf_uprobe_multi_link_lops, prog, 
attr->link_create.attach_type); for (i = 0; i < cnt; i++) { uprobes[i].uprobe = uprobe_register(d_real_inode(link->path.dentry), uprobes[i].offset, uprobes[i].ref_ctr_offset, &uprobes[i].consumer); if (IS_ERR(uprobes[i].uprobe)) { err = PTR_ERR(uprobes[i].uprobe); link->cnt = i; goto error_unregister; } } err = bpf_link_prime(&link->link, &link_primer); if (err) goto error_unregister; return bpf_link_settle(&link_primer); error_unregister: bpf_uprobe_unregister(uprobes, link->cnt); error_free: kvfree(uprobes); kfree(link); if (task) put_task_struct(task); error_path_put: path_put(&path); return err; } #else /* !CONFIG_UPROBES */ int bpf_uprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) { return -EOPNOTSUPP; } static u64 bpf_uprobe_multi_cookie(struct bpf_run_ctx *ctx) { return 0; } static u64 bpf_uprobe_multi_entry_ip(struct bpf_run_ctx *ctx) { return 0; } #endif /* CONFIG_UPROBES */ __bpf_kfunc_start_defs(); __bpf_kfunc bool bpf_session_is_return(void) { struct bpf_session_run_ctx *session_ctx; session_ctx = container_of(current->bpf_ctx, struct bpf_session_run_ctx, run_ctx); return session_ctx->is_return; } __bpf_kfunc __u64 *bpf_session_cookie(void) { struct bpf_session_run_ctx *session_ctx; session_ctx = container_of(current->bpf_ctx, struct bpf_session_run_ctx, run_ctx); return session_ctx->data; } __bpf_kfunc_end_defs(); BTF_KFUNCS_START(kprobe_multi_kfunc_set_ids) BTF_ID_FLAGS(func, bpf_session_is_return) BTF_ID_FLAGS(func, bpf_session_cookie) BTF_KFUNCS_END(kprobe_multi_kfunc_set_ids) static int bpf_kprobe_multi_filter(const struct bpf_prog *prog, u32 kfunc_id) { if (!btf_id_set8_contains(&kprobe_multi_kfunc_set_ids, kfunc_id)) return 0; if (!is_kprobe_session(prog) && !is_uprobe_session(prog)) return -EACCES; return 0; } static const struct btf_kfunc_id_set bpf_kprobe_multi_kfunc_set = { .owner = THIS_MODULE, .set = &kprobe_multi_kfunc_set_ids, .filter = bpf_kprobe_multi_filter, }; static int __init bpf_kprobe_multi_kfuncs_init(void) { return register_btf_kfunc_id_set(BPF_PROG_TYPE_KPROBE, &bpf_kprobe_multi_kfunc_set); } late_initcall(bpf_kprobe_multi_kfuncs_init); typedef int (*copy_fn_t)(void *dst, const void *src, u32 size, struct task_struct *tsk); /* * The __always_inline is to make sure the compiler doesn't * generate indirect calls into callbacks, which is expensive, * on some kernel configurations. This allows compiler to put * direct calls into all the specific callback implementations * (copy_user_data_sleepable, copy_user_data_nofault, and so on) */ static __always_inline int __bpf_dynptr_copy_str(struct bpf_dynptr *dptr, u64 doff, u64 size, const void *unsafe_src, copy_fn_t str_copy_fn, struct task_struct *tsk) { struct bpf_dynptr_kern *dst; u64 chunk_sz, off; void *dst_slice; int cnt, err; char buf[256]; dst_slice = bpf_dynptr_slice_rdwr(dptr, doff, NULL, size); if (likely(dst_slice)) return str_copy_fn(dst_slice, unsafe_src, size, tsk); dst = (struct bpf_dynptr_kern *)dptr; if (bpf_dynptr_check_off_len(dst, doff, size)) return -E2BIG; for (off = 0; off < size; off += chunk_sz - 1) { chunk_sz = min_t(u64, sizeof(buf), size - off); /* Expect str_copy_fn to return count of copied bytes, including * zero terminator. Next iteration increment off by chunk_sz - 1 to * overwrite NUL. 
*/ cnt = str_copy_fn(buf, unsafe_src + off, chunk_sz, tsk); if (cnt < 0) return cnt; err = __bpf_dynptr_write(dst, doff + off, buf, cnt, 0); if (err) return err; if (cnt < chunk_sz || chunk_sz == 1) /* we are done */ return off + cnt; } return off; } static __always_inline int __bpf_dynptr_copy(const struct bpf_dynptr *dptr, u64 doff, u64 size, const void *unsafe_src, copy_fn_t copy_fn, struct task_struct *tsk) { struct bpf_dynptr_kern *dst; void *dst_slice; char buf[256]; u64 off, chunk_sz; int err; dst_slice = bpf_dynptr_slice_rdwr(dptr, doff, NULL, size); if (likely(dst_slice)) return copy_fn(dst_slice, unsafe_src, size, tsk); dst = (struct bpf_dynptr_kern *)dptr; if (bpf_dynptr_check_off_len(dst, doff, size)) return -E2BIG; for (off = 0; off < size; off += chunk_sz) { chunk_sz = min_t(u64, sizeof(buf), size - off); err = copy_fn(buf, unsafe_src + off, chunk_sz, tsk); if (err) return err; err = __bpf_dynptr_write(dst, doff + off, buf, chunk_sz, 0); if (err) return err; } return 0; } static __always_inline int copy_user_data_nofault(void *dst, const void *unsafe_src, u32 size, struct task_struct *tsk) { return copy_from_user_nofault(dst, (const void __user *)unsafe_src, size); } static __always_inline int copy_user_data_sleepable(void *dst, const void *unsafe_src, u32 size, struct task_struct *tsk) { int ret; if (!tsk) { /* Read from the current task */ ret = copy_from_user(dst, (const void __user *)unsafe_src, size); if (ret) return -EFAULT; return 0; } ret = access_process_vm(tsk, (unsigned long)unsafe_src, dst, size, 0); if (ret != size) return -EFAULT; return 0; } static __always_inline int copy_kernel_data_nofault(void *dst, const void *unsafe_src, u32 size, struct task_struct *tsk) { return copy_from_kernel_nofault(dst, unsafe_src, size); } static __always_inline int copy_user_str_nofault(void *dst, const void *unsafe_src, u32 size, struct task_struct *tsk) { return strncpy_from_user_nofault(dst, (const void __user *)unsafe_src, size); } static __always_inline int copy_user_str_sleepable(void *dst, const void *unsafe_src, u32 size, struct task_struct *tsk) { int ret; if (unlikely(size == 0)) return 0; if (tsk) { ret = copy_remote_vm_str(tsk, (unsigned long)unsafe_src, dst, size, 0); } else { ret = strncpy_from_user(dst, (const void __user *)unsafe_src, size - 1); /* strncpy_from_user does not guarantee NUL termination */ if (ret >= 0) ((char *)dst)[ret] = '\0'; } if (ret < 0) return ret; return ret + 1; } static __always_inline int copy_kernel_str_nofault(void *dst, const void *unsafe_src, u32 size, struct task_struct *tsk) { return strncpy_from_kernel_nofault(dst, unsafe_src, size); } __bpf_kfunc_start_defs(); __bpf_kfunc int bpf_send_signal_task(struct task_struct *task, int sig, enum pid_type type, u64 value) { if (type != PIDTYPE_PID && type != PIDTYPE_TGID) return -EINVAL; return bpf_send_signal_common(sig, type, task, value); } __bpf_kfunc int bpf_probe_read_user_dynptr(struct bpf_dynptr *dptr, u64 off, u64 size, const void __user *unsafe_ptr__ign) { return __bpf_dynptr_copy(dptr, off, size, (const void *)unsafe_ptr__ign, copy_user_data_nofault, NULL); } __bpf_kfunc int bpf_probe_read_kernel_dynptr(struct bpf_dynptr *dptr, u64 off, u64 size, const void *unsafe_ptr__ign) { return __bpf_dynptr_copy(dptr, off, size, unsafe_ptr__ign, copy_kernel_data_nofault, NULL); } __bpf_kfunc int bpf_probe_read_user_str_dynptr(struct bpf_dynptr *dptr, u64 off, u64 size, const void __user *unsafe_ptr__ign) { return __bpf_dynptr_copy_str(dptr, off, size, (const void *)unsafe_ptr__ign, 
copy_user_str_nofault, NULL); } __bpf_kfunc int bpf_probe_read_kernel_str_dynptr(struct bpf_dynptr *dptr, u64 off, u64 size, const void *unsafe_ptr__ign) { return __bpf_dynptr_copy_str(dptr, off, size, unsafe_ptr__ign, copy_kernel_str_nofault, NULL); } __bpf_kfunc int bpf_copy_from_user_dynptr(struct bpf_dynptr *dptr, u64 off, u64 size, const void __user *unsafe_ptr__ign) { return __bpf_dynptr_copy(dptr, off, size, (const void *)unsafe_ptr__ign, copy_user_data_sleepable, NULL); } __bpf_kfunc int bpf_copy_from_user_str_dynptr(struct bpf_dynptr *dptr, u64 off, u64 size, const void __user *unsafe_ptr__ign) { return __bpf_dynptr_copy_str(dptr, off, size, (const void *)unsafe_ptr__ign, copy_user_str_sleepable, NULL); } __bpf_kfunc int bpf_copy_from_user_task_dynptr(struct bpf_dynptr *dptr, u64 off, u64 size, const void __user *unsafe_ptr__ign, struct task_struct *tsk) { return __bpf_dynptr_copy(dptr, off, size, (const void *)unsafe_ptr__ign, copy_user_data_sleepable, tsk); } __bpf_kfunc int bpf_copy_from_user_task_str_dynptr(struct bpf_dynptr *dptr, u64 off, u64 size, const void __user *unsafe_ptr__ign, struct task_struct *tsk) { return __bpf_dynptr_copy_str(dptr, off, size, (const void *)unsafe_ptr__ign, copy_user_str_sleepable, tsk); } __bpf_kfunc_end_defs();
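/*
 * Illustrative sketch added for this document (userspace, not kernel code):
 * a minimal model of the chunked string-copy loop used by
 * __bpf_dynptr_copy_str() above.  The real helper copies through a bounded
 * on-stack buffer because the destination dynptr may not expose a linear
 * slice; each chunk ends with the NUL written by the string-copy primitive,
 * and the next iteration advances by chunk_sz - 1 so that NUL is overwritten
 * by the first byte of the following chunk.  The "copy returns bytes
 * including the terminator" contract mirrors the code above; the function
 * names and the tiny buffer size are stand-ins chosen for illustration.
 */
#include <stdio.h>
#include <string.h>

/* Stand-in for str_copy_fn(): copy at most size bytes including the NUL,
 * return the number of bytes written (terminator included). */
static int copy_str_chunk(char *dst, const char *src, size_t size)
{
	size_t n = strnlen(src, size - 1);

	memcpy(dst, src, n);
	dst[n] = '\0';
	return (int)(n + 1);
}

/* Stand-in for the dynptr write path: dst is a plain buffer here. */
static long copy_str_chunked(char *dst, size_t dst_size,
			     const char *src, size_t size)
{
	char buf[8];		/* deliberately tiny to force several chunks */
	size_t off, chunk_sz;
	int cnt;

	for (off = 0; off < size; off += chunk_sz - 1) {
		chunk_sz = sizeof(buf) < size - off ? sizeof(buf) : size - off;
		cnt = copy_str_chunk(buf, src + off, chunk_sz);
		if (off + (size_t)cnt > dst_size)
			return -1;
		memcpy(dst + off, buf, cnt);
		if ((size_t)cnt < chunk_sz || chunk_sz == 1)
			return off + cnt;	/* hit the real terminator */
	}
	return off;
}

int main(void)
{
	char out[64];
	long n = copy_str_chunked(out, sizeof(out),
				  "chunked copy through a small buffer", 64);

	printf("%ld bytes: \"%s\"\n", n, out);
	return 0;
}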
// SPDX-License-Identifier: GPL-2.0
/*
 * drivers/base/power/common.c - Common device power management code.
 *
 * Copyright (C) 2011 Rafael J. Wysocki <rjw@sisk.pl>, Renesas Electronics Corp.
 */
#include <linux/kernel.h>
#include <linux/device.h>
#include <linux/export.h>
#include <linux/slab.h>
#include <linux/pm_clock.h>
#include <linux/acpi.h>
#include <linux/pm_domain.h>
#include <linux/pm_opp.h>
#include "power.h"

/**
 * dev_pm_get_subsys_data - Create or refcount power.subsys_data for device.
 * @dev: Device to handle.
 *
 * If power.subsys_data is NULL, point it to a new object, otherwise increment
 * its reference counter. Return 0 if new object has been created or refcount
 * increased, otherwise negative error code.
 */
int dev_pm_get_subsys_data(struct device *dev)
{
	struct pm_subsys_data *psd;

	psd = kzalloc(sizeof(*psd), GFP_KERNEL);
	if (!psd)
		return -ENOMEM;

	spin_lock_irq(&dev->power.lock);

	if (dev->power.subsys_data) {
		dev->power.subsys_data->refcount++;
	} else {
		spin_lock_init(&psd->lock);
		psd->refcount = 1;
		dev->power.subsys_data = psd;
		pm_clk_init(dev);
		psd = NULL;
	}

	spin_unlock_irq(&dev->power.lock);

	/* kfree() verifies that its argument is nonzero. */
	kfree(psd);

	return 0;
}
EXPORT_SYMBOL_GPL(dev_pm_get_subsys_data);

/**
 * dev_pm_put_subsys_data - Drop reference to power.subsys_data.
 * @dev: Device to handle.
 *
 * If the reference counter of power.subsys_data is zero after dropping the
 * reference, power.subsys_data is removed.
*/ void dev_pm_put_subsys_data(struct device *dev) { struct pm_subsys_data *psd; spin_lock_irq(&dev->power.lock); psd = dev_to_psd(dev); if (!psd) goto out; if (--psd->refcount == 0) dev->power.subsys_data = NULL; else psd = NULL; out: spin_unlock_irq(&dev->power.lock); kfree(psd); } EXPORT_SYMBOL_GPL(dev_pm_put_subsys_data); /** * dev_pm_domain_attach - Attach a device to its PM domain. * @dev: Device to attach. * @flags: indicate whether we should power on/off the device on attach/detach * * The @dev may only be attached to a single PM domain. By iterating through * the available alternatives we try to find a valid PM domain for the device. * As attachment succeeds, the ->detach() callback in the struct dev_pm_domain * should be assigned by the corresponding attach function. * * This function should typically be invoked from subsystem level code during * the probe phase. Especially for those that holds devices which requires * power management through PM domains. * * Callers must ensure proper synchronization of this function with power * management callbacks. * * Returns 0 on successfully attached PM domain, or when it is found that the * device doesn't need a PM domain, else a negative error code. */ int dev_pm_domain_attach(struct device *dev, u32 flags) { int ret; if (dev->pm_domain) return 0; ret = acpi_dev_pm_attach(dev, !!(flags & PD_FLAG_ATTACH_POWER_ON)); if (!ret) ret = genpd_dev_pm_attach(dev); if (dev->pm_domain) dev->power.detach_power_off = !!(flags & PD_FLAG_DETACH_POWER_OFF); return ret < 0 ? ret : 0; } EXPORT_SYMBOL_GPL(dev_pm_domain_attach); /** * dev_pm_domain_attach_by_id - Associate a device with one of its PM domains. * @dev: The device used to lookup the PM domain. * @index: The index of the PM domain. * * As @dev may only be attached to a single PM domain, the backend PM domain * provider creates a virtual device to attach instead. If attachment succeeds, * the ->detach() callback in the struct dev_pm_domain are assigned by the * corresponding backend attach function, as to deal with detaching of the * created virtual device. * * This function should typically be invoked by a driver during the probe phase, * in case its device requires power management through multiple PM domains. The * driver may benefit from using the received device, to configure device-links * towards its original device. Depending on the use-case and if needed, the * links may be dynamically changed by the driver, which allows it to control * the power to the PM domains independently from each other. * * Callers must ensure proper synchronization of this function with power * management callbacks. * * Returns the virtual created device when successfully attached to its PM * domain, NULL in case @dev don't need a PM domain, else an ERR_PTR(). * Note that, to detach the returned virtual device, the driver shall call * dev_pm_domain_detach() on it, typically during the remove phase. */ struct device *dev_pm_domain_attach_by_id(struct device *dev, unsigned int index) { if (dev->pm_domain) return ERR_PTR(-EEXIST); return genpd_dev_pm_attach_by_id(dev, index); } EXPORT_SYMBOL_GPL(dev_pm_domain_attach_by_id); /** * dev_pm_domain_attach_by_name - Associate a device with one of its PM domains. * @dev: The device used to lookup the PM domain. * @name: The name of the PM domain. * * For a detailed function description, see dev_pm_domain_attach_by_id(). 
*/ struct device *dev_pm_domain_attach_by_name(struct device *dev, const char *name) { if (dev->pm_domain) return ERR_PTR(-EEXIST); return genpd_dev_pm_attach_by_name(dev, name); } EXPORT_SYMBOL_GPL(dev_pm_domain_attach_by_name); /** * dev_pm_domain_attach_list - Associate a device with its PM domains. * @dev: The device used to lookup the PM domains for. * @data: The data used for attaching to the PM domains. * @list: An out-parameter with an allocated list of attached PM domains. * * This function helps to attach a device to its multiple PM domains. The * caller, which is typically a driver's probe function, may provide a list of * names for the PM domains that we should try to attach the device to, but it * may also provide an empty list, in case the attach should be done for all of * the available PM domains. * * Callers must ensure proper synchronization of this function with power * management callbacks. * * Returns the number of attached PM domains or a negative error code in case of * a failure. Note that, to detach the list of PM domains, the driver shall call * dev_pm_domain_detach_list(), typically during the remove phase. */ int dev_pm_domain_attach_list(struct device *dev, const struct dev_pm_domain_attach_data *data, struct dev_pm_domain_list **list) { struct device_node *np = dev->of_node; struct dev_pm_domain_list *pds; struct device *pd_dev = NULL; int ret, i, num_pds = 0; bool by_id = true; size_t size; u32 pd_flags = data ? data->pd_flags : 0; u32 link_flags = pd_flags & PD_FLAG_NO_DEV_LINK ? 0 : DL_FLAG_STATELESS | DL_FLAG_PM_RUNTIME; if (dev->pm_domain) return -EEXIST; /* For now this is limited to OF based platforms. */ if (!np) return 0; if (data && data->pd_names) { num_pds = data->num_pd_names; by_id = false; } else { num_pds = of_count_phandle_with_args(np, "power-domains", "#power-domain-cells"); } if (num_pds <= 0) return 0; pds = kzalloc(sizeof(*pds), GFP_KERNEL); if (!pds) return -ENOMEM; size = sizeof(*pds->pd_devs) + sizeof(*pds->pd_links) + sizeof(*pds->opp_tokens); pds->pd_devs = kcalloc(num_pds, size, GFP_KERNEL); if (!pds->pd_devs) { ret = -ENOMEM; goto free_pds; } pds->pd_links = (void *)(pds->pd_devs + num_pds); pds->opp_tokens = (void *)(pds->pd_links + num_pds); if (link_flags && pd_flags & PD_FLAG_DEV_LINK_ON) link_flags |= DL_FLAG_RPM_ACTIVE; for (i = 0; i < num_pds; i++) { if (by_id) pd_dev = dev_pm_domain_attach_by_id(dev, i); else pd_dev = dev_pm_domain_attach_by_name(dev, data->pd_names[i]); if (IS_ERR_OR_NULL(pd_dev)) { ret = pd_dev ? PTR_ERR(pd_dev) : -ENODEV; goto err_attach; } if (pd_flags & PD_FLAG_REQUIRED_OPP) { struct dev_pm_opp_config config = { .required_dev = pd_dev, .required_dev_index = i, }; ret = dev_pm_opp_set_config(dev, &config); if (ret < 0) goto err_link; pds->opp_tokens[i] = ret; } if (link_flags) { struct device_link *link; link = device_link_add(dev, pd_dev, link_flags); if (!link) { ret = -ENODEV; goto err_link; } pds->pd_links[i] = link; } pds->pd_devs[i] = pd_dev; } pds->num_pds = num_pds; *list = pds; return num_pds; err_link: dev_pm_opp_clear_config(pds->opp_tokens[i]); dev_pm_domain_detach(pd_dev, true); err_attach: while (--i >= 0) { dev_pm_opp_clear_config(pds->opp_tokens[i]); if (pds->pd_links[i]) device_link_del(pds->pd_links[i]); dev_pm_domain_detach(pds->pd_devs[i], true); } kfree(pds->pd_devs); free_pds: kfree(pds); return ret; } EXPORT_SYMBOL_GPL(dev_pm_domain_attach_list); /** * devm_pm_domain_detach_list - devres-enabled version of dev_pm_domain_detach_list. * @_list: The list of PM domains to detach. 
* * This function reverse the actions from devm_pm_domain_attach_list(). * it will be invoked during the remove phase from drivers implicitly if driver * uses devm_pm_domain_attach_list() to attach the PM domains. */ static void devm_pm_domain_detach_list(void *_list) { struct dev_pm_domain_list *list = _list; dev_pm_domain_detach_list(list); } /** * devm_pm_domain_attach_list - devres-enabled version of dev_pm_domain_attach_list * @dev: The device used to lookup the PM domains for. * @data: The data used for attaching to the PM domains. * @list: An out-parameter with an allocated list of attached PM domains. * * NOTE: this will also handle calling devm_pm_domain_detach_list() for * you during remove phase. * * Returns the number of attached PM domains or a negative error code in case of * a failure. */ int devm_pm_domain_attach_list(struct device *dev, const struct dev_pm_domain_attach_data *data, struct dev_pm_domain_list **list) { int ret, num_pds; num_pds = dev_pm_domain_attach_list(dev, data, list); if (num_pds <= 0) return num_pds; ret = devm_add_action_or_reset(dev, devm_pm_domain_detach_list, *list); if (ret) return ret; return num_pds; } EXPORT_SYMBOL_GPL(devm_pm_domain_attach_list); /** * dev_pm_domain_detach - Detach a device from its PM domain. * @dev: Device to detach. * @power_off: Used to indicate whether we should power off the device. * * This functions will reverse the actions from dev_pm_domain_attach(), * dev_pm_domain_attach_by_id() and dev_pm_domain_attach_by_name(), thus it * detaches @dev from its PM domain. Typically it should be invoked during the * remove phase, either from subsystem level code or from drivers. * * Callers must ensure proper synchronization of this function with power * management callbacks. */ void dev_pm_domain_detach(struct device *dev, bool power_off) { if (dev->pm_domain && dev->pm_domain->detach) dev->pm_domain->detach(dev, power_off); } EXPORT_SYMBOL_GPL(dev_pm_domain_detach); /** * dev_pm_domain_detach_list - Detach a list of PM domains. * @list: The list of PM domains to detach. * * This function reverse the actions from dev_pm_domain_attach_list(). * Typically it should be invoked during the remove phase from drivers. * * Callers must ensure proper synchronization of this function with power * management callbacks. */ void dev_pm_domain_detach_list(struct dev_pm_domain_list *list) { int i; if (!list) return; for (i = 0; i < list->num_pds; i++) { dev_pm_opp_clear_config(list->opp_tokens[i]); if (list->pd_links[i]) device_link_del(list->pd_links[i]); dev_pm_domain_detach(list->pd_devs[i], true); } kfree(list->pd_devs); kfree(list); } EXPORT_SYMBOL_GPL(dev_pm_domain_detach_list); /** * dev_pm_domain_start - Start the device through its PM domain. * @dev: Device to start. * * This function should typically be called during probe by a subsystem/driver, * when it needs to start its device from the PM domain's perspective. Note * that, it's assumed that the PM domain is already powered on when this * function is called. * * Returns 0 on success and negative error values on failures. */ int dev_pm_domain_start(struct device *dev) { if (dev->pm_domain && dev->pm_domain->start) return dev->pm_domain->start(dev); return 0; } EXPORT_SYMBOL_GPL(dev_pm_domain_start); /** * dev_pm_domain_set - Set PM domain of a device. * @dev: Device whose PM domain is to be set. * @pd: PM domain to be set, or NULL. * * Sets the PM domain the device belongs to. The PM domain of a device needs * to be set before its probe finishes (it's bound to a driver). 
* * This function must be called with the device lock held. */ void dev_pm_domain_set(struct device *dev, struct dev_pm_domain *pd) { if (dev->pm_domain == pd) return; WARN(pd && device_is_bound(dev), "PM domains can only be changed for unbound devices\n"); dev->pm_domain = pd; device_pm_check_callbacks(dev); } EXPORT_SYMBOL_GPL(dev_pm_domain_set); /** * dev_pm_domain_set_performance_state - Request a new performance state. * @dev: The device to make the request for. * @state: Target performance state for the device. * * This function should be called when a new performance state needs to be * requested for a device that is attached to a PM domain. Note that, the * support for performance scaling for PM domains is optional. * * Returns 0 on success and when performance scaling isn't supported, negative * error code on failure. */ int dev_pm_domain_set_performance_state(struct device *dev, unsigned int state) { if (dev->pm_domain && dev->pm_domain->set_performance_state) return dev->pm_domain->set_performance_state(dev, state); return 0; } EXPORT_SYMBOL_GPL(dev_pm_domain_set_performance_state);
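/*
 * Usage sketch added for this document (hypothetical consumer driver, not
 * part of drivers/base/power/common.c): how a probe routine might attach to
 * multiple named PM domains with the devres-managed helper above.  The
 * "core"/"mem" domain names and the foo_* identifiers are made up; the
 * struct dev_pm_domain_attach_data fields (pd_names, num_pd_names, pd_flags),
 * the PD_FLAG_DEV_LINK_ON flag and devm_pm_domain_attach_list() are the ones
 * used in the code above.
 */
#include <linux/device.h>
#include <linux/kernel.h>
#include <linux/pm_domain.h>

static int foo_attach_pm_domains(struct device *dev)
{
	static const char * const foo_pd_names[] = { "core", "mem" };
	const struct dev_pm_domain_attach_data pd_data = {
		.pd_names = foo_pd_names,
		.num_pd_names = ARRAY_SIZE(foo_pd_names),
		/* Create the consumer device-links with the domains powered
		 * on (adds DL_FLAG_RPM_ACTIVE to the link flags above). */
		.pd_flags = PD_FLAG_DEV_LINK_ON,
	};
	struct dev_pm_domain_list *pd_list;
	int ret;

	/* Returns the number of attached domains, 0 if none are needed, or a
	 * negative error code; detach happens automatically on driver remove
	 * via the devres action registered by the helper. */
	ret = devm_pm_domain_attach_list(dev, &pd_data, &pd_list);
	if (ret < 0)
		return ret;

	dev_dbg(dev, "attached %d PM domains\n", ret);
	return 0;
}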
// SPDX-License-Identifier: GPL-2.0-only
/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system. INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Implementation of the Transmission Control Protocol(TCP).
 *
 * Authors:	Ross Biro
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Mark Evans, <evansmp@uhura.aston.ac.uk>
 *		Corey Minyard <wf-rch!minyard@relay.EU.net>
 *		Florian La Roche, <flla@stud.uni-sb.de>
 *		Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
 *		Linus Torvalds, <torvalds@cs.helsinki.fi>
 *		Alan Cox, <gw4pts@gw4pts.ampr.org>
 *		Matthew Dillon, <dillon@apollo.west.oic.com>
 *		Arnt Gulbrandsen, <agulbra@nvg.unit.no>
 *		Jorge Cwik, <jorge@laser.satlink.net>
 */

#include <net/tcp.h>
#include <net/tcp_ecn.h>
#include <net/xfrm.h>
#include <net/busy_poll.h>
#include <net/rstreason.h>
#include <net/psp.h>

static bool tcp_in_window(u32 seq, u32 end_seq, u32 s_win, u32 e_win)
{
	if (seq == s_win)
		return true;
	if (after(end_seq, s_win) && before(seq, e_win))
		return true;
	return seq == e_win && seq == end_seq;
}

static enum tcp_tw_status
tcp_timewait_check_oow_rate_limit(struct inet_timewait_sock *tw,
				  const struct sk_buff *skb, int mib_idx)
{
	struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);

	if (!tcp_oow_rate_limited(twsk_net(tw), skb, mib_idx,
				  &tcptw->tw_last_oow_ack_time)) {
		/* Send ACK. Note, we do not put the bucket,
		 * it will be released by caller.
		 */
		return TCP_TW_ACK_OOW;
	}

	/* We are rate-limiting, so just release the tw sock and drop skb. */
	inet_twsk_put(tw);
	return TCP_TW_SUCCESS;
}

static void twsk_rcv_nxt_update(struct tcp_timewait_sock *tcptw, u32 seq,
				u32 rcv_nxt)
{
#ifdef CONFIG_TCP_AO
	struct tcp_ao_info *ao;

	ao = rcu_dereference(tcptw->ao_info);
	if (unlikely(ao && seq < rcv_nxt))
		WRITE_ONCE(ao->rcv_sne, ao->rcv_sne + 1);
#endif
	WRITE_ONCE(tcptw->tw_rcv_nxt, seq);
}

/*
 * * Main purpose of TIME-WAIT state is to close connection gracefully,
 *   when one of ends sits in LAST-ACK or CLOSING retransmitting FIN
 *   (and, probably, tail of data) and one or more our ACKs are lost.
 * * What is TIME-WAIT timeout? It is associated with maximal packet
 *   lifetime in the internet, which results in wrong conclusion, that
 *   it is set to catch "old duplicate segments" wandering out of their path.
 *   It is not quite correct. This timeout is calculated so that it exceeds
 *   maximal retransmission timeout enough to allow to lose one (or more)
 *   segments sent by peer and our ACKs. This time may be calculated from RTO.
 * * When TIME-WAIT socket receives RST, it means that another end
 *   finally closed and we are allowed to kill TIME-WAIT too.
 * * Second purpose of TIME-WAIT is catching old duplicate segments.
 *   Well, certainly it is pure paranoia, but if we load TIME-WAIT
 *   with this semantics, we MUST NOT kill TIME-WAIT state with RSTs.
 * * If we invented some more clever way to catch duplicates
 *   (f.e. based on PAWS), we could truncate TIME-WAIT to several RTOs.
 *
 * The algorithm below is based on FORMAL INTERPRETATION of RFCs.
 * When you compare it to RFCs, please, read section SEGMENT ARRIVES
 * from the very beginning.
 *
 * NOTE.
With recycling (and later with fin-wait-2) TW bucket * is _not_ stateless. It means, that strictly speaking we must * spinlock it. I do not want! Well, probability of misbehaviour * is ridiculously low and, seems, we could use some mb() tricks * to avoid misread sequence numbers, states etc. --ANK * * We don't need to initialize tmp_out.sack_ok as we don't use the results */ enum tcp_tw_status tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb, const struct tcphdr *th, u32 *tw_isn, enum skb_drop_reason *drop_reason) { struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw); u32 rcv_nxt = READ_ONCE(tcptw->tw_rcv_nxt); struct tcp_options_received tmp_opt; enum skb_drop_reason psp_drop; bool paws_reject = false; int ts_recent_stamp; /* Instead of dropping immediately, wait to see what value is * returned. We will accept a non psp-encapsulated syn in the * case where TCP_TW_SYN is returned. */ psp_drop = psp_twsk_rx_policy_check(tw, skb); tmp_opt.saw_tstamp = 0; ts_recent_stamp = READ_ONCE(tcptw->tw_ts_recent_stamp); if (th->doff > (sizeof(*th) >> 2) && ts_recent_stamp) { tcp_parse_options(twsk_net(tw), skb, &tmp_opt, 0, NULL); if (tmp_opt.saw_tstamp) { if (tmp_opt.rcv_tsecr) tmp_opt.rcv_tsecr -= tcptw->tw_ts_offset; tmp_opt.ts_recent = READ_ONCE(tcptw->tw_ts_recent); tmp_opt.ts_recent_stamp = ts_recent_stamp; paws_reject = tcp_paws_reject(&tmp_opt, th->rst); } } if (READ_ONCE(tw->tw_substate) == TCP_FIN_WAIT2) { /* Just repeat all the checks of tcp_rcv_state_process() */ if (psp_drop) goto out_put; /* Out of window, send ACK */ if (paws_reject || !tcp_in_window(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq, rcv_nxt, rcv_nxt + tcptw->tw_rcv_wnd)) return tcp_timewait_check_oow_rate_limit( tw, skb, LINUX_MIB_TCPACKSKIPPEDFINWAIT2); if (th->rst) goto kill; if (th->syn && !before(TCP_SKB_CB(skb)->seq, rcv_nxt)) return TCP_TW_RST; /* Dup ACK? */ if (!th->ack || !after(TCP_SKB_CB(skb)->end_seq, rcv_nxt) || TCP_SKB_CB(skb)->end_seq == TCP_SKB_CB(skb)->seq) { inet_twsk_put(tw); return TCP_TW_SUCCESS; } /* New data or FIN. If new data arrive after half-duplex close, * reset. */ if (!th->fin || TCP_SKB_CB(skb)->end_seq != rcv_nxt + 1) return TCP_TW_RST; /* FIN arrived, enter true time-wait state. */ WRITE_ONCE(tw->tw_substate, TCP_TIME_WAIT); twsk_rcv_nxt_update(tcptw, TCP_SKB_CB(skb)->end_seq, rcv_nxt); if (tmp_opt.saw_tstamp) { u64 ts = tcp_clock_ms(); WRITE_ONCE(tw->tw_entry_stamp, ts); WRITE_ONCE(tcptw->tw_ts_recent_stamp, div_u64(ts, MSEC_PER_SEC)); WRITE_ONCE(tcptw->tw_ts_recent, tmp_opt.rcv_tsval); } inet_twsk_reschedule(tw, TCP_TIMEWAIT_LEN); return TCP_TW_ACK; } /* * Now real TIME-WAIT state. * * RFC 1122: * "When a connection is [...] on TIME-WAIT state [...] * [a TCP] MAY accept a new SYN from the remote TCP to * reopen the connection directly, if it: * * (1) assigns its initial sequence number for the new * connection to be larger than the largest sequence * number it used on the previous connection incarnation, * and * * (2) returns to TIME-WAIT state if the SYN turns out * to be an old duplicate". */ if (!paws_reject && (TCP_SKB_CB(skb)->seq == rcv_nxt && (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq || th->rst))) { /* In window segment, it may be only reset or bare ack. */ if (psp_drop) goto out_put; if (th->rst) { /* This is TIME_WAIT assassination, in two flavors. * Oh well... nobody has a sufficient solution to this * protocol bug yet. 
*/ if (!READ_ONCE(twsk_net(tw)->ipv4.sysctl_tcp_rfc1337)) { kill: inet_twsk_deschedule_put(tw); return TCP_TW_SUCCESS; } } else { inet_twsk_reschedule(tw, TCP_TIMEWAIT_LEN); } if (tmp_opt.saw_tstamp) { WRITE_ONCE(tcptw->tw_ts_recent, tmp_opt.rcv_tsval); WRITE_ONCE(tcptw->tw_ts_recent_stamp, ktime_get_seconds()); } inet_twsk_put(tw); return TCP_TW_SUCCESS; } /* Out of window segment. All the segments are ACKed immediately. The only exception is new SYN. We accept it, if it is not old duplicate and we are not in danger to be killed by delayed old duplicates. RFC check is that it has newer sequence number works at rates <40Mbit/sec. However, if paws works, it is reliable AND even more, we even may relax silly seq space cutoff. RED-PEN: we violate main RFC requirement, if this SYN will appear old duplicate (i.e. we receive RST in reply to SYN-ACK), we must return socket to time-wait state. It is not good, but not fatal yet. */ if (th->syn && !th->rst && !th->ack && !paws_reject && (after(TCP_SKB_CB(skb)->seq, rcv_nxt) || (tmp_opt.saw_tstamp && (s32)(READ_ONCE(tcptw->tw_ts_recent) - tmp_opt.rcv_tsval) < 0))) { u32 isn = tcptw->tw_snd_nxt + 65535 + 2; if (isn == 0) isn++; *tw_isn = isn; return TCP_TW_SYN; } if (psp_drop) goto out_put; if (paws_reject) { *drop_reason = SKB_DROP_REASON_TCP_RFC7323_TW_PAWS; __NET_INC_STATS(twsk_net(tw), LINUX_MIB_PAWS_TW_REJECTED); } if (!th->rst) { /* In this case we must reset the TIMEWAIT timer. * * If it is ACKless SYN it may be both old duplicate * and new good SYN with random sequence number <rcv_nxt. * Do not reschedule in the last case. */ if (paws_reject || th->ack) inet_twsk_reschedule(tw, TCP_TIMEWAIT_LEN); return tcp_timewait_check_oow_rate_limit( tw, skb, LINUX_MIB_TCPACKSKIPPEDTIMEWAIT); } out_put: inet_twsk_put(tw); return TCP_TW_SUCCESS; } EXPORT_IPV6_MOD(tcp_timewait_state_process); static void tcp_time_wait_init(struct sock *sk, struct tcp_timewait_sock *tcptw) { #ifdef CONFIG_TCP_MD5SIG const struct tcp_sock *tp = tcp_sk(sk); struct tcp_md5sig_key *key; /* * The timewait bucket does not have the key DB from the * sock structure. We just make a quick copy of the * md5 key being used (if indeed we are using one) * so the timewait ack generating code has the key. */ tcptw->tw_md5_key = NULL; if (!static_branch_unlikely(&tcp_md5_needed.key)) return; key = tp->af_specific->md5_lookup(sk, sk); if (key) { tcptw->tw_md5_key = kmemdup(key, sizeof(*key), GFP_ATOMIC); if (!tcptw->tw_md5_key) return; if (!static_key_fast_inc_not_disabled(&tcp_md5_needed.key.key)) goto out_free; } return; out_free: WARN_ON_ONCE(1); kfree(tcptw->tw_md5_key); tcptw->tw_md5_key = NULL; #endif } /* * Move a socket to time-wait or dead fin-wait-2 state. 
*/ void tcp_time_wait(struct sock *sk, int state, int timeo) { const struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); struct net *net = sock_net(sk); struct inet_timewait_sock *tw; tw = inet_twsk_alloc(sk, &net->ipv4.tcp_death_row, state); if (tw) { struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw); const int rto = (icsk->icsk_rto << 2) - (icsk->icsk_rto >> 1); tw->tw_mark = sk->sk_mark; tw->tw_priority = READ_ONCE(sk->sk_priority); tw->tw_rcv_wscale = tp->rx_opt.rcv_wscale; /* refreshed when we enter true TIME-WAIT state */ tw->tw_entry_stamp = tcp_time_stamp_ms(tp); tcptw->tw_rcv_nxt = tp->rcv_nxt; tcptw->tw_snd_nxt = tp->snd_nxt; tcptw->tw_rcv_wnd = tcp_receive_window(tp); tcptw->tw_ts_recent = tp->rx_opt.ts_recent; tcptw->tw_ts_recent_stamp = tp->rx_opt.ts_recent_stamp; tcptw->tw_ts_offset = tp->tsoffset; tw->tw_usec_ts = tp->tcp_usec_ts; tcptw->tw_last_oow_ack_time = 0; tcptw->tw_tx_delay = tp->tcp_tx_delay; tw->tw_txhash = sk->sk_txhash; tw->tw_tx_queue_mapping = sk->sk_tx_queue_mapping; #ifdef CONFIG_SOCK_RX_QUEUE_MAPPING tw->tw_rx_queue_mapping = sk->sk_rx_queue_mapping; #endif #if IS_ENABLED(CONFIG_IPV6) if (tw->tw_family == PF_INET6) { struct ipv6_pinfo *np = inet6_sk(sk); tw->tw_v6_daddr = sk->sk_v6_daddr; tw->tw_v6_rcv_saddr = sk->sk_v6_rcv_saddr; tw->tw_tclass = np->tclass; tw->tw_flowlabel = be32_to_cpu(np->flow_label & IPV6_FLOWLABEL_MASK); tw->tw_ipv6only = sk->sk_ipv6only; } #endif tcp_time_wait_init(sk, tcptw); tcp_ao_time_wait(tcptw, tp); /* Get the TIME_WAIT timeout firing. */ if (timeo < rto) timeo = rto; if (state == TCP_TIME_WAIT) timeo = TCP_TIMEWAIT_LEN; /* Linkage updates. * Note that access to tw after this point is illegal. */ inet_twsk_hashdance_schedule(tw, sk, net->ipv4.tcp_death_row.hashinfo, timeo); } else { /* Sorry, if we're out of memory, just CLOSE this * socket up. We've got bigger problems than * non-graceful socket closings. */ NET_INC_STATS(net, LINUX_MIB_TCPTIMEWAITOVERFLOW); } tcp_update_metrics(sk); tcp_done(sk); } EXPORT_SYMBOL(tcp_time_wait); void tcp_twsk_destructor(struct sock *sk) { #ifdef CONFIG_TCP_MD5SIG if (static_branch_unlikely(&tcp_md5_needed.key)) { struct tcp_timewait_sock *twsk = tcp_twsk(sk); if (twsk->tw_md5_key) { kfree(twsk->tw_md5_key); static_branch_slow_dec_deferred(&tcp_md5_needed); } } #endif tcp_ao_destroy_sock(sk, true); psp_twsk_assoc_free(inet_twsk(sk)); } void tcp_twsk_purge(struct list_head *net_exit_list) { bool purged_once = false; struct net *net; list_for_each_entry(net, net_exit_list, exit_list) { if (net->ipv4.tcp_death_row.hashinfo->pernet) { /* Even if tw_refcount == 1, we must clean up kernel reqsk */ inet_twsk_purge(net->ipv4.tcp_death_row.hashinfo); } else if (!purged_once) { inet_twsk_purge(&tcp_hashinfo); purged_once = true; } } } /* Warning : This function is called without sk_listener being locked. * Be sure to read socket fields once, as their value could change under us. */ void tcp_openreq_init_rwin(struct request_sock *req, const struct sock *sk_listener, const struct dst_entry *dst) { struct inet_request_sock *ireq = inet_rsk(req); const struct tcp_sock *tp = tcp_sk(sk_listener); int full_space = tcp_full_space(sk_listener); u32 window_clamp; __u8 rcv_wscale; u32 rcv_wnd; int mss; mss = tcp_mss_clamp(tp, dst_metric_advmss(dst)); window_clamp = READ_ONCE(tp->window_clamp); /* Set this up on the first call only */ req->rsk_window_clamp = window_clamp ? 
: dst_metric(dst, RTAX_WINDOW); /* limit the window selection if the user enforce a smaller rx buffer */ if (sk_listener->sk_userlocks & SOCK_RCVBUF_LOCK && (req->rsk_window_clamp > full_space || req->rsk_window_clamp == 0)) req->rsk_window_clamp = full_space; rcv_wnd = tcp_rwnd_init_bpf((struct sock *)req); if (rcv_wnd == 0) rcv_wnd = dst_metric(dst, RTAX_INITRWND); else if (full_space < rcv_wnd * mss) full_space = rcv_wnd * mss; /* tcp_full_space because it is guaranteed to be the first packet */ tcp_select_initial_window(sk_listener, full_space, mss - (ireq->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0), &req->rsk_rcv_wnd, &req->rsk_window_clamp, ireq->wscale_ok, &rcv_wscale, rcv_wnd); ireq->rcv_wscale = rcv_wscale; } static void tcp_ecn_openreq_child(struct sock *sk, const struct request_sock *req, const struct sk_buff *skb) { const struct tcp_request_sock *treq = tcp_rsk(req); struct tcp_sock *tp = tcp_sk(sk); if (treq->accecn_ok) { tcp_ecn_mode_set(tp, TCP_ECN_MODE_ACCECN); tp->syn_ect_snt = treq->syn_ect_snt; tcp_accecn_third_ack(sk, skb, treq->syn_ect_snt); tp->saw_accecn_opt = treq->saw_accecn_opt; tp->prev_ecnfield = treq->syn_ect_rcv; tp->accecn_opt_demand = 1; tcp_ecn_received_counters_payload(sk, skb); } else { tcp_ecn_mode_set(tp, inet_rsk(req)->ecn_ok ? TCP_ECN_MODE_RFC3168 : TCP_ECN_DISABLED); } } void tcp_ca_openreq_child(struct sock *sk, const struct dst_entry *dst) { struct inet_connection_sock *icsk = inet_csk(sk); u32 ca_key = dst_metric(dst, RTAX_CC_ALGO); bool ca_got_dst = false; if (ca_key != TCP_CA_UNSPEC) { const struct tcp_congestion_ops *ca; rcu_read_lock(); ca = tcp_ca_find_key(ca_key); if (likely(ca && bpf_try_module_get(ca, ca->owner))) { icsk->icsk_ca_dst_locked = tcp_ca_dst_locked(dst); icsk->icsk_ca_ops = ca; ca_got_dst = true; } rcu_read_unlock(); } /* If no valid choice made yet, assign current system default ca. */ if (!ca_got_dst && (!icsk->icsk_ca_setsockopt || !bpf_try_module_get(icsk->icsk_ca_ops, icsk->icsk_ca_ops->owner))) tcp_assign_congestion_control(sk); tcp_set_ca_state(sk, TCP_CA_Open); } EXPORT_IPV6_MOD_GPL(tcp_ca_openreq_child); static void smc_check_reset_syn_req(const struct tcp_sock *oldtp, struct request_sock *req, struct tcp_sock *newtp) { #if IS_ENABLED(CONFIG_SMC) struct inet_request_sock *ireq; if (static_branch_unlikely(&tcp_have_smc)) { ireq = inet_rsk(req); if (oldtp->syn_smc && !ireq->smc_ok) newtp->syn_smc = 0; } #endif } /* This is not only more efficient than what we used to do, it eliminates * a lot of code duplication between IPv4/IPv6 SYN recv processing. -DaveM * * Actually, we could lots of memory writes here. tp of listening * socket contains all necessary default parameters. 
*/ struct sock *tcp_create_openreq_child(const struct sock *sk, struct request_sock *req, struct sk_buff *skb) { struct sock *newsk = inet_csk_clone_lock(sk, req, GFP_ATOMIC); const struct inet_request_sock *ireq = inet_rsk(req); struct tcp_request_sock *treq = tcp_rsk(req); struct inet_connection_sock *newicsk; const struct tcp_sock *oldtp; struct tcp_sock *newtp; u32 seq; if (!newsk) return NULL; newicsk = inet_csk(newsk); newtp = tcp_sk(newsk); oldtp = tcp_sk(sk); smc_check_reset_syn_req(oldtp, req, newtp); /* Now setup tcp_sock */ newtp->pred_flags = 0; seq = treq->rcv_isn + 1; newtp->rcv_wup = seq; WRITE_ONCE(newtp->copied_seq, seq); WRITE_ONCE(newtp->rcv_nxt, seq); newtp->segs_in = 1; seq = treq->snt_isn + 1; newtp->snd_sml = newtp->snd_una = seq; WRITE_ONCE(newtp->snd_nxt, seq); newtp->snd_up = seq; INIT_LIST_HEAD(&newtp->tsq_node); INIT_LIST_HEAD(&newtp->tsorted_sent_queue); tcp_init_wl(newtp, treq->rcv_isn); minmax_reset(&newtp->rtt_min, tcp_jiffies32, ~0U); newicsk->icsk_ack.lrcvtime = tcp_jiffies32; newtp->lsndtime = tcp_jiffies32; newsk->sk_txhash = READ_ONCE(treq->txhash); newtp->total_retrans = req->num_retrans; tcp_init_xmit_timers(newsk); WRITE_ONCE(newtp->write_seq, newtp->pushed_seq = treq->snt_isn + 1); if (sock_flag(newsk, SOCK_KEEPOPEN)) tcp_reset_keepalive_timer(newsk, keepalive_time_when(newtp)); newtp->rx_opt.tstamp_ok = ireq->tstamp_ok; newtp->rx_opt.sack_ok = ireq->sack_ok; newtp->window_clamp = req->rsk_window_clamp; newtp->rcv_ssthresh = req->rsk_rcv_wnd; newtp->rcv_wnd = req->rsk_rcv_wnd; newtp->rx_opt.wscale_ok = ireq->wscale_ok; if (newtp->rx_opt.wscale_ok) { newtp->rx_opt.snd_wscale = ireq->snd_wscale; newtp->rx_opt.rcv_wscale = ireq->rcv_wscale; } else { newtp->rx_opt.snd_wscale = newtp->rx_opt.rcv_wscale = 0; newtp->window_clamp = min(newtp->window_clamp, 65535U); } newtp->snd_wnd = ntohs(tcp_hdr(skb)->window) << newtp->rx_opt.snd_wscale; newtp->max_window = newtp->snd_wnd; if (newtp->rx_opt.tstamp_ok) { newtp->tcp_usec_ts = treq->req_usec_ts; newtp->rx_opt.ts_recent = req->ts_recent; newtp->rx_opt.ts_recent_stamp = ktime_get_seconds(); newtp->tcp_header_len = sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED; } else { newtp->tcp_usec_ts = 0; newtp->rx_opt.ts_recent_stamp = 0; newtp->tcp_header_len = sizeof(struct tcphdr); } if (req->num_timeout) { newtp->total_rto = req->num_timeout; newtp->undo_marker = treq->snt_isn; if (newtp->tcp_usec_ts) { newtp->retrans_stamp = treq->snt_synack; newtp->total_rto_time = (u32)(tcp_clock_us() - newtp->retrans_stamp) / USEC_PER_MSEC; } else { newtp->retrans_stamp = div_u64(treq->snt_synack, USEC_PER_SEC / TCP_TS_HZ); newtp->total_rto_time = tcp_clock_ms() - newtp->retrans_stamp; } newtp->total_rto_recoveries = 1; } newtp->tsoffset = treq->ts_off; #ifdef CONFIG_TCP_MD5SIG newtp->md5sig_info = NULL; /*XXX*/ #endif #ifdef CONFIG_TCP_AO newtp->ao_info = NULL; if (tcp_rsk_used_ao(req)) { struct tcp_ao_key *ao_key; ao_key = treq->af_specific->ao_lookup(sk, req, tcp_rsk(req)->ao_keyid, -1); if (ao_key) newtp->tcp_header_len += tcp_ao_len_aligned(ao_key); } #endif if (skb->len >= TCP_MSS_DEFAULT + newtp->tcp_header_len) newicsk->icsk_ack.last_seg_size = skb->len - newtp->tcp_header_len; newtp->rx_opt.mss_clamp = req->mss; tcp_ecn_openreq_child(newsk, req, skb); newtp->fastopen_req = NULL; RCU_INIT_POINTER(newtp->fastopen_rsk, NULL); newtp->bpf_chg_cc_inprogress = 0; tcp_bpf_clone(sk, newsk); __TCP_INC_STATS(sock_net(sk), TCP_MIB_PASSIVEOPENS); xa_init_flags(&newsk->sk_user_frags, XA_FLAGS_ALLOC1); return newsk; } 
EXPORT_SYMBOL(tcp_create_openreq_child); /* * Process an incoming packet for SYN_RECV sockets represented as a * request_sock. Normally sk is the listener socket but for TFO it * points to the child socket. * * XXX (TFO) - The current impl contains a special check for ack * validation and inside tcp_v4_reqsk_send_ack(). Can we do better? * * We don't need to initialize tmp_opt.sack_ok as we don't use the results * * Note: If @fastopen is true, this can be called from process context. * Otherwise, this is from BH context. */ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, struct request_sock *req, bool fastopen, bool *req_stolen, enum skb_drop_reason *drop_reason) { struct tcp_options_received tmp_opt; struct sock *child; const struct tcphdr *th = tcp_hdr(skb); __be32 flg = tcp_flag_word(th) & (TCP_FLAG_RST|TCP_FLAG_SYN|TCP_FLAG_ACK); bool tsecr_reject = false; bool paws_reject = false; bool own_req; tmp_opt.saw_tstamp = 0; tmp_opt.accecn = 0; if (th->doff > (sizeof(struct tcphdr)>>2)) { tcp_parse_options(sock_net(sk), skb, &tmp_opt, 0, NULL); if (tmp_opt.saw_tstamp) { tmp_opt.ts_recent = req->ts_recent; if (tmp_opt.rcv_tsecr) { if (inet_rsk(req)->tstamp_ok && !fastopen) tsecr_reject = !between(tmp_opt.rcv_tsecr, tcp_rsk(req)->snt_tsval_first, READ_ONCE(tcp_rsk(req)->snt_tsval_last)); tmp_opt.rcv_tsecr -= tcp_rsk(req)->ts_off; } /* We do not store true stamp, but it is not required, * it can be estimated (approximately) * from another data. */ tmp_opt.ts_recent_stamp = ktime_get_seconds() - tcp_reqsk_timeout(req) / HZ; paws_reject = tcp_paws_reject(&tmp_opt, th->rst); } } /* Check for pure retransmitted SYN. */ if (TCP_SKB_CB(skb)->seq == tcp_rsk(req)->rcv_isn && flg == TCP_FLAG_SYN && !paws_reject) { /* * RFC793 draws (Incorrectly! It was fixed in RFC1122) * this case on figure 6 and figure 8, but formal * protocol description says NOTHING. * To be more exact, it says that we should send ACK, * because this segment (at least, if it has no data) * is out of window. * * CONCLUSION: RFC793 (even with RFC1122) DOES NOT * describe SYN-RECV state. All the description * is wrong, we cannot believe to it and should * rely only on common sense and implementation * experience. * * Enforce "SYN-ACK" according to figure 8, figure 6 * of RFC793, fixed by RFC1122. * * Note that even if there is new data in the SYN packet * they will be thrown away too. * * Reset timer after retransmitting SYNACK, similar to * the idea of fast retransmit in recovery. */ if (!tcp_oow_rate_limited(sock_net(sk), skb, LINUX_MIB_TCPACKSKIPPEDSYNRECV, &tcp_rsk(req)->last_oow_ack_time) && !tcp_rtx_synack(sk, req)) { unsigned long expires = jiffies; expires += tcp_reqsk_timeout(req); if (!fastopen) mod_timer_pending(&req->rsk_timer, expires); else req->rsk_timer.expires = expires; } return NULL; } /* Further reproduces section "SEGMENT ARRIVES" for state SYN-RECEIVED of RFC793. It is broken, however, it does not work only when SYNs are crossed. You would think that SYN crossing is impossible here, since we should have a SYN_SENT socket (from connect()) on our end, but this is not true if the crossed SYNs were sent to both ends by a malicious third party. We must defend against this, and to do that we first verify the ACK (as per RFC793, page 36) and reset if it is invalid. Is this a true full defense? To convince ourselves, let us consider a way in which the ACK test can still pass in this 'malicious crossed SYNs' case. 
Malicious sender sends identical SYNs (and thus identical sequence numbers) to both A and B: A: gets SYN, seq=7 B: gets SYN, seq=7 By our good fortune, both A and B select the same initial send sequence number of seven :-) A: sends SYN|ACK, seq=7, ack_seq=8 B: sends SYN|ACK, seq=7, ack_seq=8 So we are now A eating this SYN|ACK, ACK test passes. So does sequence test, SYN is truncated, and thus we consider it a bare ACK. If icsk->icsk_accept_queue.rskq_defer_accept, we silently drop this bare ACK. Otherwise, we create an established connection. Both ends (listening sockets) accept the new incoming connection and try to talk to each other. 8-) Note: This case is both harmless, and rare. Possibility is about the same as us discovering intelligent life on another plant tomorrow. But generally, we should (RFC lies!) to accept ACK from SYNACK both here and in tcp_rcv_state_process(). tcp_rcv_state_process() does not, hence, we do not too. Note that the case is absolutely generic: we cannot optimize anything here without violating protocol. All the checks must be made before attempt to create socket. */ /* RFC793 page 36: "If the connection is in any non-synchronized state ... * and the incoming segment acknowledges something not yet * sent (the segment carries an unacceptable ACK) ... * a reset is sent." * * Invalid ACK: reset will be sent by listening socket. * Note that the ACK validity check for a Fast Open socket is done * elsewhere and is checked directly against the child socket rather * than req because user data may have been sent out. */ if ((flg & TCP_FLAG_ACK) && !fastopen && (TCP_SKB_CB(skb)->ack_seq != tcp_rsk(req)->snt_isn + 1)) return sk; /* RFC793: "first check sequence number". */ if (paws_reject || tsecr_reject || !tcp_in_window(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq, tcp_rsk(req)->rcv_nxt, tcp_rsk(req)->rcv_nxt + tcp_synack_window(req))) { /* Out of window: send ACK and drop. */ if (!(flg & TCP_FLAG_RST) && !tcp_oow_rate_limited(sock_net(sk), skb, LINUX_MIB_TCPACKSKIPPEDSYNRECV, &tcp_rsk(req)->last_oow_ack_time)) req->rsk_ops->send_ack(sk, skb, req); if (paws_reject) { SKB_DR_SET(*drop_reason, TCP_RFC7323_PAWS); NET_INC_STATS(sock_net(sk), LINUX_MIB_PAWSESTABREJECTED); } else if (tsecr_reject) { SKB_DR_SET(*drop_reason, TCP_RFC7323_TSECR); NET_INC_STATS(sock_net(sk), LINUX_MIB_TSECRREJECTED); } else { SKB_DR_SET(*drop_reason, TCP_OVERWINDOW); } return NULL; } /* In sequence, PAWS is OK. */ if (TCP_SKB_CB(skb)->seq == tcp_rsk(req)->rcv_isn) { /* Truncate SYN, it is out of window starting at tcp_rsk(req)->rcv_isn + 1. */ flg &= ~TCP_FLAG_SYN; } /* RFC793: "second check the RST bit" and * "fourth, check the SYN bit" */ if (flg & (TCP_FLAG_RST|TCP_FLAG_SYN)) { TCP_INC_STATS(sock_net(sk), TCP_MIB_ATTEMPTFAILS); goto embryonic_reset; } /* ACK sequence verified above, just make sure ACK is * set. If ACK not set, just silently drop the packet. * * XXX (TFO) - if we ever allow "data after SYN", the * following check needs to be removed. */ if (!(flg & TCP_FLAG_ACK)) return NULL; if (tcp_rsk(req)->accecn_ok && tmp_opt.accecn && tcp_rsk(req)->saw_accecn_opt < TCP_ACCECN_OPT_COUNTER_SEEN) { u8 saw_opt = tcp_accecn_option_init(skb, tmp_opt.accecn); tcp_rsk(req)->saw_accecn_opt = saw_opt; if (tcp_rsk(req)->saw_accecn_opt == TCP_ACCECN_OPT_FAIL_SEEN) { u8 fail_mode = TCP_ACCECN_OPT_FAIL_RECV; tcp_rsk(req)->accecn_fail_mode |= fail_mode; } } /* For Fast Open no more processing is needed (sk is the * child socket). 
*/ if (fastopen) return sk; /* While TCP_DEFER_ACCEPT is active, drop bare ACK. */ if (req->num_timeout < READ_ONCE(inet_csk(sk)->icsk_accept_queue.rskq_defer_accept) && TCP_SKB_CB(skb)->end_seq == tcp_rsk(req)->rcv_isn + 1) { inet_rsk(req)->acked = 1; __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPDEFERACCEPTDROP); return NULL; } /* OK, ACK is valid, create big socket and * feed this segment to it. It will repeat all * the tests. THIS SEGMENT MUST MOVE SOCKET TO * ESTABLISHED STATE. If it will be dropped after * socket is created, wait for troubles. */ child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, req, NULL, req, &own_req); if (!child) goto listen_overflow; if (own_req && tmp_opt.saw_tstamp && !after(TCP_SKB_CB(skb)->seq, tcp_rsk(req)->rcv_nxt)) tcp_sk(child)->rx_opt.ts_recent = tmp_opt.rcv_tsval; if (own_req && rsk_drop_req(req)) { reqsk_queue_removed(&inet_csk(req->rsk_listener)->icsk_accept_queue, req); inet_csk_reqsk_queue_drop_and_put(req->rsk_listener, req); return child; } sock_rps_save_rxhash(child, skb); tcp_synack_rtt_meas(child, req); *req_stolen = !own_req; return inet_csk_complete_hashdance(sk, child, req, own_req); listen_overflow: SKB_DR_SET(*drop_reason, TCP_LISTEN_OVERFLOW); if (sk != req->rsk_listener) __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMIGRATEREQFAILURE); if (!READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_abort_on_overflow)) { inet_rsk(req)->acked = 1; return NULL; } embryonic_reset: if (!(flg & TCP_FLAG_RST)) { /* Received a bad SYN pkt - for TFO We try not to reset * the local connection unless it's really necessary to * avoid becoming vulnerable to outside attack aiming at * resetting legit local connections. */ req->rsk_ops->send_reset(sk, skb, SK_RST_REASON_INVALID_SYN); } else if (fastopen) { /* received a valid RST pkt */ reqsk_fastopen_remove(sk, req, true); tcp_reset(sk, skb); } if (!fastopen) { bool unlinked = inet_csk_reqsk_queue_drop(sk, req); if (unlinked) __NET_INC_STATS(sock_net(sk), LINUX_MIB_EMBRYONICRSTS); *req_stolen = !unlinked; } return NULL; } EXPORT_IPV6_MOD(tcp_check_req); /* * Queue segment on the new socket if the new socket is active, * otherwise we just shortcircuit this and continue with * the new socket. * * For the vast majority of cases child->sk_state will be TCP_SYN_RECV * when entering. But other states are possible due to a race condition * where after __inet_lookup_established() fails but before the listener * locked is obtained, other packets cause the same connection to * be created. */ enum skb_drop_reason tcp_child_process(struct sock *parent, struct sock *child, struct sk_buff *skb) __releases(&((child)->sk_lock.slock)) { enum skb_drop_reason reason = SKB_NOT_DROPPED_YET; int state = child->sk_state; /* record sk_napi_id and sk_rx_queue_mapping of child. */ sk_mark_napi_id_set(child, skb); tcp_segs_in(tcp_sk(child), skb); if (!sock_owned_by_user(child)) { reason = tcp_rcv_state_process(child, skb); /* Wakeup parent, send SIGIO */ if (state == TCP_SYN_RECV && child->sk_state != state) parent->sk_data_ready(parent); } else { /* Alas, it is possible again, because we do lookup * in main socket hash table and lock on listening * socket does not protect us more. */ __sk_add_backlog(child, skb); } bh_unlock_sock(child); sock_put(child); return reason; } EXPORT_IPV6_MOD(tcp_child_process);
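/*
 * Illustrative sketch added for this document (userspace, not kernel code):
 * the receive-window acceptance test used by tcp_timewait_state_process()
 * and tcp_check_req() above, modeled with the same wrap-safe 32-bit sequence
 * comparisons as the kernel's before()/after().  The sequence numbers in
 * main() are made-up examples chosen around a sequence-number wrap.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Signed difference handles sequence-number wraparound, as in net/tcp.h. */
static bool seq_before(uint32_t a, uint32_t b) { return (int32_t)(a - b) < 0; }
static bool seq_after(uint32_t a, uint32_t b)  { return seq_before(b, a); }

/* Accept a segment [seq, end_seq) against the window [s_win, e_win). */
static bool tcp_in_window(uint32_t seq, uint32_t end_seq,
			  uint32_t s_win, uint32_t e_win)
{
	if (seq == s_win)
		return true;
	if (seq_after(end_seq, s_win) && seq_before(seq, e_win))
		return true;
	return seq == e_win && seq == end_seq;
}

int main(void)
{
	uint32_t rcv_nxt = 0xfffffff0u;	/* just below the 32-bit wrap */
	uint32_t rcv_wnd = 0x1000;

	/* Data that starts in-window and crosses the wrap is accepted ... */
	printf("%d\n", tcp_in_window(0xfffffff8u, 0x00000008u,
				     rcv_nxt, rcv_nxt + rcv_wnd));
	/* ... while an old duplicate entirely below rcv_nxt is rejected. */
	printf("%d\n", tcp_in_window(0xffffff00u, 0xffffff40u,
				     rcv_nxt, rcv_nxt + rcv_wnd));
	return 0;
}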
3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 
3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 
4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 4716 4717 4718 4719 4720 4721 4722 4723 4724 4725 4726 4727 4728 4729 4730 4731 4732 4733 4734 4735 4736 4737 4738 4739 4740 4741 4742 4743 4744 4745 4746 4747 4748 4749 4750 4751 4752 4753 4754 4755 4756 4757 4758 4759 4760 4761 4762 4763 4764 4765 4766 4767 4768 4769 4770 4771 4772 4773 4774 4775 4776 4777 4778 4779 4780 4781 4782 4783 4784 4785 4786 4787 4788 4789 4790 4791 4792 4793 4794 4795 4796 4797 4798 4799 4800 4801 4802 4803 4804 4805 4806 4807 4808 4809 4810 4811 4812 4813 4814 4815 4816 4817 4818 4819 4820 4821 4822 4823 4824 4825 4826 4827 4828 4829 4830 4831 4832 4833 4834 4835 4836 4837 4838 4839 4840 4841 4842 4843 4844 4845 4846 4847 4848 4849 4850 4851 4852 4853 4854 4855 4856 4857 4858 4859 4860 4861 4862 4863 4864 4865 4866 4867 4868 4869 4870 4871 4872 4873 4874 4875 4876 4877 4878 4879 4880 4881 4882 4883 4884 4885 4886 4887 4888 4889 4890 4891 4892 4893 4894 4895 4896 4897 4898 4899 4900 4901 4902 4903 4904 4905 4906 4907 4908 4909 4910 4911 4912 4913 4914 4915 4916 4917 4918 4919 4920 4921 4922 4923 4924 4925 4926 4927 4928 4929 4930 4931 4932 4933 4934 4935 4936 4937 4938 4939 4940 4941 4942 4943 4944 4945 4946 4947 4948 4949 4950 4951 4952 4953 4954 4955 4956 4957 4958 4959 4960 4961 4962 4963 4964 4965 4966 4967 4968 4969 4970 4971 4972 4973 4974 4975 4976 4977 4978 4979 4980 4981 4982 4983 4984 4985 4986 4987 4988 4989 4990 4991 4992 4993 4994 4995 4996 4997 4998 4999 5000 5001 5002 5003 5004 5005 5006 5007 5008 5009 5010 5011 5012 5013 5014 5015 5016 5017 5018 5019 5020 5021 5022 5023 5024 5025 5026 5027 5028 5029 5030 5031 5032 5033 5034 5035 5036 5037 5038 5039 5040 5041 5042 5043 5044 5045 5046 5047 5048 5049 5050 5051 5052 5053 5054 5055 5056 5057 5058 5059 5060 5061 5062 5063 5064 5065 5066 5067 5068 5069 5070 5071 5072 5073 5074 5075 5076 5077 5078 5079 5080 5081 5082 5083 5084 5085 5086 5087 5088 5089 5090 5091 5092 5093 5094 5095 5096 5097 5098 5099 5100 5101 5102 5103 5104 5105 5106 5107 5108 5109 5110 5111 5112 5113 5114 5115 5116 5117 5118 5119 5120 5121 5122 5123 5124 5125 5126 5127 5128 5129 5130 5131 5132 5133 5134 5135 5136 5137 5138 5139 5140 5141 
5142 5143 5144 5145 5146 5147 5148 5149 5150 5151 5152 5153 5154 5155 5156 5157 5158 5159 5160 5161 5162 5163 5164 5165 5166 5167 5168 5169 5170 5171 5172 5173 5174 5175 5176 5177 5178 5179 5180 5181 5182 5183 5184 5185 5186 5187 5188 5189 5190 5191 5192 5193 5194 5195 5196 5197 5198 5199 5200 5201 5202 5203 5204 5205 5206 5207 5208 5209 5210 5211 5212 5213 5214 5215 5216 5217 5218 5219 5220 5221 5222 5223 5224 5225 5226 5227 5228 5229 5230 5231 5232 5233 5234 5235 5236 5237 5238 5239 5240 5241 5242 5243 5244 5245 5246 5247 5248 5249 5250 5251 5252 5253 5254 5255 5256 5257 5258 5259 5260 5261 5262 5263 5264 5265 5266 5267 5268 5269 5270 5271 5272 5273 5274 5275 5276 5277 5278 5279 5280 5281 5282 5283 5284 5285 5286 5287 5288 5289 5290 5291 5292 5293 5294 5295 5296 5297 5298 5299 5300 5301 5302 5303 5304 5305 5306 5307 5308 5309 5310 5311 5312 5313 5314 5315 5316 5317 5318 5319 5320 5321 5322 5323 5324 5325 5326 5327 5328 5329 5330 5331 5332 5333 5334 5335 5336 5337 5338 5339 5340 5341 5342 5343 5344 5345 5346 5347 5348 5349 5350 5351 5352 5353 5354 5355 5356 5357 5358 5359 5360 5361 5362 5363 5364 5365 5366 5367 5368 5369 5370 5371 5372 5373 5374 5375 5376 5377 5378 5379 5380 5381 5382 5383 5384 5385 5386 5387 5388 5389 5390 5391 5392 5393 5394 5395 5396 5397 5398 5399 5400 5401 5402 5403 5404 5405 5406 5407 5408 5409 5410 5411 5412 5413 5414 5415 5416 5417 5418 5419 5420 5421 5422 5423 5424 5425 5426 5427 5428 5429 5430 5431 5432 5433 5434 5435 5436 5437 5438 5439 5440 5441 5442 5443 5444 5445 5446 5447 5448 5449 5450 5451 5452 5453 5454 5455 5456 5457 5458 5459 5460 5461 5462 5463 5464 5465 5466 5467 5468 5469 5470 5471 5472 5473 5474 5475 5476 5477 5478 5479 5480 5481 5482 5483 5484 5485 5486 5487 5488 5489 5490 5491 5492 5493 5494 5495 5496 5497 5498 5499 5500 5501 5502 5503 5504 5505 5506 5507 5508 5509 5510 5511 5512 5513 5514 5515 5516 5517 5518 5519 5520 5521 5522 5523 5524 5525 5526 5527 5528 5529 5530 5531 5532 5533 5534 5535 5536 5537 5538 5539 5540 5541 5542 5543 5544 5545 5546 5547 5548 5549 5550 5551 5552 5553 5554 5555 5556 5557 5558 5559 5560 5561 5562 5563 5564 5565 5566 5567 5568 5569 5570 5571 5572 5573 5574 5575 5576 5577 5578 5579 5580 5581 5582 5583 5584 5585 5586 5587 5588 5589 5590 5591 5592 5593 5594 5595 5596 5597 5598 5599 5600 5601 5602 5603 5604 5605 5606 5607 5608 5609 5610 5611 5612 5613 5614 5615 5616 5617 5618 5619 5620 5621 5622 5623 5624 5625 5626 5627 5628 5629 5630 5631 5632 5633 5634 5635 5636 5637 5638 5639 5640 5641 5642 5643 5644 5645 5646 5647 5648 5649 5650 5651 5652 5653 5654 5655 5656 5657 5658 5659 5660 5661 5662 5663 5664 5665 5666 5667 5668 5669 5670 5671 5672 5673 5674 5675 5676 5677 5678 5679 5680 5681 5682 5683 5684 5685 5686 5687 5688 5689 5690 5691 5692 5693 5694 5695 5696 5697 5698 5699 5700 5701 5702 5703 5704 5705 5706 5707 5708 5709 5710 5711 5712 5713 5714 5715 5716 5717 5718 5719 5720 5721 5722 5723 5724 5725 5726 5727 5728 5729 5730 5731 5732 5733 5734 5735 5736 5737 5738 5739 5740 5741 5742 5743 5744 5745 5746 5747 5748 5749 5750 5751 5752 5753 5754 5755 5756 5757 5758 5759 5760 5761 5762 5763 5764 5765 5766 5767 5768 5769 5770 5771 5772 5773 5774 5775 5776 5777 5778 5779 5780 5781 5782 5783 5784 5785 5786 5787 5788 5789 5790 5791 5792 5793 5794 5795 5796 5797 5798 5799 5800 5801 5802 5803 5804 5805 5806 5807 5808 5809 5810 5811 5812 5813 5814 5815 5816 5817 5818 5819 5820 5821 5822 5823 5824 5825 5826 5827 5828 5829 5830 5831 5832 5833 5834 5835 5836 5837 5838 5839 5840 5841 5842 5843 5844 5845 5846 5847 5848 5849 5850 5851 5852 
5853 5854 5855 5856 5857 5858 5859 5860 5861 5862 5863 5864 5865 5866 5867 5868 5869 5870 5871 5872 5873 5874 5875 5876 5877 5878 5879 5880 5881 5882 5883 5884 5885 5886 5887 5888 5889 5890 5891 5892 5893 5894 5895 5896 5897 5898 5899 5900 5901 5902 5903 5904 5905 5906 5907 5908 5909 5910 5911 5912 5913 5914 5915 5916 5917 5918 5919 5920 5921 5922 5923 5924 5925 5926 5927 5928 5929 5930 5931 5932 5933 5934 5935 5936 5937 5938 5939 5940 5941 5942 5943 5944 5945 5946 5947 5948 5949 5950 5951 5952 5953 5954 5955 5956 5957 5958 5959 5960 5961 5962 5963 5964 5965 5966 5967 5968 5969 5970 5971 5972 5973 5974 5975 5976 5977 5978 5979 5980 5981 5982 5983 5984 5985 5986 5987 5988 5989 5990 5991 5992 5993 5994 5995 5996 5997 5998 5999 6000 6001 6002 6003 6004 6005 6006 6007 6008 6009 6010 6011 6012 6013 6014 6015 6016 6017 6018 6019 6020 6021 6022 6023 6024 6025 6026 6027 6028 6029 6030 6031 6032 6033 6034 6035 6036 6037 6038 6039 6040 6041 6042 6043 6044 6045 6046 6047 6048 6049 6050 6051 6052 6053 6054 6055 6056 6057 6058 6059 6060 6061 6062 6063 6064 6065 6066 6067 6068 6069 6070 6071 6072 6073 6074 6075 6076 6077 6078 6079 6080 6081 6082 6083 6084 6085 6086 6087 6088 6089 6090 6091 6092 6093 6094 6095 6096 6097 6098 6099 6100 6101 6102 6103 6104 6105 6106 6107 6108 6109 6110 6111 6112 6113 6114 6115 6116 6117 6118 6119 6120 6121 6122 6123 6124 6125 6126 6127 6128 6129 6130 6131 6132 6133 6134 6135 6136 6137 6138 6139 6140 6141 6142 6143 6144 6145 6146 6147 6148 6149 6150 6151 6152 6153 6154 6155 6156 6157 6158 6159 6160 6161 6162 6163 6164 6165 6166 6167 6168 6169 6170 6171 6172 6173 6174 6175 6176 6177 6178 6179 6180 6181 6182 6183 6184 6185 6186 6187 6188 6189 6190 6191 6192 6193 6194 6195 6196 6197 6198 6199 6200 6201 6202 6203 6204 6205 6206 6207 6208 6209 6210 6211 6212 6213 6214 6215 6216 6217 6218 6219 6220 6221 6222 6223 6224 6225 6226 6227 6228 6229 6230 6231 6232 6233 6234 6235 6236 6237 6238 6239 6240 6241 6242 6243 6244 6245 6246 6247 6248 6249 6250 6251 6252 6253 6254 6255 6256 6257 6258 6259 6260 6261 6262 6263 6264 6265 6266 6267 6268 6269 6270 6271 6272 6273 6274 6275 6276 6277 6278 6279 6280 6281 6282 6283 6284 6285 6286 6287 6288 6289 6290 6291 6292 6293 6294 6295 6296 6297 6298 6299 6300 6301 6302 6303 6304 6305 6306 6307 6308 6309 6310 6311 6312 6313 6314 6315 6316 6317 6318 6319 6320 6321 6322 6323 6324 6325 6326 6327 6328 6329 6330 6331 6332 6333 6334 6335 6336 6337 6338 6339 6340 6341 6342 6343 6344 6345 6346 6347 6348 6349 6350 6351 6352 6353 6354 6355 6356 6357 6358 6359 6360 6361 6362 6363 6364 6365 6366 6367 6368 6369 6370 6371 6372 6373 6374 6375 6376 6377 6378 6379 6380 6381 6382 6383 6384 6385 6386 6387 6388 6389 6390 6391 6392 6393 6394 6395 6396 6397 6398 6399 6400 6401 6402 6403 6404 6405 6406 6407 6408 6409 6410 6411 6412 6413 6414 6415 6416 6417 6418 6419 6420 6421 6422 6423 6424 6425 6426 6427 6428 6429 6430 6431 6432 6433 6434 6435 6436 6437 6438 6439 6440 6441 6442 6443 6444 6445 6446 6447 6448 6449 6450 6451 6452 6453 6454 6455 6456 6457 6458 6459 6460 6461 6462 6463 6464 6465 6466 6467 6468 6469 6470 6471 6472 6473 6474 6475 6476 6477 6478 6479 6480 6481 6482 6483 6484 6485 6486 6487 6488 6489 6490 6491 6492 6493 6494 6495 6496 6497 6498 6499 6500 6501 6502 6503 6504 6505 6506 6507 6508 6509 6510 6511 6512 6513 6514 6515 6516 6517 6518 6519 6520 6521 6522 6523 6524 6525 6526 6527 6528 6529 6530 6531 6532 6533 6534 6535 6536 6537 6538 6539 6540 6541 6542 6543 6544 6545 6546 6547 6548 6549 6550 6551 6552 6553 6554 6555 6556 6557 6558 6559 6560 6561 6562 6563 
// SPDX-License-Identifier: GPL-2.0
/*
 *  linux/fs/ext4/inode.c
 *
 * Copyright (C) 1992, 1993, 1994, 1995
 * Remy Card (card@masi.ibp.fr)
 * Laboratoire MASI - Institut Blaise Pascal
 * Universite Pierre et Marie Curie (Paris VI)
 *
 *  from
 *
 *  linux/fs/minix/inode.c
 *
 *  Copyright (C) 1991, 1992  Linus Torvalds
 *
 *  64-bit file support on 64-bit platforms by Jakub Jelinek
 *	(jj@sunsite.ms.mff.cuni.cz)
 *
 *  Assorted race fixes, rewrite of ext4_get_block() by Al Viro, 2000
 */

#include <linux/fs.h>
#include <linux/mount.h>
#include <linux/time.h>
#include <linux/highuid.h>
#include <linux/pagemap.h>
#include <linux/dax.h>
#include <linux/quotaops.h>
#include <linux/string.h>
#include <linux/buffer_head.h>
#include <linux/writeback.h>
#include <linux/pagevec.h>
#include <linux/mpage.h>
#include <linux/rmap.h>
#include <linux/namei.h>
#include <linux/uio.h>
#include <linux/bio.h>
#include <linux/workqueue.h>
#include <linux/kernel.h>
#include <linux/printk.h>
#include <linux/slab.h>
#include <linux/bitops.h>
#include <linux/iomap.h>
#include <linux/iversion.h>

#include "ext4_jbd2.h"
#include "xattr.h"
#include "acl.h"
#include "truncate.h"

#include <trace/events/ext4.h>

static void ext4_journalled_zero_new_buffers(handle_t *handle,
					     struct inode *inode,
					     struct folio *folio,
					     unsigned from, unsigned to);

static __u32 ext4_inode_csum(struct inode *inode, struct ext4_inode *raw,
			     struct ext4_inode_info *ei)
{
	__u32 csum;
	__u16 dummy_csum = 0;
	int offset = offsetof(struct ext4_inode, i_checksum_lo);
	unsigned int csum_size = sizeof(dummy_csum);

	csum = ext4_chksum(ei->i_csum_seed, (__u8 *)raw, offset);
	csum = ext4_chksum(csum, (__u8 *)&dummy_csum, csum_size);
	offset += csum_size;
	csum = ext4_chksum(csum, (__u8 *)raw + offset,
			   EXT4_GOOD_OLD_INODE_SIZE - offset);

	if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) {
		offset = offsetof(struct ext4_inode, i_checksum_hi);
		csum = ext4_chksum(csum, (__u8 *)raw + EXT4_GOOD_OLD_INODE_SIZE,
				   offset - EXT4_GOOD_OLD_INODE_SIZE);
		if (EXT4_FITS_IN_INODE(raw, ei, i_checksum_hi)) {
			csum = ext4_chksum(csum, (__u8 *)&dummy_csum,
					   csum_size);
			offset += csum_size;
		}
		csum = ext4_chksum(csum, (__u8 *)raw + offset,
				   EXT4_INODE_SIZE(inode->i_sb) - offset);
	}

	return csum;
}

static int ext4_inode_csum_verify(struct inode *inode, struct ext4_inode *raw,
				  struct ext4_inode_info *ei)
{
	__u32 provided, calculated;

	if (EXT4_SB(inode->i_sb)->s_es->s_creator_os !=
	    cpu_to_le32(EXT4_OS_LINUX) ||
	    !ext4_has_feature_metadata_csum(inode->i_sb))
		return 1;

	provided = le16_to_cpu(raw->i_checksum_lo);
	calculated = ext4_inode_csum(inode, raw, ei);
	if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE &&
EXT4_FITS_IN_INODE(raw, ei, i_checksum_hi)) provided |= ((__u32)le16_to_cpu(raw->i_checksum_hi)) << 16; else calculated &= 0xFFFF; return provided == calculated; } void ext4_inode_csum_set(struct inode *inode, struct ext4_inode *raw, struct ext4_inode_info *ei) { __u32 csum; if (EXT4_SB(inode->i_sb)->s_es->s_creator_os != cpu_to_le32(EXT4_OS_LINUX) || !ext4_has_feature_metadata_csum(inode->i_sb)) return; csum = ext4_inode_csum(inode, raw, ei); raw->i_checksum_lo = cpu_to_le16(csum & 0xFFFF); if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE && EXT4_FITS_IN_INODE(raw, ei, i_checksum_hi)) raw->i_checksum_hi = cpu_to_le16(csum >> 16); } static inline int ext4_begin_ordered_truncate(struct inode *inode, loff_t new_size) { trace_ext4_begin_ordered_truncate(inode, new_size); /* * If jinode is zero, then we never opened the file for * writing, so there's no need to call * jbd2_journal_begin_ordered_truncate() since there's no * outstanding writes we need to flush. */ if (!EXT4_I(inode)->jinode) return 0; return jbd2_journal_begin_ordered_truncate(EXT4_JOURNAL(inode), EXT4_I(inode)->jinode, new_size); } /* * Test whether an inode is a fast symlink. * A fast symlink has its symlink data stored in ext4_inode_info->i_data. */ int ext4_inode_is_fast_symlink(struct inode *inode) { if (!ext4_has_feature_ea_inode(inode->i_sb)) { int ea_blocks = EXT4_I(inode)->i_file_acl ? EXT4_CLUSTER_SIZE(inode->i_sb) >> 9 : 0; if (ext4_has_inline_data(inode)) return 0; return (S_ISLNK(inode->i_mode) && inode->i_blocks - ea_blocks == 0); } return S_ISLNK(inode->i_mode) && inode->i_size && (inode->i_size < EXT4_N_BLOCKS * 4); } /* * Called at the last iput() if i_nlink is zero. */ void ext4_evict_inode(struct inode *inode) { handle_t *handle; int err; /* * Credits for final inode cleanup and freeing: * sb + inode (ext4_orphan_del()), block bitmap, group descriptor * (xattr block freeing), bitmap, group descriptor (inode freeing) */ int extra_credits = 6; struct ext4_xattr_inode_array *ea_inode_array = NULL; bool freeze_protected = false; trace_ext4_evict_inode(inode); dax_break_layout_final(inode); if (EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL) ext4_evict_ea_inode(inode); if (inode->i_nlink) { truncate_inode_pages_final(&inode->i_data); goto no_delete; } if (is_bad_inode(inode)) goto no_delete; dquot_initialize(inode); if (ext4_should_order_data(inode)) ext4_begin_ordered_truncate(inode, 0); truncate_inode_pages_final(&inode->i_data); /* * For inodes with journalled data, transaction commit could have * dirtied the inode. And for inodes with dioread_nolock, unwritten * extents converting worker could merge extents and also have dirtied * the inode. Flush worker is ignoring it because of I_FREEING flag but * we still need to remove the inode from the writeback lists. */ inode_io_list_del(inode); /* * Protect us against freezing - iput() caller didn't have to have any * protection against it. When we are in a running transaction though, * we are already protected against freezing and we cannot grab further * protection due to lock ordering constraints. */ if (!ext4_journal_current_handle()) { sb_start_intwrite(inode->i_sb); freeze_protected = true; } if (!IS_NOQUOTA(inode)) extra_credits += EXT4_MAXQUOTAS_DEL_BLOCKS(inode->i_sb); /* * Block bitmap, group descriptor, and inode are accounted in both * ext4_blocks_for_truncate() and extra_credits. So subtract 3. 
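The checksum helpers above split one 32-bit metadata checksum across the 16-bit i_checksum_lo field and, when the inode is large enough, i_checksum_hi; the verifier masks the computed value to 16 bits when only the low half exists on disk. The following standalone sketch is an editorial illustration (not part of the kernel source) of that split-and-recombine arithmetic; the demo_* names are invented and endianness conversion is omitted.

#include <assert.h>
#include <stdint.h>

/* Illustrative stand-ins for the two 16-bit on-disk checksum fields. */
struct demo_ondisk {
	uint16_t checksum_lo;
	uint16_t checksum_hi;
	int has_hi;		/* whether the inode is big enough for _hi */
};

/* Store a 32-bit checksum, splitting it like ext4_inode_csum_set() does. */
static void demo_csum_store(struct demo_ondisk *d, uint32_t csum)
{
	d->checksum_lo = (uint16_t)(csum & 0xFFFF);
	if (d->has_hi)
		d->checksum_hi = (uint16_t)(csum >> 16);
}

/* Recombine and compare, mirroring ext4_inode_csum_verify(). */
static int demo_csum_verify(const struct demo_ondisk *d, uint32_t calculated)
{
	uint32_t provided = d->checksum_lo;

	if (d->has_hi)
		provided |= (uint32_t)d->checksum_hi << 16;
	else
		calculated &= 0xFFFF;	/* only 16 bits are stored on disk */

	return provided == calculated;
}

int main(void)
{
	struct demo_ondisk d = { .has_hi = 1 };

	demo_csum_store(&d, 0xDEADBEEF);
	assert(demo_csum_verify(&d, 0xDEADBEEF));

	d.has_hi = 0;		/* small inode: only the low half survives */
	demo_csum_store(&d, 0xDEADBEEF);
	assert(demo_csum_verify(&d, 0xDEADBEEF));
	return 0;
}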
*/ handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, ext4_blocks_for_truncate(inode) + extra_credits - 3); if (IS_ERR(handle)) { ext4_std_error(inode->i_sb, PTR_ERR(handle)); /* * If we're going to skip the normal cleanup, we still need to * make sure that the in-core orphan linked list is properly * cleaned up. */ ext4_orphan_del(NULL, inode); if (freeze_protected) sb_end_intwrite(inode->i_sb); goto no_delete; } if (IS_SYNC(inode)) ext4_handle_sync(handle); /* * Set inode->i_size to 0 before calling ext4_truncate(). We need * special handling of symlinks here because i_size is used to * determine whether ext4_inode_info->i_data contains symlink data or * block mappings. Setting i_size to 0 will remove its fast symlink * status. Erase i_data so that it becomes a valid empty block map. */ if (ext4_inode_is_fast_symlink(inode)) memset(EXT4_I(inode)->i_data, 0, sizeof(EXT4_I(inode)->i_data)); inode->i_size = 0; err = ext4_mark_inode_dirty(handle, inode); if (err) { ext4_warning(inode->i_sb, "couldn't mark inode dirty (err %d)", err); goto stop_handle; } if (inode->i_blocks) { err = ext4_truncate(inode); if (err) { ext4_error_err(inode->i_sb, -err, "couldn't truncate inode %lu (err %d)", inode->i_ino, err); goto stop_handle; } } /* Remove xattr references. */ err = ext4_xattr_delete_inode(handle, inode, &ea_inode_array, extra_credits); if (err) { ext4_warning(inode->i_sb, "xattr delete (err %d)", err); stop_handle: ext4_journal_stop(handle); ext4_orphan_del(NULL, inode); if (freeze_protected) sb_end_intwrite(inode->i_sb); ext4_xattr_inode_array_free(ea_inode_array); goto no_delete; } /* * Kill off the orphan record which ext4_truncate created. * AKPM: I think this can be inside the above `if'. * Note that ext4_orphan_del() has to be able to cope with the * deletion of a non-existent orphan - this is because we don't * know if ext4_truncate() actually created an orphan record. * (Well, we could do this if we need to, but heck - it works) */ ext4_orphan_del(handle, inode); EXT4_I(inode)->i_dtime = (__u32)ktime_get_real_seconds(); /* * One subtle ordering requirement: if anything has gone wrong * (transaction abort, IO errors, whatever), then we can still * do these next steps (the fs will already have been marked as * having errors), but we can't free the inode if the mark_dirty * fails. */ if (ext4_mark_inode_dirty(handle, inode)) /* If that failed, just do the required in-core inode clear. */ ext4_clear_inode(inode); else ext4_free_inode(handle, inode); ext4_journal_stop(handle); if (freeze_protected) sb_end_intwrite(inode->i_sb); ext4_xattr_inode_array_free(ea_inode_array); return; no_delete: /* * Check out some where else accidentally dirty the evicting inode, * which may probably cause inode use-after-free issues later. */ WARN_ON_ONCE(!list_empty_careful(&inode->i_io_list)); if (!list_empty(&EXT4_I(inode)->i_fc_list)) ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_NOMEM, NULL); ext4_clear_inode(inode); /* We must guarantee clearing of inode... */ } #ifdef CONFIG_QUOTA qsize_t *ext4_get_reserved_space(struct inode *inode) { return &EXT4_I(inode)->i_reserved_quota; } #endif /* * Called with i_data_sem down, which is important since we can call * ext4_discard_preallocations() from here. 
*/ void ext4_da_update_reserve_space(struct inode *inode, int used, int quota_claim) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); struct ext4_inode_info *ei = EXT4_I(inode); spin_lock(&ei->i_block_reservation_lock); trace_ext4_da_update_reserve_space(inode, used, quota_claim); if (unlikely(used > ei->i_reserved_data_blocks)) { ext4_warning(inode->i_sb, "%s: ino %lu, used %d " "with only %d reserved data blocks", __func__, inode->i_ino, used, ei->i_reserved_data_blocks); WARN_ON(1); used = ei->i_reserved_data_blocks; } /* Update per-inode reservations */ ei->i_reserved_data_blocks -= used; percpu_counter_sub(&sbi->s_dirtyclusters_counter, used); spin_unlock(&ei->i_block_reservation_lock); /* Update quota subsystem for data blocks */ if (quota_claim) dquot_claim_block(inode, EXT4_C2B(sbi, used)); else { /* * We did fallocate with an offset that is already delayed * allocated. So on delayed allocated writeback we should * not re-claim the quota for fallocated blocks. */ dquot_release_reservation_block(inode, EXT4_C2B(sbi, used)); } /* * If we have done all the pending block allocations and if * there aren't any writers on the inode, we can discard the * inode's preallocations. */ if ((ei->i_reserved_data_blocks == 0) && !inode_is_open_for_write(inode)) ext4_discard_preallocations(inode); } static int __check_block_validity(struct inode *inode, const char *func, unsigned int line, struct ext4_map_blocks *map) { journal_t *journal = EXT4_SB(inode->i_sb)->s_journal; if (journal && inode == journal->j_inode) return 0; if (!ext4_inode_block_valid(inode, map->m_pblk, map->m_len)) { ext4_error_inode(inode, func, line, map->m_pblk, "lblock %lu mapped to illegal pblock %llu " "(length %d)", (unsigned long) map->m_lblk, map->m_pblk, map->m_len); return -EFSCORRUPTED; } return 0; } int ext4_issue_zeroout(struct inode *inode, ext4_lblk_t lblk, ext4_fsblk_t pblk, ext4_lblk_t len) { int ret; if (IS_ENCRYPTED(inode) && S_ISREG(inode->i_mode)) return fscrypt_zeroout_range(inode, lblk, pblk, len); ret = sb_issue_zeroout(inode->i_sb, pblk, len, GFP_NOFS); if (ret > 0) ret = 0; return ret; } /* * For generic regular files, when updating the extent tree, Ext4 should * hold the i_rwsem and invalidate_lock exclusively. This ensures * exclusion against concurrent page faults, as well as reads and writes. */ #ifdef CONFIG_EXT4_DEBUG void ext4_check_map_extents_env(struct inode *inode) { if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY) return; if (!S_ISREG(inode->i_mode) || IS_NOQUOTA(inode) || IS_VERITY(inode) || is_special_ino(inode->i_sb, inode->i_ino) || (inode_state_read_once(inode) & (I_FREEING | I_WILL_FREE | I_NEW)) || ext4_test_inode_flag(inode, EXT4_INODE_EA_INODE) || ext4_verity_in_progress(inode)) return; WARN_ON_ONCE(!inode_is_locked(inode) && !rwsem_is_locked(&inode->i_mapping->invalidate_lock)); } #else void ext4_check_map_extents_env(struct inode *inode) {} #endif #define check_block_validity(inode, map) \ __check_block_validity((inode), __func__, __LINE__, (map)) #ifdef ES_AGGRESSIVE_TEST static void ext4_map_blocks_es_recheck(handle_t *handle, struct inode *inode, struct ext4_map_blocks *es_map, struct ext4_map_blocks *map, int flags) { int retval; map->m_flags = 0; /* * There is a race window that the result is not the same. * e.g. xfstests #223 when dioread_nolock enables. The reason * is that we lookup a block mapping in extent status tree with * out taking i_data_sem. So at the time the unwritten extent * could be converted. 
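ext4_da_update_reserve_space() above guards its delayed-allocation accounting: if a caller tries to consume more blocks than were actually reserved, it warns and clamps the count so the per-inode reservation counter never underflows before being decremented. A minimal standalone sketch of that clamp-then-subtract pattern follows; it is an editorial illustration with invented demo_* names, not kernel code.

#include <stdio.h>

struct demo_resv {
	unsigned int reserved;	/* like ei->i_reserved_data_blocks */
};

/* Consume `used` blocks from the reservation, clamping like ext4 does. */
static unsigned int demo_claim(struct demo_resv *r, unsigned int used)
{
	if (used > r->reserved) {
		fprintf(stderr, "claiming %u with only %u reserved\n",
			used, r->reserved);
		used = r->reserved;	/* never let the counter underflow */
	}
	r->reserved -= used;
	return used;
}

int main(void)
{
	struct demo_resv r = { .reserved = 8 };
	unsigned int got;

	got = demo_claim(&r, 5);
	printf("claimed %u, left %u\n", got, r.reserved);	/* 5 claimed, 3 left */
	got = demo_claim(&r, 10);	/* asks for more than remains */
	printf("claimed %u, left %u\n", got, r.reserved);	/* clamped to 3, 0 left */
	return 0;
}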
*/ down_read(&EXT4_I(inode)->i_data_sem); if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { retval = ext4_ext_map_blocks(handle, inode, map, 0); } else { retval = ext4_ind_map_blocks(handle, inode, map, 0); } up_read((&EXT4_I(inode)->i_data_sem)); /* * We don't check m_len because extent will be collpased in status * tree. So the m_len might not equal. */ if (es_map->m_lblk != map->m_lblk || es_map->m_flags != map->m_flags || es_map->m_pblk != map->m_pblk) { printk("ES cache assertion failed for inode: %lu " "es_cached ex [%d/%d/%llu/%x] != " "found ex [%d/%d/%llu/%x] retval %d flags %x\n", inode->i_ino, es_map->m_lblk, es_map->m_len, es_map->m_pblk, es_map->m_flags, map->m_lblk, map->m_len, map->m_pblk, map->m_flags, retval, flags); } } #endif /* ES_AGGRESSIVE_TEST */ static int ext4_map_query_blocks_next_in_leaf(handle_t *handle, struct inode *inode, struct ext4_map_blocks *map, unsigned int orig_mlen) { struct ext4_map_blocks map2; unsigned int status, status2; int retval; status = map->m_flags & EXT4_MAP_UNWRITTEN ? EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN; WARN_ON_ONCE(!(map->m_flags & EXT4_MAP_QUERY_LAST_IN_LEAF)); WARN_ON_ONCE(orig_mlen <= map->m_len); /* Prepare map2 for lookup in next leaf block */ map2.m_lblk = map->m_lblk + map->m_len; map2.m_len = orig_mlen - map->m_len; map2.m_flags = 0; retval = ext4_ext_map_blocks(handle, inode, &map2, 0); if (retval <= 0) { ext4_es_insert_extent(inode, map->m_lblk, map->m_len, map->m_pblk, status, false); return map->m_len; } if (unlikely(retval != map2.m_len)) { ext4_warning(inode->i_sb, "ES len assertion failed for inode " "%lu: retval %d != map->m_len %d", inode->i_ino, retval, map2.m_len); WARN_ON(1); } status2 = map2.m_flags & EXT4_MAP_UNWRITTEN ? EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN; /* * If map2 is contiguous with map, then let's insert it as a single * extent in es cache and return the combined length of both the maps. */ if (map->m_pblk + map->m_len == map2.m_pblk && status == status2) { ext4_es_insert_extent(inode, map->m_lblk, map->m_len + map2.m_len, map->m_pblk, status, false); map->m_len += map2.m_len; } else { ext4_es_insert_extent(inode, map->m_lblk, map->m_len, map->m_pblk, status, false); } return map->m_len; } static int ext4_map_query_blocks(handle_t *handle, struct inode *inode, struct ext4_map_blocks *map, int flags) { unsigned int status; int retval; unsigned int orig_mlen = map->m_len; flags &= EXT4_EX_QUERY_FILTER; if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) retval = ext4_ext_map_blocks(handle, inode, map, flags); else retval = ext4_ind_map_blocks(handle, inode, map, flags); if (retval < 0) return retval; /* A hole? */ if (retval == 0) goto out; if (unlikely(retval != map->m_len)) { ext4_warning(inode->i_sb, "ES len assertion failed for inode " "%lu: retval %d != map->m_len %d", inode->i_ino, retval, map->m_len); WARN_ON(1); } /* * No need to query next in leaf: * - if returned extent is not last in leaf or * - if the last in leaf is the full requested range */ if (!(map->m_flags & EXT4_MAP_QUERY_LAST_IN_LEAF) || map->m_len == orig_mlen) { status = map->m_flags & EXT4_MAP_UNWRITTEN ? 
EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN; ext4_es_insert_extent(inode, map->m_lblk, map->m_len, map->m_pblk, status, false); } else { retval = ext4_map_query_blocks_next_in_leaf(handle, inode, map, orig_mlen); } out: map->m_seq = READ_ONCE(EXT4_I(inode)->i_es_seq); return retval; } static int ext4_map_create_blocks(handle_t *handle, struct inode *inode, struct ext4_map_blocks *map, int flags) { struct extent_status es; unsigned int status; int err, retval = 0; /* * We pass in the magic EXT4_GET_BLOCKS_DELALLOC_RESERVE * indicates that the blocks and quotas has already been * checked when the data was copied into the page cache. */ if (map->m_flags & EXT4_MAP_DELAYED) flags |= EXT4_GET_BLOCKS_DELALLOC_RESERVE; /* * Here we clear m_flags because after allocating an new extent, * it will be set again. */ map->m_flags &= ~EXT4_MAP_FLAGS; /* * We need to check for EXT4 here because migrate could have * changed the inode type in between. */ if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { retval = ext4_ext_map_blocks(handle, inode, map, flags); } else { retval = ext4_ind_map_blocks(handle, inode, map, flags); /* * We allocated new blocks which will result in i_data's * format changing. Force the migrate to fail by clearing * migrate flags. */ if (retval > 0 && map->m_flags & EXT4_MAP_NEW) ext4_clear_inode_state(inode, EXT4_STATE_EXT_MIGRATE); } if (retval <= 0) return retval; if (unlikely(retval != map->m_len)) { ext4_warning(inode->i_sb, "ES len assertion failed for inode %lu: " "retval %d != map->m_len %d", inode->i_ino, retval, map->m_len); WARN_ON(1); } /* * We have to zeroout blocks before inserting them into extent * status tree. Otherwise someone could look them up there and * use them before they are really zeroed. We also have to * unmap metadata before zeroing as otherwise writeback can * overwrite zeros with stale data from block device. */ if (flags & EXT4_GET_BLOCKS_ZERO && map->m_flags & EXT4_MAP_MAPPED && map->m_flags & EXT4_MAP_NEW) { err = ext4_issue_zeroout(inode, map->m_lblk, map->m_pblk, map->m_len); if (err) return err; } /* * If the extent has been zeroed out, we don't need to update * extent status tree. */ if (flags & EXT4_GET_BLOCKS_SPLIT_NOMERGE && ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es, &map->m_seq)) { if (ext4_es_is_written(&es)) return retval; } status = map->m_flags & EXT4_MAP_UNWRITTEN ? EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN; ext4_es_insert_extent(inode, map->m_lblk, map->m_len, map->m_pblk, status, flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE); map->m_seq = READ_ONCE(EXT4_I(inode)->i_es_seq); return retval; } /* * The ext4_map_blocks() function tries to look up the requested blocks, * and returns if the blocks are already mapped. * * Otherwise it takes the write lock of the i_data_sem and allocate blocks * and store the allocated blocks in the result buffer head and mark it * mapped. * * If file type is extents based, it will call ext4_ext_map_blocks(), * Otherwise, call with ext4_ind_map_blocks() to handle indirect mapping * based files * * On success, it returns the number of blocks being mapped or allocated. * If flags doesn't contain EXT4_GET_BLOCKS_CREATE the blocks are * pre-allocated and unwritten, the resulting @map is marked as unwritten. * If the flags contain EXT4_GET_BLOCKS_CREATE, it will mark @map as mapped. * * It returns 0 if plain look up failed (blocks have not been allocated), in * that case, @map is returned as unmapped but we still do fill map->m_len to * indicate the length of a hole starting at map->m_lblk. 
* * It returns the error in case of allocation failure. */ int ext4_map_blocks(handle_t *handle, struct inode *inode, struct ext4_map_blocks *map, int flags) { struct extent_status es; int retval; int ret = 0; unsigned int orig_mlen = map->m_len; #ifdef ES_AGGRESSIVE_TEST struct ext4_map_blocks orig_map; memcpy(&orig_map, map, sizeof(*map)); #endif map->m_flags = 0; ext_debug(inode, "flag 0x%x, max_blocks %u, logical block %lu\n", flags, map->m_len, (unsigned long) map->m_lblk); /* * ext4_map_blocks returns an int, and m_len is an unsigned int */ if (unlikely(map->m_len > INT_MAX)) map->m_len = INT_MAX; /* We can handle the block number less than EXT_MAX_BLOCKS */ if (unlikely(map->m_lblk >= EXT_MAX_BLOCKS)) return -EFSCORRUPTED; /* * Callers from the context of data submission are the only exceptions * for regular files that do not hold the i_rwsem or invalidate_lock. * However, caching unrelated ranges is not permitted. */ if (flags & EXT4_GET_BLOCKS_IO_SUBMIT) WARN_ON_ONCE(!(flags & EXT4_EX_NOCACHE)); else ext4_check_map_extents_env(inode); /* Lookup extent status tree firstly */ if (ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es, &map->m_seq)) { if (ext4_es_is_written(&es) || ext4_es_is_unwritten(&es)) { map->m_pblk = ext4_es_pblock(&es) + map->m_lblk - es.es_lblk; map->m_flags |= ext4_es_is_written(&es) ? EXT4_MAP_MAPPED : EXT4_MAP_UNWRITTEN; retval = es.es_len - (map->m_lblk - es.es_lblk); if (retval > map->m_len) retval = map->m_len; map->m_len = retval; } else if (ext4_es_is_delayed(&es) || ext4_es_is_hole(&es)) { map->m_pblk = 0; map->m_flags |= ext4_es_is_delayed(&es) ? EXT4_MAP_DELAYED : 0; retval = es.es_len - (map->m_lblk - es.es_lblk); if (retval > map->m_len) retval = map->m_len; map->m_len = retval; retval = 0; } else { BUG(); } if (flags & EXT4_GET_BLOCKS_CACHED_NOWAIT) return retval; #ifdef ES_AGGRESSIVE_TEST ext4_map_blocks_es_recheck(handle, inode, map, &orig_map, flags); #endif if (!(flags & EXT4_GET_BLOCKS_QUERY_LAST_IN_LEAF) || orig_mlen == map->m_len) goto found; map->m_len = orig_mlen; } /* * In the query cache no-wait mode, nothing we can do more if we * cannot find extent in the cache. */ if (flags & EXT4_GET_BLOCKS_CACHED_NOWAIT) return 0; /* * Try to see if we can get the block without requesting a new * file system block. */ down_read(&EXT4_I(inode)->i_data_sem); retval = ext4_map_query_blocks(handle, inode, map, flags); up_read((&EXT4_I(inode)->i_data_sem)); found: if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) { ret = check_block_validity(inode, map); if (ret != 0) return ret; } /* If it is only a block(s) look up */ if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) return retval; /* * Returns if the blocks have already allocated * * Note that if blocks have been preallocated * ext4_ext_map_blocks() returns with buffer head unmapped */ if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) /* * If we need to convert extent to unwritten * we continue and do the actual work in * ext4_ext_map_blocks() */ if (!(flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN)) return retval; ext4_fc_track_inode(handle, inode); /* * New blocks allocate and/or writing to unwritten extent * will possibly result in updating i_data, so we take * the write lock of i_data_sem, and call get_block() * with create == 1 flag. 
*/ down_write(&EXT4_I(inode)->i_data_sem); retval = ext4_map_create_blocks(handle, inode, map, flags); up_write((&EXT4_I(inode)->i_data_sem)); if (retval < 0) ext_debug(inode, "failed with err %d\n", retval); if (retval <= 0) return retval; if (map->m_flags & EXT4_MAP_MAPPED) { ret = check_block_validity(inode, map); if (ret != 0) return ret; /* * Inodes with freshly allocated blocks where contents will be * visible after transaction commit must be on transaction's * ordered data list. */ if (map->m_flags & EXT4_MAP_NEW && !(map->m_flags & EXT4_MAP_UNWRITTEN) && !(flags & EXT4_GET_BLOCKS_ZERO) && !ext4_is_quota_file(inode) && ext4_should_order_data(inode)) { loff_t start_byte = EXT4_LBLK_TO_B(inode, map->m_lblk); loff_t length = EXT4_LBLK_TO_B(inode, map->m_len); if (flags & EXT4_GET_BLOCKS_IO_SUBMIT) ret = ext4_jbd2_inode_add_wait(handle, inode, start_byte, length); else ret = ext4_jbd2_inode_add_write(handle, inode, start_byte, length); if (ret) return ret; } } ext4_fc_track_range(handle, inode, map->m_lblk, map->m_lblk + map->m_len - 1); return retval; } /* * Update EXT4_MAP_FLAGS in bh->b_state. For buffer heads attached to pages * we have to be careful as someone else may be manipulating b_state as well. */ static void ext4_update_bh_state(struct buffer_head *bh, unsigned long flags) { unsigned long old_state; unsigned long new_state; flags &= EXT4_MAP_FLAGS; /* Dummy buffer_head? Set non-atomically. */ if (!bh->b_folio) { bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | flags; return; } /* * Someone else may be modifying b_state. Be careful! This is ugly but * once we get rid of using bh as a container for mapping information * to pass to / from get_block functions, this can go away. */ old_state = READ_ONCE(bh->b_state); do { new_state = (old_state & ~EXT4_MAP_FLAGS) | flags; } while (unlikely(!try_cmpxchg(&bh->b_state, &old_state, new_state))); } /* * Make sure that the current journal transaction has enough credits to map * one extent. Return -EAGAIN if it cannot extend the current running * transaction. */ static inline int ext4_journal_ensure_extent_credits(handle_t *handle, struct inode *inode) { int credits; int ret; /* Called from ext4_da_write_begin() which has no handle started? */ if (!handle) return 0; credits = ext4_chunk_trans_blocks(inode, 1); ret = __ext4_journal_ensure_credits(handle, credits, credits, 0); return ret <= 0 ? ret : -EAGAIN; } static int _ext4_get_block(struct inode *inode, sector_t iblock, struct buffer_head *bh, int flags) { struct ext4_map_blocks map; int ret = 0; if (ext4_has_inline_data(inode)) return -ERANGE; map.m_lblk = iblock; map.m_len = bh->b_size >> inode->i_blkbits; ret = ext4_map_blocks(ext4_journal_current_handle(), inode, &map, flags); if (ret > 0) { map_bh(bh, inode->i_sb, map.m_pblk); ext4_update_bh_state(bh, map.m_flags); bh->b_size = inode->i_sb->s_blocksize * map.m_len; ret = 0; } else if (ret == 0) { /* hole case, need to fill in bh->b_size */ bh->b_size = inode->i_sb->s_blocksize * map.m_len; } return ret; } int ext4_get_block(struct inode *inode, sector_t iblock, struct buffer_head *bh, int create) { return _ext4_get_block(inode, iblock, bh, create ? EXT4_GET_BLOCKS_CREATE : 0); } /* * Get block function used when preparing for buffered write if we require * creating an unwritten extent if blocks haven't been allocated. The extent * will be converted to written after the IO is complete. 
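ext4_update_bh_state() above rewrites only the EXT4_MAP_FLAGS bits of bh->b_state while other contexts may concurrently flip unrelated bits, so it loops on READ_ONCE() plus try_cmpxchg() instead of taking a lock. The sketch below is a user-space analogue of the same read/recompute/compare-exchange retry pattern, using C11 atomics in place of the kernel primitives; all names are illustrative.

#include <stdatomic.h>
#include <stdio.h>

#define DEMO_MAP_FLAGS 0x0ffUL	/* pretend these are the "mapping" bits */

/*
 * Replace only the DEMO_MAP_FLAGS bits of *state with new_flags, retrying
 * if another thread changed *state between our read and our update --
 * the same shape as the try_cmpxchg() loop in ext4_update_bh_state().
 */
static void demo_update_flags(_Atomic unsigned long *state,
			      unsigned long new_flags)
{
	unsigned long old_state, new_state;

	new_flags &= DEMO_MAP_FLAGS;
	old_state = atomic_load(state);
	do {
		new_state = (old_state & ~DEMO_MAP_FLAGS) | new_flags;
		/* on failure, old_state is refreshed with the current value */
	} while (!atomic_compare_exchange_weak(state, &old_state, new_state));
}

int main(void)
{
	_Atomic unsigned long state = 0xabcd00UL;

	demo_update_flags(&state, 0x5a);
	printf("state = %#lx\n", atomic_load(&state));	/* prints 0xabcd5a */
	return 0;
}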
*/ int ext4_get_block_unwritten(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create) { int ret = 0; ext4_debug("ext4_get_block_unwritten: inode %lu, create flag %d\n", inode->i_ino, create); ret = _ext4_get_block(inode, iblock, bh_result, EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT); /* * If the buffer is marked unwritten, mark it as new to make sure it is * zeroed out correctly in case of partial writes. Otherwise, there is * a chance of stale data getting exposed. */ if (ret == 0 && buffer_unwritten(bh_result)) set_buffer_new(bh_result); return ret; } /* Maximum number of blocks we map for direct IO at once. */ #define DIO_MAX_BLOCKS 4096 /* * `handle' can be NULL if create is zero */ struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode, ext4_lblk_t block, int map_flags) { struct ext4_map_blocks map; struct buffer_head *bh; int create = map_flags & EXT4_GET_BLOCKS_CREATE; bool nowait = map_flags & EXT4_GET_BLOCKS_CACHED_NOWAIT; int err; ASSERT((EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY) || handle != NULL || create == 0); ASSERT(create == 0 || !nowait); map.m_lblk = block; map.m_len = 1; err = ext4_map_blocks(handle, inode, &map, map_flags); if (err == 0) return create ? ERR_PTR(-ENOSPC) : NULL; if (err < 0) return ERR_PTR(err); if (nowait) return sb_find_get_block(inode->i_sb, map.m_pblk); /* * Since bh could introduce extra ref count such as referred by * journal_head etc. Try to avoid using __GFP_MOVABLE here * as it may fail the migration when journal_head remains. */ bh = getblk_unmovable(inode->i_sb->s_bdev, map.m_pblk, inode->i_sb->s_blocksize); if (unlikely(!bh)) return ERR_PTR(-ENOMEM); if (map.m_flags & EXT4_MAP_NEW) { ASSERT(create != 0); ASSERT((EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY) || (handle != NULL)); /* * Now that we do not always journal data, we should * keep in mind whether this should always journal the * new buffer as metadata. For now, regular file * writes use ext4_get_block instead, so it's not a * problem. */ lock_buffer(bh); BUFFER_TRACE(bh, "call get_create_access"); err = ext4_journal_get_create_access(handle, inode->i_sb, bh, EXT4_JTR_NONE); if (unlikely(err)) { unlock_buffer(bh); goto errout; } if (!buffer_uptodate(bh)) { memset(bh->b_data, 0, inode->i_sb->s_blocksize); set_buffer_uptodate(bh); } unlock_buffer(bh); BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); err = ext4_handle_dirty_metadata(handle, inode, bh); if (unlikely(err)) goto errout; } else BUFFER_TRACE(bh, "not a new buffer"); return bh; errout: brelse(bh); return ERR_PTR(err); } struct buffer_head *ext4_bread(handle_t *handle, struct inode *inode, ext4_lblk_t block, int map_flags) { struct buffer_head *bh; int ret; bh = ext4_getblk(handle, inode, block, map_flags); if (IS_ERR(bh)) return bh; if (!bh || ext4_buffer_uptodate(bh)) return bh; ret = ext4_read_bh_lock(bh, REQ_META | REQ_PRIO, true); if (ret) { put_bh(bh); return ERR_PTR(ret); } return bh; } /* Read a contiguous batch of blocks. */ int ext4_bread_batch(struct inode *inode, ext4_lblk_t block, int bh_count, bool wait, struct buffer_head **bhs) { int i, err; for (i = 0; i < bh_count; i++) { bhs[i] = ext4_getblk(NULL, inode, block + i, 0 /* map_flags */); if (IS_ERR(bhs[i])) { err = PTR_ERR(bhs[i]); bh_count = i; goto out_brelse; } } for (i = 0; i < bh_count; i++) /* Note that NULL bhs[i] is valid because of holes. 
*/ if (bhs[i] && !ext4_buffer_uptodate(bhs[i])) ext4_read_bh_lock(bhs[i], REQ_META | REQ_PRIO, false); if (!wait) return 0; for (i = 0; i < bh_count; i++) if (bhs[i]) wait_on_buffer(bhs[i]); for (i = 0; i < bh_count; i++) { if (bhs[i] && !buffer_uptodate(bhs[i])) { err = -EIO; goto out_brelse; } } return 0; out_brelse: for (i = 0; i < bh_count; i++) { brelse(bhs[i]); bhs[i] = NULL; } return err; } int ext4_walk_page_buffers(handle_t *handle, struct inode *inode, struct buffer_head *head, unsigned from, unsigned to, int *partial, int (*fn)(handle_t *handle, struct inode *inode, struct buffer_head *bh)) { struct buffer_head *bh; unsigned block_start, block_end; unsigned blocksize = head->b_size; int err, ret = 0; struct buffer_head *next; for (bh = head, block_start = 0; ret == 0 && (bh != head || !block_start); block_start = block_end, bh = next) { next = bh->b_this_page; block_end = block_start + blocksize; if (block_end <= from || block_start >= to) { if (partial && !buffer_uptodate(bh)) *partial = 1; continue; } err = (*fn)(handle, inode, bh); if (!ret) ret = err; } return ret; } /* * Helper for handling dirtying of journalled data. We also mark the folio as * dirty so that writeback code knows about this page (and inode) contains * dirty data. ext4_writepages() then commits appropriate transaction to * make data stable. */ static int ext4_dirty_journalled_data(handle_t *handle, struct buffer_head *bh) { struct folio *folio = bh->b_folio; struct inode *inode = folio->mapping->host; /* only regular files have a_ops */ if (S_ISREG(inode->i_mode)) folio_mark_dirty(folio); return ext4_handle_dirty_metadata(handle, NULL, bh); } int do_journal_get_write_access(handle_t *handle, struct inode *inode, struct buffer_head *bh) { if (!buffer_mapped(bh) || buffer_freed(bh)) return 0; BUFFER_TRACE(bh, "get write access"); return ext4_journal_get_write_access(handle, inode->i_sb, bh, EXT4_JTR_NONE); } int ext4_block_write_begin(handle_t *handle, struct folio *folio, loff_t pos, unsigned len, get_block_t *get_block) { unsigned int from = offset_in_folio(folio, pos); unsigned to = from + len; struct inode *inode = folio->mapping->host; unsigned block_start, block_end; sector_t block; int err = 0; unsigned int blocksize = i_blocksize(inode); struct buffer_head *bh, *head, *wait[2]; int nr_wait = 0; int i; bool should_journal_data = ext4_should_journal_data(inode); BUG_ON(!folio_test_locked(folio)); BUG_ON(to > folio_size(folio)); BUG_ON(from > to); WARN_ON_ONCE(blocksize > folio_size(folio)); head = folio_buffers(folio); if (!head) head = create_empty_buffers(folio, blocksize, 0); block = EXT4_PG_TO_LBLK(inode, folio->index); for (bh = head, block_start = 0; bh != head || !block_start; block++, block_start = block_end, bh = bh->b_this_page) { block_end = block_start + blocksize; if (block_end <= from || block_start >= to) { if (folio_test_uptodate(folio)) { set_buffer_uptodate(bh); } continue; } if (WARN_ON_ONCE(buffer_new(bh))) clear_buffer_new(bh); if (!buffer_mapped(bh)) { WARN_ON(bh->b_size != blocksize); err = ext4_journal_ensure_extent_credits(handle, inode); if (!err) err = get_block(inode, block, bh, 1); if (err) break; if (buffer_new(bh)) { /* * We may be zeroing partial buffers or all new * buffers in case of failure. Prepare JBD2 for * that. */ if (should_journal_data) do_journal_get_write_access(handle, inode, bh); if (folio_test_uptodate(folio)) { /* * Unlike __block_write_begin() we leave * dirtying of new uptodate buffers to * ->write_end() time or * folio_zero_new_buffers(). 
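ext4_walk_page_buffers() above visits the buffer_heads attached to a folio; they form a circular singly linked list via b_this_page, so the traversal stops once it returns to the head after at least one step, and the callback is only applied to buffers overlapping [from, to). The standalone sketch below is an editorial illustration of that traversal shape with invented types; an equivalent do/while handles the wrap-around test.

#include <stdio.h>

/* A toy stand-in for a buffer_head ring attached to one folio. */
struct demo_buf {
	struct demo_buf *next;	/* circular, like bh->b_this_page */
	unsigned size;		/* like bh->b_size */
};

/* Visit buffers overlapping [from, to), like ext4_walk_page_buffers(). */
static void demo_walk(struct demo_buf *head, unsigned from, unsigned to)
{
	struct demo_buf *buf = head;
	unsigned block_start = 0, block_end;

	do {
		block_end = block_start + buf->size;
		if (block_end > from && block_start < to)
			printf("visit buffer [%u, %u)\n", block_start, block_end);
		block_start = block_end;
		buf = buf->next;
	} while (buf != head);
}

int main(void)
{
	/* Four 1024-byte buffers covering a 4096-byte folio. */
	struct demo_buf b[4];

	for (int i = 0; i < 4; i++) {
		b[i].size = 1024;
		b[i].next = &b[(i + 1) % 4];
	}
	demo_walk(&b[0], 1500, 3000);	/* touches buffers 1 and 2 */
	return 0;
}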
*/ set_buffer_uptodate(bh); continue; } if (block_end > to || block_start < from) folio_zero_segments(folio, to, block_end, block_start, from); continue; } } if (folio_test_uptodate(folio)) { set_buffer_uptodate(bh); continue; } if (!buffer_uptodate(bh) && !buffer_delay(bh) && !buffer_unwritten(bh) && (block_start < from || block_end > to)) { ext4_read_bh_lock(bh, 0, false); wait[nr_wait++] = bh; } } /* * If we issued read requests, let them complete. */ for (i = 0; i < nr_wait; i++) { wait_on_buffer(wait[i]); if (!buffer_uptodate(wait[i])) err = -EIO; } if (unlikely(err)) { if (should_journal_data) ext4_journalled_zero_new_buffers(handle, inode, folio, from, to); else folio_zero_new_buffers(folio, from, to); } else if (fscrypt_inode_uses_fs_layer_crypto(inode)) { for (i = 0; i < nr_wait; i++) { int err2; err2 = fscrypt_decrypt_pagecache_blocks(folio, blocksize, bh_offset(wait[i])); if (err2) { clear_buffer_uptodate(wait[i]); err = err2; } } } return err; } /* * To preserve ordering, it is essential that the hole instantiation and * the data write be encapsulated in a single transaction. We cannot * close off a transaction and start a new one between the ext4_get_block() * and the ext4_write_end(). So doing the jbd2_journal_start at the start of * ext4_write_begin() is the right place. */ static int ext4_write_begin(const struct kiocb *iocb, struct address_space *mapping, loff_t pos, unsigned len, struct folio **foliop, void **fsdata) { struct inode *inode = mapping->host; int ret, needed_blocks; handle_t *handle; int retries = 0; struct folio *folio; pgoff_t index; unsigned from, to; ret = ext4_emergency_state(inode->i_sb); if (unlikely(ret)) return ret; trace_ext4_write_begin(inode, pos, len); /* * Reserve one block more for addition to orphan list in case * we allocate blocks but write fails for some reason */ needed_blocks = ext4_chunk_trans_extent(inode, ext4_journal_blocks_per_folio(inode)) + 1; index = pos >> PAGE_SHIFT; if (ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) { ret = ext4_try_to_write_inline_data(mapping, inode, pos, len, foliop); if (ret < 0) return ret; if (ret == 1) return 0; } /* * write_begin_get_folio() can take a long time if the * system is thrashing due to memory pressure, or if the folio * is being written back. So grab it first before we start * the transaction handle. This also allows us to allocate * the folio (if needed) without using GFP_NOFS. */ retry_grab: folio = write_begin_get_folio(iocb, mapping, index, len); if (IS_ERR(folio)) return PTR_ERR(folio); if (len > folio_next_pos(folio) - pos) len = folio_next_pos(folio) - pos; from = offset_in_folio(folio, pos); to = from + len; /* * The same as page allocation, we prealloc buffer heads before * starting the handle. 
*/ if (!folio_buffers(folio)) create_empty_buffers(folio, inode->i_sb->s_blocksize, 0); folio_unlock(folio); retry_journal: handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, needed_blocks); if (IS_ERR(handle)) { folio_put(folio); return PTR_ERR(handle); } folio_lock(folio); if (folio->mapping != mapping) { /* The folio got truncated from under us */ folio_unlock(folio); folio_put(folio); ext4_journal_stop(handle); goto retry_grab; } /* In case writeback began while the folio was unlocked */ folio_wait_stable(folio); if (ext4_should_dioread_nolock(inode)) ret = ext4_block_write_begin(handle, folio, pos, len, ext4_get_block_unwritten); else ret = ext4_block_write_begin(handle, folio, pos, len, ext4_get_block); if (!ret && ext4_should_journal_data(inode)) { ret = ext4_walk_page_buffers(handle, inode, folio_buffers(folio), from, to, NULL, do_journal_get_write_access); } if (ret) { bool extended = (pos + len > inode->i_size) && !ext4_verity_in_progress(inode); folio_unlock(folio); /* * ext4_block_write_begin may have instantiated a few blocks * outside i_size. Trim these off again. Don't need * i_size_read because we hold i_rwsem. * * Add inode to orphan list in case we crash before * truncate finishes */ if (extended && ext4_can_truncate(inode)) ext4_orphan_add(handle, inode); ext4_journal_stop(handle); if (extended) { ext4_truncate_failed_write(inode); /* * If truncate failed early the inode might * still be on the orphan list; we need to * make sure the inode is removed from the * orphan list in that case. */ if (inode->i_nlink) ext4_orphan_del(NULL, inode); } if (ret == -EAGAIN || (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))) goto retry_journal; folio_put(folio); return ret; } *foliop = folio; return ret; } /* For write_end() in data=journal mode */ static int write_end_fn(handle_t *handle, struct inode *inode, struct buffer_head *bh) { int ret; if (!buffer_mapped(bh) || buffer_freed(bh)) return 0; set_buffer_uptodate(bh); ret = ext4_dirty_journalled_data(handle, bh); clear_buffer_meta(bh); clear_buffer_prio(bh); clear_buffer_new(bh); return ret; } /* * We need to pick up the new inode size which generic_commit_write gave us * `iocb` can be NULL - eg, when called from page_symlink(). * * ext4 never places buffers on inode->i_mapping->i_private_list. metadata * buffers are managed internally. */ static int ext4_write_end(const struct kiocb *iocb, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, struct folio *folio, void *fsdata) { handle_t *handle = ext4_journal_current_handle(); struct inode *inode = mapping->host; loff_t old_size = inode->i_size; int ret = 0, ret2; int i_size_changed = 0; bool verity = ext4_verity_in_progress(inode); trace_ext4_write_end(inode, pos, len, copied); if (ext4_has_inline_data(inode) && ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) return ext4_write_inline_data_end(inode, pos, len, copied, folio); copied = block_write_end(pos, len, copied, folio); /* * it's important to update i_size while still holding folio lock: * page writeout could otherwise come in and zero beyond i_size. * * If FS_IOC_ENABLE_VERITY is running on this inode, then Merkle tree * blocks are being written past EOF, so skip the i_size update. 
*/ if (!verity) i_size_changed = ext4_update_inode_size(inode, pos + copied); folio_unlock(folio); folio_put(folio); if (old_size < pos && !verity) { pagecache_isize_extended(inode, old_size, pos); ext4_zero_partial_blocks(handle, inode, old_size, pos - old_size); } /* * Don't mark the inode dirty under folio lock. First, it unnecessarily * makes the holding time of folio lock longer. Second, it forces lock * ordering of folio lock and transaction start for journaling * filesystems. */ if (i_size_changed) ret = ext4_mark_inode_dirty(handle, inode); if (pos + len > inode->i_size && !verity && ext4_can_truncate(inode)) /* if we have allocated more blocks and copied * less. We will have blocks allocated outside * inode->i_size. So truncate them */ ext4_orphan_add(handle, inode); ret2 = ext4_journal_stop(handle); if (!ret) ret = ret2; if (pos + len > inode->i_size && !verity) { ext4_truncate_failed_write(inode); /* * If truncate failed early the inode might still be * on the orphan list; we need to make sure the inode * is removed from the orphan list in that case. */ if (inode->i_nlink) ext4_orphan_del(NULL, inode); } return ret ? ret : copied; } /* * This is a private version of folio_zero_new_buffers() which doesn't * set the buffer to be dirty, since in data=journalled mode we need * to call ext4_dirty_journalled_data() instead. */ static void ext4_journalled_zero_new_buffers(handle_t *handle, struct inode *inode, struct folio *folio, unsigned from, unsigned to) { unsigned int block_start = 0, block_end; struct buffer_head *head, *bh; bh = head = folio_buffers(folio); do { block_end = block_start + bh->b_size; if (buffer_new(bh)) { if (block_end > from && block_start < to) { if (!folio_test_uptodate(folio)) { unsigned start, size; start = max(from, block_start); size = min(to, block_end) - start; folio_zero_range(folio, start, size); } clear_buffer_new(bh); write_end_fn(handle, inode, bh); } } block_start = block_end; bh = bh->b_this_page; } while (bh != head); } static int ext4_journalled_write_end(const struct kiocb *iocb, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, struct folio *folio, void *fsdata) { handle_t *handle = ext4_journal_current_handle(); struct inode *inode = mapping->host; loff_t old_size = inode->i_size; int ret = 0, ret2; int partial = 0; unsigned from, to; int size_changed = 0; bool verity = ext4_verity_in_progress(inode); trace_ext4_journalled_write_end(inode, pos, len, copied); from = pos & (PAGE_SIZE - 1); to = from + len; BUG_ON(!ext4_handle_valid(handle)); if (ext4_has_inline_data(inode)) return ext4_write_inline_data_end(inode, pos, len, copied, folio); if (unlikely(copied < len) && !folio_test_uptodate(folio)) { copied = 0; ext4_journalled_zero_new_buffers(handle, inode, folio, from, to); } else { if (unlikely(copied < len)) ext4_journalled_zero_new_buffers(handle, inode, folio, from + copied, to); ret = ext4_walk_page_buffers(handle, inode, folio_buffers(folio), from, from + copied, &partial, write_end_fn); if (!partial) folio_mark_uptodate(folio); } if (!verity) size_changed = ext4_update_inode_size(inode, pos + copied); EXT4_I(inode)->i_datasync_tid = handle->h_transaction->t_tid; folio_unlock(folio); folio_put(folio); if (old_size < pos && !verity) { pagecache_isize_extended(inode, old_size, pos); ext4_zero_partial_blocks(handle, inode, old_size, pos - old_size); } if (size_changed) { ret2 = ext4_mark_inode_dirty(handle, inode); if (!ret) ret = ret2; } if (pos + len > inode->i_size && !verity && ext4_can_truncate(inode)) /* if we 
have allocated more blocks and copied * less. We will have blocks allocated outside * inode->i_size. So truncate them */ ext4_orphan_add(handle, inode); ret2 = ext4_journal_stop(handle); if (!ret) ret = ret2; if (pos + len > inode->i_size && !verity) { ext4_truncate_failed_write(inode); /* * If truncate failed early the inode might still be * on the orphan list; we need to make sure the inode * is removed from the orphan list in that case. */ if (inode->i_nlink) ext4_orphan_del(NULL, inode); } return ret ? ret : copied; } /* * Reserve space for 'nr_resv' clusters */ static int ext4_da_reserve_space(struct inode *inode, int nr_resv) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); struct ext4_inode_info *ei = EXT4_I(inode); int ret; /* * We will charge metadata quota at writeout time; this saves * us from metadata over-estimation, though we may go over by * a small amount in the end. Here we just reserve for data. */ ret = dquot_reserve_block(inode, EXT4_C2B(sbi, nr_resv)); if (ret) return ret; spin_lock(&ei->i_block_reservation_lock); if (ext4_claim_free_clusters(sbi, nr_resv, 0)) { spin_unlock(&ei->i_block_reservation_lock); dquot_release_reservation_block(inode, EXT4_C2B(sbi, nr_resv)); return -ENOSPC; } ei->i_reserved_data_blocks += nr_resv; trace_ext4_da_reserve_space(inode, nr_resv); spin_unlock(&ei->i_block_reservation_lock); return 0; /* success */ } void ext4_da_release_space(struct inode *inode, int to_free) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); struct ext4_inode_info *ei = EXT4_I(inode); if (!to_free) return; /* Nothing to release, exit */ spin_lock(&EXT4_I(inode)->i_block_reservation_lock); trace_ext4_da_release_space(inode, to_free); if (unlikely(to_free > ei->i_reserved_data_blocks)) { /* * if there aren't enough reserved blocks, then the * counter is messed up somewhere. Since this * function is called from invalidate page, it's * harmless to return without any action. */ ext4_warning(inode->i_sb, "ext4_da_release_space: " "ino %lu, to_free %d with only %d reserved " "data blocks", inode->i_ino, to_free, ei->i_reserved_data_blocks); WARN_ON(1); to_free = ei->i_reserved_data_blocks; } ei->i_reserved_data_blocks -= to_free; /* update fs dirty data blocks counter */ percpu_counter_sub(&sbi->s_dirtyclusters_counter, to_free); spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); dquot_release_reservation_block(inode, EXT4_C2B(sbi, to_free)); } /* * Delayed allocation stuff */ struct mpage_da_data { /* These are input fields for ext4_do_writepages() */ struct inode *inode; struct writeback_control *wbc; unsigned int can_map:1; /* Can writepages call map blocks? */ /* These are internal state of ext4_do_writepages() */ loff_t start_pos; /* The start pos to write */ loff_t next_pos; /* Current pos to examine */ loff_t end_pos; /* Last pos to examine */ /* * Extent to map - this can be after start_pos because that can be * fully mapped. We somewhat abuse m_flags to store whether the extent * is delalloc or unwritten. */ struct ext4_map_blocks map; struct ext4_io_submit io_submit; /* IO submission data */ unsigned int do_map:1; unsigned int scanned_until_end:1; unsigned int journalled_more_data:1; }; static void mpage_release_unused_pages(struct mpage_da_data *mpd, bool invalidate) { unsigned nr, i; pgoff_t index, end; struct folio_batch fbatch; struct inode *inode = mpd->inode; struct address_space *mapping = inode->i_mapping; /* This is necessary when next_pos == 0. 
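 * (If the scan never locked a folio, next_pos was left equal to start_pos
 *  by mpage_prepare_extent_to_map(), so the check below simply bails out
 *  with nothing to unlock or invalidate.)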
*/ if (mpd->start_pos >= mpd->next_pos) return; mpd->scanned_until_end = 0; if (invalidate) { ext4_lblk_t start, last; start = EXT4_B_TO_LBLK(inode, mpd->start_pos); last = mpd->next_pos >> inode->i_blkbits; /* * avoid racing with extent status tree scans made by * ext4_insert_delayed_block() */ down_write(&EXT4_I(inode)->i_data_sem); ext4_es_remove_extent(inode, start, last - start); up_write(&EXT4_I(inode)->i_data_sem); } folio_batch_init(&fbatch); index = mpd->start_pos >> PAGE_SHIFT; end = mpd->next_pos >> PAGE_SHIFT; while (index < end) { nr = filemap_get_folios(mapping, &index, end - 1, &fbatch); if (nr == 0) break; for (i = 0; i < nr; i++) { struct folio *folio = fbatch.folios[i]; if (folio_pos(folio) < mpd->start_pos) continue; if (folio_next_index(folio) > end) continue; BUG_ON(!folio_test_locked(folio)); BUG_ON(folio_test_writeback(folio)); if (invalidate) { if (folio_mapped(folio)) folio_clear_dirty_for_io(folio); block_invalidate_folio(folio, 0, folio_size(folio)); folio_clear_uptodate(folio); } folio_unlock(folio); } folio_batch_release(&fbatch); } } static void ext4_print_free_blocks(struct inode *inode) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); struct super_block *sb = inode->i_sb; struct ext4_inode_info *ei = EXT4_I(inode); ext4_msg(sb, KERN_CRIT, "Total free blocks count %lld", EXT4_C2B(EXT4_SB(inode->i_sb), ext4_count_free_clusters(sb))); ext4_msg(sb, KERN_CRIT, "Free/Dirty block details"); ext4_msg(sb, KERN_CRIT, "free_blocks=%lld", (long long) EXT4_C2B(EXT4_SB(sb), percpu_counter_sum(&sbi->s_freeclusters_counter))); ext4_msg(sb, KERN_CRIT, "dirty_blocks=%lld", (long long) EXT4_C2B(EXT4_SB(sb), percpu_counter_sum(&sbi->s_dirtyclusters_counter))); ext4_msg(sb, KERN_CRIT, "Block reservation details"); ext4_msg(sb, KERN_CRIT, "i_reserved_data_blocks=%u", ei->i_reserved_data_blocks); return; } /* * Check whether the cluster containing lblk has been allocated or has * delalloc reservation. * * Returns 0 if the cluster doesn't have either, 1 if it has delalloc * reservation, 2 if it's already been allocated, negative error code on * failure. */ static int ext4_clu_alloc_state(struct inode *inode, ext4_lblk_t lblk) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); int ret; /* Has delalloc reservation? */ if (ext4_es_scan_clu(inode, &ext4_es_is_delayed, lblk)) return 1; /* Already been allocated? */ if (ext4_es_scan_clu(inode, &ext4_es_is_mapped, lblk)) return 2; ret = ext4_clu_mapped(inode, EXT4_B2C(sbi, lblk)); if (ret < 0) return ret; if (ret > 0) return 2; return 0; } /* * ext4_insert_delayed_blocks - adds a multiple delayed blocks to the extents * status tree, incrementing the reserved * cluster/block count or making pending * reservations where needed * * @inode - file containing the newly added block * @lblk - start logical block to be added * @len - length of blocks to be added * * Returns 0 on success, negative error code on failure. */ static int ext4_insert_delayed_blocks(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t len) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); int ret; bool lclu_allocated = false; bool end_allocated = false; ext4_lblk_t resv_clu; ext4_lblk_t end = lblk + len - 1; /* * If the cluster containing lblk or end is shared with a delayed, * written, or unwritten extent in a bigalloc file system, it's * already been accounted for and does not need to be reserved. * A pending reservation must be made for the cluster if it's * shared with a written or unwritten extent and doesn't already * have one. 
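 * (Illustrative example, with made-up numbers: on a bigalloc file system
 *  with 16 blocks per cluster, a request covering blocks 20..40 spans
 *  clusters 1 and 2; if cluster 1 is already backed by a delayed or
 *  written extent, only cluster 2 still needs a reservation, so resv_clu
 *  drops from 2 to 1 in the code below.)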
Written and unwritten extents can be purged from the * extents status tree if the system is under memory pressure, so * it's necessary to examine the extent tree if a search of the * extents status tree doesn't get a match. */ if (sbi->s_cluster_ratio == 1) { ret = ext4_da_reserve_space(inode, len); if (ret != 0) /* ENOSPC */ return ret; } else { /* bigalloc */ resv_clu = EXT4_B2C(sbi, end) - EXT4_B2C(sbi, lblk) + 1; ret = ext4_clu_alloc_state(inode, lblk); if (ret < 0) return ret; if (ret > 0) { resv_clu--; lclu_allocated = (ret == 2); } if (EXT4_B2C(sbi, lblk) != EXT4_B2C(sbi, end)) { ret = ext4_clu_alloc_state(inode, end); if (ret < 0) return ret; if (ret > 0) { resv_clu--; end_allocated = (ret == 2); } } if (resv_clu) { ret = ext4_da_reserve_space(inode, resv_clu); if (ret != 0) /* ENOSPC */ return ret; } } ext4_es_insert_delayed_extent(inode, lblk, len, lclu_allocated, end_allocated); return 0; } /* * Looks up the requested blocks and sets the delalloc extent map. * First try to look up for the extent entry that contains the requested * blocks in the extent status tree without i_data_sem, then try to look * up for the ondisk extent mapping with i_data_sem in read mode, * finally hold i_data_sem in write mode, looks up again and add a * delalloc extent entry if it still couldn't find any extent. Pass out * the mapped extent through @map and return 0 on success. */ static int ext4_da_map_blocks(struct inode *inode, struct ext4_map_blocks *map) { struct extent_status es; int retval; #ifdef ES_AGGRESSIVE_TEST struct ext4_map_blocks orig_map; memcpy(&orig_map, map, sizeof(*map)); #endif map->m_flags = 0; ext_debug(inode, "max_blocks %u, logical block %lu\n", map->m_len, (unsigned long) map->m_lblk); ext4_check_map_extents_env(inode); /* Lookup extent status tree firstly */ if (ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es, NULL)) { map->m_len = min_t(unsigned int, map->m_len, es.es_len - (map->m_lblk - es.es_lblk)); if (ext4_es_is_hole(&es)) goto add_delayed; found: /* * Delayed extent could be allocated by fallocate. * So we need to check it. */ if (ext4_es_is_delayed(&es)) { map->m_flags |= EXT4_MAP_DELAYED; return 0; } map->m_pblk = ext4_es_pblock(&es) + map->m_lblk - es.es_lblk; if (ext4_es_is_written(&es)) map->m_flags |= EXT4_MAP_MAPPED; else if (ext4_es_is_unwritten(&es)) map->m_flags |= EXT4_MAP_UNWRITTEN; else BUG(); #ifdef ES_AGGRESSIVE_TEST ext4_map_blocks_es_recheck(NULL, inode, map, &orig_map, 0); #endif return 0; } /* * Try to see if we can get the block without requesting a new * file system block. */ down_read(&EXT4_I(inode)->i_data_sem); if (ext4_has_inline_data(inode)) retval = 0; else retval = ext4_map_query_blocks(NULL, inode, map, 0); up_read(&EXT4_I(inode)->i_data_sem); if (retval) return retval < 0 ? retval : 0; add_delayed: down_write(&EXT4_I(inode)->i_data_sem); /* * Page fault path (ext4_page_mkwrite does not take i_rwsem) * and fallocate path (no folio lock) can race. Make sure we * lookup the extent status tree here again while i_data_sem * is held in write mode, before inserting a new da entry in * the extent status tree. */ if (ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es, NULL)) { map->m_len = min_t(unsigned int, map->m_len, es.es_len - (map->m_lblk - es.es_lblk)); if (!ext4_es_is_hole(&es)) { up_write(&EXT4_I(inode)->i_data_sem); goto found; } } else if (!ext4_has_inline_data(inode)) { retval = ext4_map_query_blocks(NULL, inode, map, 0); if (retval) { up_write(&EXT4_I(inode)->i_data_sem); return retval < 0 ? 
retval : 0; } } map->m_flags |= EXT4_MAP_DELAYED; retval = ext4_insert_delayed_blocks(inode, map->m_lblk, map->m_len); if (!retval) map->m_seq = READ_ONCE(EXT4_I(inode)->i_es_seq); up_write(&EXT4_I(inode)->i_data_sem); return retval; } /* * This is a special get_block_t callback which is used by * ext4_da_write_begin(). It will either return mapped block or * reserve space for a single block. * * For delayed buffer_head we have BH_Mapped, BH_New, BH_Delay set. * We also have b_blocknr = -1 and b_bdev initialized properly * * For unwritten buffer_head we have BH_Mapped, BH_New, BH_Unwritten set. * We also have b_blocknr = physicalblock mapping unwritten extent and b_bdev * initialized properly. */ int ext4_da_get_block_prep(struct inode *inode, sector_t iblock, struct buffer_head *bh, int create) { struct ext4_map_blocks map; sector_t invalid_block = ~((sector_t) 0xffff); int ret = 0; BUG_ON(create == 0); BUG_ON(bh->b_size != inode->i_sb->s_blocksize); if (invalid_block < ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es)) invalid_block = ~0; map.m_lblk = iblock; map.m_len = 1; /* * first, we need to know whether the block is allocated already * preallocated blocks are unmapped but should treated * the same as allocated blocks. */ ret = ext4_da_map_blocks(inode, &map); if (ret < 0) return ret; if (map.m_flags & EXT4_MAP_DELAYED) { map_bh(bh, inode->i_sb, invalid_block); set_buffer_new(bh); set_buffer_delay(bh); return 0; } map_bh(bh, inode->i_sb, map.m_pblk); ext4_update_bh_state(bh, map.m_flags); if (buffer_unwritten(bh)) { /* A delayed write to unwritten bh should be marked * new and mapped. Mapped ensures that we don't do * get_block multiple times when we write to the same * offset and new ensures that we do proper zero out * for partial write. */ set_buffer_new(bh); set_buffer_mapped(bh); } return 0; } static void mpage_folio_done(struct mpage_da_data *mpd, struct folio *folio) { mpd->start_pos += folio_size(folio); mpd->wbc->nr_to_write -= folio_nr_pages(folio); folio_unlock(folio); } static int mpage_submit_folio(struct mpage_da_data *mpd, struct folio *folio) { size_t len; loff_t size; int err; WARN_ON_ONCE(folio_pos(folio) != mpd->start_pos); folio_clear_dirty_for_io(folio); /* * We have to be very careful here! Nothing protects writeback path * against i_size changes and the page can be writeably mapped into * page tables. So an application can be growing i_size and writing * data through mmap while writeback runs. folio_clear_dirty_for_io() * write-protects our page in page tables and the page cannot get * written to again until we release folio lock. So only after * folio_clear_dirty_for_io() we are safe to sample i_size for * ext4_bio_write_folio() to zero-out tail of the written page. We rely * on the barrier provided by folio_test_clear_dirty() in * folio_clear_dirty_for_io() to make sure i_size is really sampled only * after page tables are updated. */ size = i_size_read(mpd->inode); len = folio_size(folio); if (folio_pos(folio) + len > size && !ext4_verity_in_progress(mpd->inode)) len = size & (len - 1); err = ext4_bio_write_folio(&mpd->io_submit, folio, len); return err; } #define BH_FLAGS (BIT(BH_Unwritten) | BIT(BH_Delay)) /* * mballoc gives us at most this number of blocks... * XXX: That seems to be only a limitation of ext4_mb_normalize_request(). * The rest of mballoc seems to handle chunks up to full group size. 
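 * (For reference: with the common 4KiB block size this caps each mapping
 *  call at 2048 * 4KiB = 8MiB of data.)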
*/ #define MAX_WRITEPAGES_EXTENT_LEN 2048 /* * mpage_add_bh_to_extent - try to add bh to extent of blocks to map * * @mpd - extent of blocks * @lblk - logical number of the block in the file * @bh - buffer head we want to add to the extent * * The function is used to collect contig. blocks in the same state. If the * buffer doesn't require mapping for writeback and we haven't started the * extent of buffers to map yet, the function returns 'true' immediately - the * caller can write the buffer right away. Otherwise the function returns true * if the block has been added to the extent, false if the block couldn't be * added. */ static bool mpage_add_bh_to_extent(struct mpage_da_data *mpd, ext4_lblk_t lblk, struct buffer_head *bh) { struct ext4_map_blocks *map = &mpd->map; /* Buffer that doesn't need mapping for writeback? */ if (!buffer_dirty(bh) || !buffer_mapped(bh) || (!buffer_delay(bh) && !buffer_unwritten(bh))) { /* So far no extent to map => we write the buffer right away */ if (map->m_len == 0) return true; return false; } /* First block in the extent? */ if (map->m_len == 0) { /* We cannot map unless handle is started... */ if (!mpd->do_map) return false; map->m_lblk = lblk; map->m_len = 1; map->m_flags = bh->b_state & BH_FLAGS; return true; } /* Don't go larger than mballoc is willing to allocate */ if (map->m_len >= MAX_WRITEPAGES_EXTENT_LEN) return false; /* Can we merge the block to our big extent? */ if (lblk == map->m_lblk + map->m_len && (bh->b_state & BH_FLAGS) == map->m_flags) { map->m_len++; return true; } return false; } /* * mpage_process_page_bufs - submit page buffers for IO or add them to extent * * @mpd - extent of blocks for mapping * @head - the first buffer in the page * @bh - buffer we should start processing from * @lblk - logical number of the block in the file corresponding to @bh * * Walk through page buffers from @bh upto @head (exclusive) and either submit * the page for IO if all buffers in this page were mapped and there's no * accumulated extent of buffers to map or add buffers in the page to the * extent of buffers to map. The function returns 1 if the caller can continue * by processing the next page, 0 if it should stop adding buffers to the * extent to map because we cannot extend it anymore. It can also return value * < 0 in case of error during IO submission. */ static int mpage_process_page_bufs(struct mpage_da_data *mpd, struct buffer_head *head, struct buffer_head *bh, ext4_lblk_t lblk) { struct inode *inode = mpd->inode; int err; ext4_lblk_t blocks = (i_size_read(inode) + i_blocksize(inode) - 1) >> inode->i_blkbits; if (ext4_verity_in_progress(inode)) blocks = EXT_MAX_BLOCKS; do { BUG_ON(buffer_locked(bh)); if (lblk >= blocks || !mpage_add_bh_to_extent(mpd, lblk, bh)) { /* Found extent to map? */ if (mpd->map.m_len) return 0; /* Buffer needs mapping and handle is not started? */ if (!mpd->do_map) return 0; /* Everything mapped so far and we hit EOF */ break; } } while (lblk++, (bh = bh->b_this_page) != head); /* So far everything mapped? Submit the page for IO. */ if (mpd->map.m_len == 0) { err = mpage_submit_folio(mpd, head->b_folio); if (err < 0) return err; mpage_folio_done(mpd, head->b_folio); } if (lblk >= blocks) { mpd->scanned_until_end = 1; return 0; } return 1; } /* * mpage_process_folio - update folio buffers corresponding to changed extent * and may submit fully mapped page for IO * @mpd: description of extent to map, on return next extent to map * @folio: Contains these buffers. * @m_lblk: logical block mapping. 
* @m_pblk: corresponding physical mapping. * @map_bh: determines on return whether this page requires any further * mapping or not. * * Scan given folio buffers corresponding to changed extent and update buffer * state according to new extent state. * We map delalloc buffers to their physical location, clear unwritten bits. * If the given folio is not fully mapped, we update @mpd to the next extent in * the given folio that needs mapping & return @map_bh as true. */ static int mpage_process_folio(struct mpage_da_data *mpd, struct folio *folio, ext4_lblk_t *m_lblk, ext4_fsblk_t *m_pblk, bool *map_bh) { struct buffer_head *head, *bh; ext4_io_end_t *io_end = mpd->io_submit.io_end; ext4_lblk_t lblk = *m_lblk; ext4_fsblk_t pblock = *m_pblk; int err = 0; ssize_t io_end_size = 0; struct ext4_io_end_vec *io_end_vec = ext4_last_io_end_vec(io_end); bh = head = folio_buffers(folio); do { if (lblk < mpd->map.m_lblk) continue; if (lblk >= mpd->map.m_lblk + mpd->map.m_len) { /* * Buffer after end of mapped extent. * Find next buffer in the folio to map. */ mpd->map.m_len = 0; mpd->map.m_flags = 0; io_end_vec->size += io_end_size; err = mpage_process_page_bufs(mpd, head, bh, lblk); if (err > 0) err = 0; if (!err && mpd->map.m_len && mpd->map.m_lblk > lblk) { io_end_vec = ext4_alloc_io_end_vec(io_end); if (IS_ERR(io_end_vec)) { err = PTR_ERR(io_end_vec); goto out; } io_end_vec->offset = EXT4_LBLK_TO_B(mpd->inode, mpd->map.m_lblk); } *map_bh = true; goto out; } if (buffer_delay(bh)) { clear_buffer_delay(bh); bh->b_blocknr = pblock++; } clear_buffer_unwritten(bh); io_end_size += i_blocksize(mpd->inode); } while (lblk++, (bh = bh->b_this_page) != head); io_end_vec->size += io_end_size; *map_bh = false; out: *m_lblk = lblk; *m_pblk = pblock; return err; } /* * mpage_map_buffers - update buffers corresponding to changed extent and * submit fully mapped pages for IO * * @mpd - description of extent to map, on return next extent to map * * Scan buffers corresponding to changed extent (we expect corresponding pages * to be already locked) and update buffer state according to new extent state. * We map delalloc buffers to their physical location, clear unwritten bits, * and mark buffers as uninit when we perform writes to unwritten extents * and do extent conversion after IO is finished. If the last page is not fully * mapped, we update @map to the next extent in the last page that needs * mapping. Otherwise we submit the page for IO. */ static int mpage_map_and_submit_buffers(struct mpage_da_data *mpd) { struct folio_batch fbatch; unsigned nr, i; struct inode *inode = mpd->inode; pgoff_t start, end; ext4_lblk_t lblk; ext4_fsblk_t pblock; int err; bool map_bh = false; start = EXT4_LBLK_TO_PG(inode, mpd->map.m_lblk); end = EXT4_LBLK_TO_PG(inode, mpd->map.m_lblk + mpd->map.m_len - 1); pblock = mpd->map.m_pblk; folio_batch_init(&fbatch); while (start <= end) { nr = filemap_get_folios(inode->i_mapping, &start, end, &fbatch); if (nr == 0) break; for (i = 0; i < nr; i++) { struct folio *folio = fbatch.folios[i]; lblk = EXT4_PG_TO_LBLK(inode, folio->index); err = mpage_process_folio(mpd, folio, &lblk, &pblock, &map_bh); /* * If map_bh is true, means page may require further bh * mapping, or maybe the page was submitted for IO. * So we return to call further extent mapping. */ if (err < 0 || map_bh) goto out; /* Page fully mapped - let IO run! 
*/ err = mpage_submit_folio(mpd, folio); if (err < 0) goto out; mpage_folio_done(mpd, folio); } folio_batch_release(&fbatch); } /* Extent fully mapped and matches with page boundary. We are done. */ mpd->map.m_len = 0; mpd->map.m_flags = 0; return 0; out: folio_batch_release(&fbatch); return err; } static int mpage_map_one_extent(handle_t *handle, struct mpage_da_data *mpd) { struct inode *inode = mpd->inode; struct ext4_map_blocks *map = &mpd->map; int get_blocks_flags; int err, dioread_nolock; /* Make sure transaction has enough credits for this extent */ err = ext4_journal_ensure_extent_credits(handle, inode); if (err < 0) return err; trace_ext4_da_write_pages_extent(inode, map); /* * Call ext4_map_blocks() to allocate any delayed allocation blocks, or * to convert an unwritten extent to be initialized (in the case * where we have written into one or more preallocated blocks). It is * possible that we're going to need more metadata blocks than * previously reserved. However we must not fail because we're in * writeback and there is nothing we can do about it so it might result * in data loss. So use reserved blocks to allocate metadata if * possible. In addition, do not cache any unrelated extents, as it * only holds the folio lock but does not hold the i_rwsem or * invalidate_lock, which could corrupt the extent status tree. */ get_blocks_flags = EXT4_GET_BLOCKS_CREATE | EXT4_GET_BLOCKS_METADATA_NOFAIL | EXT4_GET_BLOCKS_IO_SUBMIT | EXT4_EX_NOCACHE; dioread_nolock = ext4_should_dioread_nolock(inode); if (dioread_nolock) get_blocks_flags |= EXT4_GET_BLOCKS_IO_CREATE_EXT; err = ext4_map_blocks(handle, inode, map, get_blocks_flags); if (err < 0) return err; if (dioread_nolock && (map->m_flags & EXT4_MAP_UNWRITTEN)) { if (!mpd->io_submit.io_end->handle && ext4_handle_valid(handle)) { mpd->io_submit.io_end->handle = handle->h_rsv_handle; handle->h_rsv_handle = NULL; } ext4_set_io_unwritten_flag(mpd->io_submit.io_end); } BUG_ON(map->m_len == 0); return 0; } /* * This is used to submit mapped buffers in a single folio that is not fully * mapped for various reasons, such as insufficient space or journal credits. */ static int mpage_submit_partial_folio(struct mpage_da_data *mpd) { struct inode *inode = mpd->inode; struct folio *folio; loff_t pos; int ret; folio = filemap_get_folio(inode->i_mapping, mpd->start_pos >> PAGE_SHIFT); if (IS_ERR(folio)) return PTR_ERR(folio); /* * The mapped position should be within the current processing folio * but must not be the folio start position. */ pos = ((loff_t)mpd->map.m_lblk) << inode->i_blkbits; if (WARN_ON_ONCE((folio_pos(folio) == pos) || !folio_contains(folio, pos >> PAGE_SHIFT))) return -EINVAL; ret = mpage_submit_folio(mpd, folio); if (ret) goto out; /* * Update start_pos to prevent this folio from being released in * mpage_release_unused_pages(), it will be reset to the aligned folio * pos when this folio is written again in the next round. Additionally, * do not update wbc->nr_to_write here, as it will be updated once the * entire folio has finished processing. */ mpd->start_pos = pos; out: folio_unlock(folio); folio_put(folio); return ret; } /* * mpage_map_and_submit_extent - map extent starting at mpd->lblk of length * mpd->len and submit pages underlying it for IO * * @handle - handle for journal operations * @mpd - extent to map * @give_up_on_write - we set this to true iff there is a fatal error and there * is no hope of writing the data. The caller should discard * dirty pages to avoid infinite loops. 
* * The function maps extent starting at mpd->lblk of length mpd->len. If it is * delayed, blocks are allocated, if it is unwritten, we may need to convert * them to initialized or split the described range from larger unwritten * extent. Note that we need not map all the described range since allocation * can return fewer blocks or the range is covered by more unwritten extents. We * cannot map more because we are limited by reserved transaction credits. On * the other hand we always make sure that the last touched page is fully * mapped so that it can be written out (and thus forward progress is * guaranteed). After mapping we submit all mapped pages for IO. */ static int mpage_map_and_submit_extent(handle_t *handle, struct mpage_da_data *mpd, bool *give_up_on_write) { struct inode *inode = mpd->inode; struct ext4_map_blocks *map = &mpd->map; int err; loff_t disksize; int progress = 0; ext4_io_end_t *io_end = mpd->io_submit.io_end; struct ext4_io_end_vec *io_end_vec; io_end_vec = ext4_alloc_io_end_vec(io_end); if (IS_ERR(io_end_vec)) return PTR_ERR(io_end_vec); io_end_vec->offset = EXT4_LBLK_TO_B(inode, map->m_lblk); do { err = mpage_map_one_extent(handle, mpd); if (err < 0) { struct super_block *sb = inode->i_sb; if (ext4_emergency_state(sb)) goto invalidate_dirty_pages; /* * Let the upper layers retry transient errors. * In the case of ENOSPC, if ext4_count_free_clusters() * is non-zero, a commit should free up blocks. */ if ((err == -ENOMEM) || (err == -EAGAIN) || (err == -ENOSPC && ext4_count_free_clusters(sb))) { /* * We may have already allocated extents for * some bhs inside the folio, issue the * corresponding data to prevent stale data. */ if (progress) { if (mpage_submit_partial_folio(mpd)) goto invalidate_dirty_pages; goto update_disksize; } return err; } ext4_msg(sb, KERN_CRIT, "Delayed block allocation failed for " "inode %lu at logical offset %llu with" " max blocks %u with error %d", inode->i_ino, (unsigned long long)map->m_lblk, (unsigned)map->m_len, -err); ext4_msg(sb, KERN_CRIT, "This should not happen!! Data will " "be lost\n"); if (err == -ENOSPC) ext4_print_free_blocks(inode); invalidate_dirty_pages: *give_up_on_write = true; return err; } progress = 1; /* * Update buffer state, submit mapped pages, and get us new * extent to map */ err = mpage_map_and_submit_buffers(mpd); if (err < 0) goto update_disksize; } while (map->m_len); update_disksize: /* * Update on-disk size after IO is submitted. Races with * truncate are avoided by checking i_size under i_data_sem.
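 * (Concretely, in the code below disksize starts from mpd->start_pos, the
 *  position writeback has submitted up to, and is clamped down to i_size
 *  while i_data_sem is held, so a racing truncate cannot leave i_disksize
 *  beyond i_size.)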
*/ disksize = mpd->start_pos; if (disksize > READ_ONCE(EXT4_I(inode)->i_disksize)) { int err2; loff_t i_size; down_write(&EXT4_I(inode)->i_data_sem); i_size = i_size_read(inode); if (disksize > i_size) disksize = i_size; if (disksize > EXT4_I(inode)->i_disksize) EXT4_I(inode)->i_disksize = disksize; up_write(&EXT4_I(inode)->i_data_sem); err2 = ext4_mark_inode_dirty(handle, inode); if (err2) { ext4_error_err(inode->i_sb, -err2, "Failed to mark inode %lu dirty", inode->i_ino); } if (!err) err = err2; } return err; } static int ext4_journal_folio_buffers(handle_t *handle, struct folio *folio, size_t len) { struct buffer_head *page_bufs = folio_buffers(folio); struct inode *inode = folio->mapping->host; int ret, err; ret = ext4_walk_page_buffers(handle, inode, page_bufs, 0, len, NULL, do_journal_get_write_access); err = ext4_walk_page_buffers(handle, inode, page_bufs, 0, len, NULL, write_end_fn); if (ret == 0) ret = err; err = ext4_jbd2_inode_add_write(handle, inode, folio_pos(folio), len); if (ret == 0) ret = err; EXT4_I(inode)->i_datasync_tid = handle->h_transaction->t_tid; return ret; } static int mpage_journal_page_buffers(handle_t *handle, struct mpage_da_data *mpd, struct folio *folio) { struct inode *inode = mpd->inode; loff_t size = i_size_read(inode); size_t len = folio_size(folio); folio_clear_checked(folio); mpd->wbc->nr_to_write -= folio_nr_pages(folio); if (folio_pos(folio) + len > size && !ext4_verity_in_progress(inode)) len = size & (len - 1); return ext4_journal_folio_buffers(handle, folio, len); } /* * mpage_prepare_extent_to_map - find & lock contiguous range of dirty pages * needing mapping, submit mapped pages * * @mpd - where to look for pages * * Walk dirty pages in the mapping. If they are fully mapped, submit them for * IO immediately. If we cannot map blocks, we submit just already mapped * buffers in the page for IO and keep page dirty. When we can map blocks and * we find a page which isn't mapped we start accumulating extent of buffers * underlying these pages that needs mapping (formed by either delayed or * unwritten buffers). We also lock the pages containing these buffers. The * extent found is returned in @mpd structure (starting at mpd->lblk with * length mpd->len blocks). * * Note that this function can attach bios to one io_end structure which are * neither logically nor physically contiguous. Although it may seem as an * unnecessary complication, it is actually inevitable in blocksize < pagesize * case as we need to track IO to all buffers underlying a page in one io_end. */ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd) { struct address_space *mapping = mpd->inode->i_mapping; struct folio_batch fbatch; unsigned int nr_folios; pgoff_t index = mpd->start_pos >> PAGE_SHIFT; pgoff_t end = mpd->end_pos >> PAGE_SHIFT; xa_mark_t tag; int i, err = 0; ext4_lblk_t lblk; struct buffer_head *head; handle_t *handle = NULL; int bpp = ext4_journal_blocks_per_folio(mpd->inode); tag = wbc_to_tag(mpd->wbc); mpd->map.m_len = 0; mpd->next_pos = mpd->start_pos; if (ext4_should_journal_data(mpd->inode)) { handle = ext4_journal_start(mpd->inode, EXT4_HT_WRITE_PAGE, bpp); if (IS_ERR(handle)) return PTR_ERR(handle); } folio_batch_init(&fbatch); while (index <= end) { nr_folios = filemap_get_folios_tag(mapping, &index, end, tag, &fbatch); if (nr_folios == 0) break; for (i = 0; i < nr_folios; i++) { struct folio *folio = fbatch.folios[i]; /* * Accumulated enough dirty pages? This doesn't apply * to WB_SYNC_ALL mode. 
For integrity sync we have to * keep going because someone may be concurrently * dirtying pages, and we might have synced a lot of * newly appeared dirty pages, but have not synced all * of the old dirty pages. */ if (mpd->wbc->sync_mode == WB_SYNC_NONE && mpd->wbc->nr_to_write <= EXT4_LBLK_TO_PG(mpd->inode, mpd->map.m_len)) goto out; /* If we can't merge this page, we are done. */ if (mpd->map.m_len > 0 && mpd->next_pos != folio_pos(folio)) goto out; if (handle) { err = ext4_journal_ensure_credits(handle, bpp, 0); if (err < 0) goto out; } folio_lock(folio); /* * If the page is no longer dirty, or its mapping no * longer corresponds to inode we are writing (which * means it has been truncated or invalidated), or the * page is already under writeback and we are not doing * a data integrity writeback, skip the page */ if (!folio_test_dirty(folio) || (folio_test_writeback(folio) && (mpd->wbc->sync_mode == WB_SYNC_NONE)) || unlikely(folio->mapping != mapping)) { folio_unlock(folio); continue; } folio_wait_writeback(folio); BUG_ON(folio_test_writeback(folio)); /* * Should never happen but for buggy code in * other subsystems that call * set_page_dirty() without properly warning * the file system first. See [1] for more * information. * * [1] https://lore.kernel.org/linux-mm/20180103100430.GE4911@quack2.suse.cz */ if (!folio_buffers(folio)) { ext4_warning_inode(mpd->inode, "page %lu does not have buffers attached", folio->index); folio_clear_dirty(folio); folio_unlock(folio); continue; } if (mpd->map.m_len == 0) mpd->start_pos = folio_pos(folio); mpd->next_pos = folio_next_pos(folio); /* * Writeout when we cannot modify metadata is simple. * Just submit the page. For data=journal mode we * first handle writeout of the page for checkpoint and * only after that handle delayed page dirtying. This * makes sure current data is checkpointed to the final * location before possibly journalling it again which * is desirable when the page is frequently dirtied * through a pin. */ if (!mpd->can_map) { err = mpage_submit_folio(mpd, folio); if (err < 0) goto out; /* Pending dirtying of journalled data? */ if (folio_test_checked(folio)) { err = mpage_journal_page_buffers(handle, mpd, folio); if (err < 0) goto out; mpd->journalled_more_data = 1; } mpage_folio_done(mpd, folio); } else { /* Add all dirty buffers to mpd */ lblk = EXT4_PG_TO_LBLK(mpd->inode, folio->index); head = folio_buffers(folio); err = mpage_process_page_bufs(mpd, head, head, lblk); if (err <= 0) goto out; err = 0; } } folio_batch_release(&fbatch); cond_resched(); } mpd->scanned_until_end = 1; if (handle) ext4_journal_stop(handle); return 0; out: folio_batch_release(&fbatch); if (handle) ext4_journal_stop(handle); return err; } static int ext4_do_writepages(struct mpage_da_data *mpd) { struct writeback_control *wbc = mpd->wbc; pgoff_t writeback_index = 0; long nr_to_write = wbc->nr_to_write; int range_whole = 0; int cycled = 1; handle_t *handle = NULL; struct inode *inode = mpd->inode; struct address_space *mapping = inode->i_mapping; int needed_blocks, rsv_blocks = 0, ret = 0; struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb); struct blk_plug plug; bool give_up_on_write = false; trace_ext4_writepages(inode, wbc); /* * No pages to write? 
This is mainly a kludge to avoid starting * a transaction for special inodes like journal inode on last iput() * because that could violate lock ordering on umount */ if (!mapping->nrpages || !mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) goto out_writepages; /* * If the filesystem has aborted, it is read-only, so return * right away instead of dumping stack traces later on that * will obscure the real source of the problem. We test * fs shutdown state instead of sb->s_flag's SB_RDONLY because * the latter could be true if the filesystem is mounted * read-only, and in that case, ext4_writepages should * *never* be called, so if that ever happens, we would want * the stack trace. */ ret = ext4_emergency_state(mapping->host->i_sb); if (unlikely(ret)) goto out_writepages; /* * If we have inline data and arrive here, it means that * we will soon create the block for the 1st page, so * we'd better clear the inline data here. */ if (ext4_has_inline_data(inode)) { /* Just inode will be modified... */ handle = ext4_journal_start(inode, EXT4_HT_INODE, 1); if (IS_ERR(handle)) { ret = PTR_ERR(handle); goto out_writepages; } BUG_ON(ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)); ext4_destroy_inline_data(handle, inode); ext4_journal_stop(handle); } /* * data=journal mode does not do delalloc so we just need to writeout / * journal already mapped buffers. On the other hand we need to commit * transaction to make data stable. We expect all the data to be * already in the journal (the only exception are DMA pinned pages * dirtied behind our back) so we commit transaction here and run the * writeback loop to checkpoint them. The checkpointing is not actually * necessary to make data persistent *but* quite a few places (extent * shifting operations, fsverity, ...) depend on being able to drop * pagecache pages after calling filemap_write_and_wait() and for that * checkpointing needs to happen. */ if (ext4_should_journal_data(inode)) { mpd->can_map = 0; if (wbc->sync_mode == WB_SYNC_ALL) ext4_fc_commit(sbi->s_journal, EXT4_I(inode)->i_datasync_tid); } mpd->journalled_more_data = 0; if (ext4_should_dioread_nolock(inode)) { int bpf = ext4_journal_blocks_per_folio(inode); /* * We may need to convert up to one extent per block in * the folio and we may dirty the inode. */ rsv_blocks = 1 + ext4_ext_index_trans_blocks(inode, bpf); } if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) range_whole = 1; if (wbc->range_cyclic) { writeback_index = mapping->writeback_index; if (writeback_index) cycled = 0; mpd->start_pos = writeback_index << PAGE_SHIFT; mpd->end_pos = LLONG_MAX; } else { mpd->start_pos = wbc->range_start; mpd->end_pos = wbc->range_end; } ext4_io_submit_init(&mpd->io_submit, wbc); retry: if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) tag_pages_for_writeback(mapping, mpd->start_pos >> PAGE_SHIFT, mpd->end_pos >> PAGE_SHIFT); blk_start_plug(&plug); /* * First writeback pages that don't need mapping - we can avoid * starting a transaction unnecessarily and also avoid being blocked * in the block layer on device congestion while having transaction * started. 
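 * (So writeback below runs in two phases: a first pass with do_map = 0
 *  that only submits folios whose blocks are already fully mapped, then
 *  the loop further down sets do_map = 1 and starts a transaction per
 *  extent to allocate and map the rest.)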
*/ mpd->do_map = 0; mpd->scanned_until_end = 0; mpd->io_submit.io_end = ext4_init_io_end(inode, GFP_KERNEL); if (!mpd->io_submit.io_end) { ret = -ENOMEM; goto unplug; } ret = mpage_prepare_extent_to_map(mpd); /* Unlock pages we didn't use */ mpage_release_unused_pages(mpd, false); /* Submit prepared bio */ ext4_io_submit(&mpd->io_submit); ext4_put_io_end_defer(mpd->io_submit.io_end); mpd->io_submit.io_end = NULL; if (ret < 0) goto unplug; while (!mpd->scanned_until_end && wbc->nr_to_write > 0) { /* For each extent of pages we use new io_end */ mpd->io_submit.io_end = ext4_init_io_end(inode, GFP_KERNEL); if (!mpd->io_submit.io_end) { ret = -ENOMEM; break; } WARN_ON_ONCE(!mpd->can_map); /* * We have two constraints: We find one extent to map and we * must always write out whole page (makes a difference when * blocksize < pagesize) so that we don't block on IO when we * try to write out the rest of the page. Journalled mode is * not supported by delalloc. */ BUG_ON(ext4_should_journal_data(inode)); /* * Calculate the number of credits needed to reserve for one * extent of up to MAX_WRITEPAGES_EXTENT_LEN blocks. It will * attempt to extend the transaction or start a new iteration * if the reserved credits are insufficient. */ needed_blocks = ext4_chunk_trans_blocks(inode, MAX_WRITEPAGES_EXTENT_LEN); /* start a new transaction */ handle = ext4_journal_start_with_reserve(inode, EXT4_HT_WRITE_PAGE, needed_blocks, rsv_blocks); if (IS_ERR(handle)) { ret = PTR_ERR(handle); ext4_msg(inode->i_sb, KERN_CRIT, "%s: jbd2_start: " "%ld pages, ino %lu; err %d", __func__, wbc->nr_to_write, inode->i_ino, ret); /* Release allocated io_end */ ext4_put_io_end(mpd->io_submit.io_end); mpd->io_submit.io_end = NULL; break; } mpd->do_map = 1; trace_ext4_da_write_folios_start(inode, mpd->start_pos, mpd->next_pos, wbc); ret = mpage_prepare_extent_to_map(mpd); if (!ret && mpd->map.m_len) ret = mpage_map_and_submit_extent(handle, mpd, &give_up_on_write); /* * Caution: If the handle is synchronous, * ext4_journal_stop() can wait for transaction commit * to finish which may depend on writeback of pages to * complete or on page lock to be released. In that * case, we have to wait until after we have * submitted all the IO, released page locks we hold, * and dropped io_end reference (for extent conversion * to be able to complete) before stopping the handle. */ if (!ext4_handle_valid(handle) || handle->h_sync == 0) { ext4_journal_stop(handle); handle = NULL; mpd->do_map = 0; } /* Unlock pages we didn't use */ mpage_release_unused_pages(mpd, give_up_on_write); /* Submit prepared bio */ ext4_io_submit(&mpd->io_submit); /* * Drop our io_end reference we got from init. We have * to be careful and use deferred io_end finishing if * we are still holding the transaction as we can * release the last reference to io_end which may end * up doing unwritten extent conversion. */ if (handle) { ext4_put_io_end_defer(mpd->io_submit.io_end); ext4_journal_stop(handle); } else ext4_put_io_end(mpd->io_submit.io_end); mpd->io_submit.io_end = NULL; trace_ext4_da_write_folios_end(inode, mpd->start_pos, mpd->next_pos, wbc, ret); if (ret == -ENOSPC && sbi->s_journal) { /* * Commit the transaction which would * free blocks released in the transaction * and try again */ jbd2_journal_force_commit_nested(sbi->s_journal); ret = 0; continue; } if (ret == -EAGAIN) ret = 0; /* Fatal error - ENOMEM, EIO... 
*/ if (ret) break; } unplug: blk_finish_plug(&plug); if (!ret && !cycled && wbc->nr_to_write > 0) { cycled = 1; mpd->end_pos = (writeback_index << PAGE_SHIFT) - 1; mpd->start_pos = 0; goto retry; } /* Update index */ if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0)) /* * Set the writeback_index so that range_cyclic * mode will write it back later */ mapping->writeback_index = mpd->start_pos >> PAGE_SHIFT; out_writepages: trace_ext4_writepages_result(inode, wbc, ret, nr_to_write - wbc->nr_to_write); return ret; } static int ext4_writepages(struct address_space *mapping, struct writeback_control *wbc) { struct super_block *sb = mapping->host->i_sb; struct mpage_da_data mpd = { .inode = mapping->host, .wbc = wbc, .can_map = 1, }; int ret; int alloc_ctx; ret = ext4_emergency_state(sb); if (unlikely(ret)) return ret; alloc_ctx = ext4_writepages_down_read(sb); ret = ext4_do_writepages(&mpd); /* * For data=journal writeback we could have come across pages marked * for delayed dirtying (PageChecked) which were just added to the * running transaction. Try once more to get them to stable storage. */ if (!ret && mpd.journalled_more_data) ret = ext4_do_writepages(&mpd); ext4_writepages_up_read(sb, alloc_ctx); return ret; } int ext4_normal_submit_inode_data_buffers(struct jbd2_inode *jinode) { struct writeback_control wbc = { .sync_mode = WB_SYNC_ALL, .nr_to_write = LONG_MAX, .range_start = jinode->i_dirty_start, .range_end = jinode->i_dirty_end, }; struct mpage_da_data mpd = { .inode = jinode->i_vfs_inode, .wbc = &wbc, .can_map = 0, }; return ext4_do_writepages(&mpd); } static int ext4_dax_writepages(struct address_space *mapping, struct writeback_control *wbc) { int ret; long nr_to_write = wbc->nr_to_write; struct inode *inode = mapping->host; int alloc_ctx; ret = ext4_emergency_state(inode->i_sb); if (unlikely(ret)) return ret; alloc_ctx = ext4_writepages_down_read(inode->i_sb); trace_ext4_writepages(inode, wbc); ret = dax_writeback_mapping_range(mapping, EXT4_SB(inode->i_sb)->s_daxdev, wbc); trace_ext4_writepages_result(inode, wbc, ret, nr_to_write - wbc->nr_to_write); ext4_writepages_up_read(inode->i_sb, alloc_ctx); return ret; } static int ext4_nonda_switch(struct super_block *sb) { s64 free_clusters, dirty_clusters; struct ext4_sb_info *sbi = EXT4_SB(sb); /* * switch to non delalloc mode if we are running low * on free block. The free block accounting via percpu * counters can get slightly wrong with percpu_counter_batch getting * accumulated on each CPU without updating global counters * Delalloc need an accurate free block accounting. So switch * to non delalloc when we are near to error range. */ free_clusters = percpu_counter_read_positive(&sbi->s_freeclusters_counter); dirty_clusters = percpu_counter_read_positive(&sbi->s_dirtyclusters_counter); /* * Start pushing delalloc when 1/2 of free blocks are dirty. 
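 * (Worked example with made-up numbers: with 1000 free and 800 dirty
 *  clusters, background writeback is kicked because 1000 < 2 * 800, and we
 *  also fall back to non-delalloc because 2 * 1000 < 3 * 800, i.e. free
 *  space has dropped below roughly 1.5x the dirty count.)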
*/ if (dirty_clusters && (free_clusters < 2 * dirty_clusters)) try_to_writeback_inodes_sb(sb, WB_REASON_FS_FREE_SPACE); if (2 * free_clusters < 3 * dirty_clusters || free_clusters < (dirty_clusters + EXT4_FREECLUSTERS_WATERMARK)) { /* * free block count is less than 150% of dirty blocks * or free blocks is less than watermark */ return 1; } return 0; } static int ext4_da_write_begin(const struct kiocb *iocb, struct address_space *mapping, loff_t pos, unsigned len, struct folio **foliop, void **fsdata) { int ret, retries = 0; struct folio *folio; pgoff_t index; struct inode *inode = mapping->host; ret = ext4_emergency_state(inode->i_sb); if (unlikely(ret)) return ret; index = pos >> PAGE_SHIFT; if (ext4_nonda_switch(inode->i_sb) || ext4_verity_in_progress(inode)) { *fsdata = (void *)FALL_BACK_TO_NONDELALLOC; return ext4_write_begin(iocb, mapping, pos, len, foliop, fsdata); } *fsdata = (void *)0; trace_ext4_da_write_begin(inode, pos, len); if (ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) { ret = ext4_generic_write_inline_data(mapping, inode, pos, len, foliop, fsdata, true); if (ret < 0) return ret; if (ret == 1) return 0; } retry: folio = write_begin_get_folio(iocb, mapping, index, len); if (IS_ERR(folio)) return PTR_ERR(folio); if (len > folio_next_pos(folio) - pos) len = folio_next_pos(folio) - pos; ret = ext4_block_write_begin(NULL, folio, pos, len, ext4_da_get_block_prep); if (ret < 0) { folio_unlock(folio); folio_put(folio); /* * ext4_block_write_begin may have instantiated a few blocks * outside i_size. Trim these off again. Don't need * i_size_read because we hold inode lock. */ if (pos + len > inode->i_size) ext4_truncate_failed_write(inode); if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) goto retry; return ret; } *foliop = folio; return ret; } /* * Check if we should update i_disksize * when write to the end of file but not require block allocation */ static int ext4_da_should_update_i_disksize(struct folio *folio, unsigned long offset) { struct buffer_head *bh; struct inode *inode = folio->mapping->host; unsigned int idx; int i; bh = folio_buffers(folio); idx = offset >> inode->i_blkbits; for (i = 0; i < idx; i++) bh = bh->b_this_page; if (!buffer_mapped(bh) || (buffer_delay(bh)) || buffer_unwritten(bh)) return 0; return 1; } static int ext4_da_do_write_end(struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, struct folio *folio) { struct inode *inode = mapping->host; loff_t old_size = inode->i_size; bool disksize_changed = false; loff_t new_i_size, zero_len = 0; handle_t *handle; if (unlikely(!folio_buffers(folio))) { folio_unlock(folio); folio_put(folio); return -EIO; } /* * block_write_end() will mark the inode as dirty with I_DIRTY_PAGES * flag, which all that's needed to trigger page writeback. */ copied = block_write_end(pos, len, copied, folio); new_i_size = pos + copied; /* * It's important to update i_size while still holding folio lock, * because folio writeout could otherwise come in and zero beyond * i_size. * * Since we are holding inode lock, we are sure i_disksize <= * i_size. We also know that if i_disksize < i_size, there are * delalloc writes pending in the range up to i_size. If the end of * the current write is <= i_size, there's no need to touch * i_disksize since writeback will push i_disksize up to i_size * eventually. 
If the end of the current write is > i_size and * inside an allocated block which ext4_da_should_update_i_disksize() * checked, we need to update i_disksize here, since certain * ext4_writepages() paths that do not allocate blocks do not update i_disksize. */ if (new_i_size > inode->i_size) { unsigned long end; i_size_write(inode, new_i_size); end = offset_in_folio(folio, new_i_size - 1); if (copied && ext4_da_should_update_i_disksize(folio, end)) { ext4_update_i_disksize(inode, new_i_size); disksize_changed = true; } } folio_unlock(folio); folio_put(folio); if (pos > old_size) { pagecache_isize_extended(inode, old_size, pos); zero_len = pos - old_size; } if (!disksize_changed && !zero_len) return copied; handle = ext4_journal_start(inode, EXT4_HT_INODE, 2); if (IS_ERR(handle)) return PTR_ERR(handle); if (zero_len) ext4_zero_partial_blocks(handle, inode, old_size, zero_len); ext4_mark_inode_dirty(handle, inode); ext4_journal_stop(handle); return copied; } static int ext4_da_write_end(const struct kiocb *iocb, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, struct folio *folio, void *fsdata) { struct inode *inode = mapping->host; int write_mode = (int)(unsigned long)fsdata; if (write_mode == FALL_BACK_TO_NONDELALLOC) return ext4_write_end(iocb, mapping, pos, len, copied, folio, fsdata); trace_ext4_da_write_end(inode, pos, len, copied); if (write_mode != CONVERT_INLINE_DATA && ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA) && ext4_has_inline_data(inode)) return ext4_write_inline_data_end(inode, pos, len, copied, folio); if (unlikely(copied < len) && !folio_test_uptodate(folio)) copied = 0; return ext4_da_do_write_end(mapping, pos, len, copied, folio); } /* * Force all delayed allocation blocks to be allocated for a given inode. */ int ext4_alloc_da_blocks(struct inode *inode) { trace_ext4_alloc_da_blocks(inode); if (!EXT4_I(inode)->i_reserved_data_blocks) return 0; /* * We do something simple for now. The filemap_flush() will * also start triggering a write of the data blocks, which is * not strictly speaking necessary (and for users of * laptop_mode, not even desirable). However, to do otherwise * would require replicating code paths in: * * ext4_writepages() -> * write_cache_pages() ---> (via passed in callback function) * __mpage_da_writepage() --> * mpage_add_bh_to_extent() * mpage_da_map_blocks() * * The problem is that write_cache_pages(), located in * mm/page-writeback.c, marks pages clean in preparation for * doing I/O, which is not desirable if we're not planning on * doing I/O at all. * * We could call write_cache_pages(), and then redirty all of * the pages by calling redirty_page_for_writepage() but that * would be ugly in the extreme. So instead we would need to * replicate parts of the code in the above functions, * simplifying them because we wouldn't actually intend to * write out the pages, but rather only collect contiguous * logical block extents, call the multi-block allocator, and * then update the buffer heads with the block allocations. * * For now, though, we'll cheat by calling filemap_flush(), * which will map the blocks, and start the I/O, but not * actually wait for the I/O to complete. */ return filemap_flush(inode->i_mapping); } /* * bmap() is special. It gets used by applications such as lilo and by * the swapper to find the on-disk block of a specific piece of data. * * Naturally, this is dangerous if the block concerned is still in the * journal.
If somebody makes a swapfile on an ext4 data-journaling * filesystem and enables swap, then they may get a nasty shock when the * data getting swapped to that swapfile suddenly gets overwritten by * the original zero's written out previously to the journal and * awaiting writeback in the kernel's buffer cache. * * So, if we see any bmap calls here on a modified, data-journaled file, * take extra steps to flush any blocks which might be in the cache. */ static sector_t ext4_bmap(struct address_space *mapping, sector_t block) { struct inode *inode = mapping->host; sector_t ret = 0; inode_lock_shared(inode); /* * We can get here for an inline file via the FIBMAP ioctl */ if (ext4_has_inline_data(inode)) goto out; if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY) && (test_opt(inode->i_sb, DELALLOC) || ext4_should_journal_data(inode))) { /* * With delalloc or journalled data we want to sync the file so * that we can make sure we allocate blocks for file and data * is in place for the user to see it */ filemap_write_and_wait(mapping); } ret = iomap_bmap(mapping, block, &ext4_iomap_ops); out: inode_unlock_shared(inode); return ret; } static int ext4_read_folio(struct file *file, struct folio *folio) { int ret = -EAGAIN; struct inode *inode = folio->mapping->host; trace_ext4_read_folio(inode, folio); if (ext4_has_inline_data(inode)) ret = ext4_readpage_inline(inode, folio); if (ret == -EAGAIN) return ext4_mpage_readpages(inode, NULL, folio); return ret; } static void ext4_readahead(struct readahead_control *rac) { struct inode *inode = rac->mapping->host; /* If the file has inline data, no need to do readahead. */ if (ext4_has_inline_data(inode)) return; ext4_mpage_readpages(inode, rac, NULL); } static void ext4_invalidate_folio(struct folio *folio, size_t offset, size_t length) { trace_ext4_invalidate_folio(folio, offset, length); /* No journalling happens on data buffers when this function is used */ WARN_ON(folio_buffers(folio) && buffer_jbd(folio_buffers(folio))); block_invalidate_folio(folio, offset, length); } static int __ext4_journalled_invalidate_folio(struct folio *folio, size_t offset, size_t length) { journal_t *journal = EXT4_JOURNAL(folio->mapping->host); trace_ext4_journalled_invalidate_folio(folio, offset, length); /* * If it's a full truncate we just forget about the pending dirtying */ if (offset == 0 && length == folio_size(folio)) folio_clear_checked(folio); return jbd2_journal_invalidate_folio(journal, folio, offset, length); } /* Wrapper for aops... */ static void ext4_journalled_invalidate_folio(struct folio *folio, size_t offset, size_t length) { WARN_ON(__ext4_journalled_invalidate_folio(folio, offset, length) < 0); } static bool ext4_release_folio(struct folio *folio, gfp_t wait) { struct inode *inode = folio->mapping->host; journal_t *journal = EXT4_JOURNAL(inode); trace_ext4_release_folio(inode, folio); /* Page has dirty journalled data -> cannot release */ if (folio_test_checked(folio)) return false; if (journal) return jbd2_journal_try_to_free_buffers(journal, folio); else return try_to_free_buffers(folio); } static bool ext4_inode_datasync_dirty(struct inode *inode) { journal_t *journal = EXT4_SB(inode->i_sb)->s_journal; if (journal) { if (jbd2_transaction_committed(journal, EXT4_I(inode)->i_datasync_tid)) return false; if (test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT)) return !list_empty(&EXT4_I(inode)->i_fc_list); return true; } /* Any metadata buffers to write? 
*/ if (!list_empty(&inode->i_mapping->i_private_list)) return true; return inode_state_read_once(inode) & I_DIRTY_DATASYNC; } static void ext4_set_iomap(struct inode *inode, struct iomap *iomap, struct ext4_map_blocks *map, loff_t offset, loff_t length, unsigned int flags) { u8 blkbits = inode->i_blkbits; /* * Writes that span EOF might trigger an I/O size update on completion, * so consider them to be dirty for the purpose of O_DSYNC, even if * there is no other metadata changes being made or are pending. */ iomap->flags = 0; if (ext4_inode_datasync_dirty(inode) || offset + length > i_size_read(inode)) iomap->flags |= IOMAP_F_DIRTY; if (map->m_flags & EXT4_MAP_NEW) iomap->flags |= IOMAP_F_NEW; /* HW-offload atomics are always used */ if (flags & IOMAP_ATOMIC) iomap->flags |= IOMAP_F_ATOMIC_BIO; if (flags & IOMAP_DAX) iomap->dax_dev = EXT4_SB(inode->i_sb)->s_daxdev; else iomap->bdev = inode->i_sb->s_bdev; iomap->offset = EXT4_LBLK_TO_B(inode, map->m_lblk); iomap->length = EXT4_LBLK_TO_B(inode, map->m_len); if ((map->m_flags & EXT4_MAP_MAPPED) && !ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) iomap->flags |= IOMAP_F_MERGED; /* * Flags passed to ext4_map_blocks() for direct I/O writes can result * in m_flags having both EXT4_MAP_MAPPED and EXT4_MAP_UNWRITTEN bits * set. In order for any allocated unwritten extents to be converted * into written extents correctly within the ->end_io() handler, we * need to ensure that the iomap->type is set appropriately. Hence, the * reason why we need to check whether the EXT4_MAP_UNWRITTEN bit has * been set first. */ if (map->m_flags & EXT4_MAP_UNWRITTEN) { iomap->type = IOMAP_UNWRITTEN; iomap->addr = (u64) map->m_pblk << blkbits; if (flags & IOMAP_DAX) iomap->addr += EXT4_SB(inode->i_sb)->s_dax_part_off; } else if (map->m_flags & EXT4_MAP_MAPPED) { iomap->type = IOMAP_MAPPED; iomap->addr = (u64) map->m_pblk << blkbits; if (flags & IOMAP_DAX) iomap->addr += EXT4_SB(inode->i_sb)->s_dax_part_off; } else if (map->m_flags & EXT4_MAP_DELAYED) { iomap->type = IOMAP_DELALLOC; iomap->addr = IOMAP_NULL_ADDR; } else { iomap->type = IOMAP_HOLE; iomap->addr = IOMAP_NULL_ADDR; } } static int ext4_map_blocks_atomic_write_slow(handle_t *handle, struct inode *inode, struct ext4_map_blocks *map) { ext4_lblk_t m_lblk = map->m_lblk; unsigned int m_len = map->m_len; unsigned int mapped_len = 0, m_flags = 0; ext4_fsblk_t next_pblk = 0; bool check_next_pblk = false; int ret = 0; WARN_ON_ONCE(!ext4_has_feature_bigalloc(inode->i_sb)); /* * This is a slow path in case of mixed mapping. We use * EXT4_GET_BLOCKS_CREATE_ZERO flag here to make sure we get a single * contiguous mapped mapping. This will ensure any unwritten or hole * regions within the requested range is zeroed out and we return * a single contiguous mapped extent. */ m_flags = EXT4_GET_BLOCKS_CREATE_ZERO; do { ret = ext4_map_blocks(handle, inode, map, m_flags); if (ret < 0 && ret != -ENOSPC) goto out_err; /* * This should never happen, but let's return an error code to * avoid an infinite loop in here. */ if (ret == 0) { ret = -EFSCORRUPTED; ext4_warning_inode(inode, "ext4_map_blocks() couldn't allocate blocks m_flags: 0x%x, ret:%d", m_flags, ret); goto out_err; } /* * With bigalloc we should never get ENOSPC nor discontiguous * physical extents. 
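 *
 * (For instance, with an assumed 64k cluster size and 4k block size, the
 * 16 blocks of a cluster are backed by one already-allocated cluster, so
 * repeated ext4_map_blocks() calls across that range can only return
 * adjacent physical blocks and cannot fail with ENOSPC.)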
*/ if ((check_next_pblk && next_pblk != map->m_pblk) || ret == -ENOSPC) { ext4_warning_inode(inode, "Non-contiguous allocation detected: expected %llu, got %llu, " "or ext4_map_blocks() returned out of space ret: %d", next_pblk, map->m_pblk, ret); ret = -EFSCORRUPTED; goto out_err; } next_pblk = map->m_pblk + map->m_len; check_next_pblk = true; mapped_len += map->m_len; map->m_lblk += map->m_len; map->m_len = m_len - mapped_len; } while (mapped_len < m_len); /* * We might have done some work in above loop, so we need to query the * start of the physical extent, based on the origin m_lblk and m_len. * Let's also ensure we were able to allocate the required range for * mixed mapping case. */ map->m_lblk = m_lblk; map->m_len = m_len; map->m_flags = 0; ret = ext4_map_blocks(handle, inode, map, EXT4_GET_BLOCKS_QUERY_LAST_IN_LEAF); if (ret != m_len) { ext4_warning_inode(inode, "allocation failed for atomic write request m_lblk:%u, m_len:%u, ret:%d\n", m_lblk, m_len, ret); ret = -EINVAL; } return ret; out_err: /* reset map before returning an error */ map->m_lblk = m_lblk; map->m_len = m_len; map->m_flags = 0; return ret; } /* * ext4_map_blocks_atomic: Helper routine to ensure the entire requested * range in @map [lblk, lblk + len) is one single contiguous extent with no * mixed mappings. * * We first use m_flags passed to us by our caller (ext4_iomap_alloc()). * We only call EXT4_GET_BLOCKS_ZERO in the slow path, when the underlying * physical extent for the requested range does not have a single contiguous * mapping type i.e. (Hole, Mapped, or Unwritten) throughout. * In that case we will loop over the requested range to allocate and zero out * the unwritten / holes in between, to get a single mapped extent from * [m_lblk, m_lblk + m_len). Note that this is only possible because we know * this can be called only with bigalloc enabled filesystem where the underlying * cluster is already allocated. This avoids allocating discontiguous extents * in the slow path due to multiple calls to ext4_map_blocks(). * The slow path is mostly non-performance critical path, so it should be ok to * loop using ext4_map_blocks() with appropriate flags to allocate & zero the * underlying short holes/unwritten extents within the requested range. */ static int ext4_map_blocks_atomic_write(handle_t *handle, struct inode *inode, struct ext4_map_blocks *map, int m_flags, bool *force_commit) { ext4_lblk_t m_lblk = map->m_lblk; unsigned int m_len = map->m_len; int ret = 0; WARN_ON_ONCE(m_len > 1 && !ext4_has_feature_bigalloc(inode->i_sb)); ret = ext4_map_blocks(handle, inode, map, m_flags); if (ret < 0 || ret == m_len) goto out; /* * This is a mixed mapping case where we were not able to allocate * a single contiguous extent. In that case let's reset requested * mapping and call the slow path. */ map->m_lblk = m_lblk; map->m_len = m_len; map->m_flags = 0; /* * slow path means we have mixed mapping, that means we will need * to force txn commit. */ *force_commit = true; return ext4_map_blocks_atomic_write_slow(handle, inode, map); out: return ret; } static int ext4_iomap_alloc(struct inode *inode, struct ext4_map_blocks *map, unsigned int flags) { handle_t *handle; int ret, dio_credits, m_flags = 0, retries = 0; bool force_commit = false; /* * Trim the mapping request to the maximum value that we can map at * once for direct I/O. */ if (map->m_len > DIO_MAX_BLOCKS) map->m_len = DIO_MAX_BLOCKS; /* * journal credits estimation for atomic writes. We call * ext4_map_blocks(), to find if there could be a mixed mapping. 
If yes, * then let's assume the no. of pextents required can be m_len i.e. * every alternate block can be unwritten and hole. */ if (flags & IOMAP_ATOMIC) { unsigned int orig_mlen = map->m_len; ret = ext4_map_blocks(NULL, inode, map, 0); if (ret < 0) return ret; if (map->m_len < orig_mlen) { map->m_len = orig_mlen; dio_credits = ext4_meta_trans_blocks(inode, orig_mlen, map->m_len); } else { dio_credits = ext4_chunk_trans_blocks(inode, map->m_len); } } else { dio_credits = ext4_chunk_trans_blocks(inode, map->m_len); } retry: /* * Either we allocate blocks and then don't get an unwritten extent, so * in that case we have reserved enough credits. Or, the blocks are * already allocated and unwritten. In that case, the extent conversion * fits into the credits as well. */ handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS, dio_credits); if (IS_ERR(handle)) return PTR_ERR(handle); /* * DAX and direct I/O are the only two operations that are currently * supported with IOMAP_WRITE. */ WARN_ON(!(flags & (IOMAP_DAX | IOMAP_DIRECT))); if (flags & IOMAP_DAX) m_flags = EXT4_GET_BLOCKS_CREATE_ZERO; /* * We use i_size instead of i_disksize here because delalloc writeback * can complete at any point during the I/O and subsequently push the * i_disksize out to i_size. This could be beyond where direct I/O is * happening and thus expose allocated blocks to direct I/O reads. */ else if (EXT4_LBLK_TO_B(inode, map->m_lblk) >= i_size_read(inode)) m_flags = EXT4_GET_BLOCKS_CREATE; else if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) m_flags = EXT4_GET_BLOCKS_IO_CREATE_EXT; if (flags & IOMAP_ATOMIC) ret = ext4_map_blocks_atomic_write(handle, inode, map, m_flags, &force_commit); else ret = ext4_map_blocks(handle, inode, map, m_flags); /* * We cannot fill holes in indirect tree based inodes as that could * expose stale data in the case of a crash. Use the magic error code * to fallback to buffered I/O. */ if (!m_flags && !ret) ret = -ENOTBLK; ext4_journal_stop(handle); if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) goto retry; /* * Force commit the current transaction if the allocation spans a mixed * mapping range. This ensures any pending metadata updates (like * unwritten to written extents conversion) in this range are in * consistent state with the file data blocks, before performing the * actual write I/O. If the commit fails, the whole I/O must be aborted * to prevent any possible torn writes. */ if (ret > 0 && force_commit) { int ret2; ret2 = ext4_force_commit(inode->i_sb); if (ret2) return ret2; } return ret; } static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length, unsigned flags, struct iomap *iomap, struct iomap *srcmap) { int ret; struct ext4_map_blocks map; u8 blkbits = inode->i_blkbits; unsigned int orig_mlen; if ((offset >> blkbits) > EXT4_MAX_LOGICAL_BLOCK) return -EINVAL; if (WARN_ON_ONCE(ext4_has_inline_data(inode))) return -ERANGE; /* * Calculate the first and last logical blocks respectively. */ map.m_lblk = offset >> blkbits; map.m_len = min_t(loff_t, (offset + length - 1) >> blkbits, EXT4_MAX_LOGICAL_BLOCK) - map.m_lblk + 1; orig_mlen = map.m_len; if (flags & IOMAP_WRITE) { /* * We check here if the blocks are already allocated, then we * don't need to start a journal txn and we can directly return * the mapping information. This could boost performance * especially in multi-threaded overwrite requests. 
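 *
 * (Purely illustrative: several threads issuing O_DIRECT pwrite() calls
 * into already-written regions below i_size of the same file all satisfy
 * the check below and reuse the existing mapping, without each starting
 * a journal transaction.)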
*/ if (offset + length <= i_size_read(inode)) { ret = ext4_map_blocks(NULL, inode, &map, 0); /* * For atomic writes the entire requested length should * be mapped. */ if (map.m_flags & EXT4_MAP_MAPPED) { if ((!(flags & IOMAP_ATOMIC) && ret > 0) || (flags & IOMAP_ATOMIC && ret >= orig_mlen)) goto out; } map.m_len = orig_mlen; } ret = ext4_iomap_alloc(inode, &map, flags); } else { /* * This can be called for overwrites path from * ext4_iomap_overwrite_begin(). */ ret = ext4_map_blocks(NULL, inode, &map, 0); } if (ret < 0) return ret; out: /* * When inline encryption is enabled, sometimes I/O to an encrypted file * has to be broken up to guarantee DUN contiguity. Handle this by * limiting the length of the mapping returned. */ map.m_len = fscrypt_limit_io_blocks(inode, map.m_lblk, map.m_len); /* * Before returning to iomap, let's ensure the allocated mapping * covers the entire requested length for atomic writes. */ if (flags & IOMAP_ATOMIC) { if (map.m_len < (length >> blkbits)) { WARN_ON_ONCE(1); return -EINVAL; } } ext4_set_iomap(inode, iomap, &map, offset, length, flags); return 0; } static int ext4_iomap_overwrite_begin(struct inode *inode, loff_t offset, loff_t length, unsigned flags, struct iomap *iomap, struct iomap *srcmap) { int ret; /* * Even for writes we don't need to allocate blocks, so just pretend * we are reading to save overhead of starting a transaction. */ flags &= ~IOMAP_WRITE; ret = ext4_iomap_begin(inode, offset, length, flags, iomap, srcmap); WARN_ON_ONCE(!ret && iomap->type != IOMAP_MAPPED); return ret; } const struct iomap_ops ext4_iomap_ops = { .iomap_begin = ext4_iomap_begin, }; const struct iomap_ops ext4_iomap_overwrite_ops = { .iomap_begin = ext4_iomap_overwrite_begin, }; static int ext4_iomap_begin_report(struct inode *inode, loff_t offset, loff_t length, unsigned int flags, struct iomap *iomap, struct iomap *srcmap) { int ret; struct ext4_map_blocks map; u8 blkbits = inode->i_blkbits; if ((offset >> blkbits) > EXT4_MAX_LOGICAL_BLOCK) return -EINVAL; if (ext4_has_inline_data(inode)) { ret = ext4_inline_data_iomap(inode, iomap); if (ret != -EAGAIN) { if (ret == 0 && offset >= iomap->length) ret = -ENOENT; return ret; } } /* * Calculate the first and last logical block respectively. */ map.m_lblk = offset >> blkbits; map.m_len = min_t(loff_t, (offset + length - 1) >> blkbits, EXT4_MAX_LOGICAL_BLOCK) - map.m_lblk + 1; /* * Fiemap callers may call for offset beyond s_bitmap_maxbytes. * So handle it here itself instead of querying ext4_map_blocks(). * Since ext4_map_blocks() will warn about it and will return * -EIO error. */ if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); if (offset >= sbi->s_bitmap_maxbytes) { map.m_flags = 0; goto set_iomap; } } ret = ext4_map_blocks(NULL, inode, &map, 0); if (ret < 0) return ret; set_iomap: ext4_set_iomap(inode, iomap, &map, offset, length, flags); return 0; } const struct iomap_ops ext4_iomap_report_ops = { .iomap_begin = ext4_iomap_begin_report, }; /* * For data=journal mode, folio should be marked dirty only when it was * writeably mapped. When that happens, it was already attached to the * transaction and marked as jbddirty (we take care of this in * ext4_page_mkwrite()). On transaction commit, we writeprotect page mappings * so we should have nothing to do here, except for the case when someone * had the page pinned and dirtied the page through this pin (e.g. by doing * direct IO to it). 
In that case we'd need to attach buffers here to the * transaction but we cannot due to lock ordering. We cannot just dirty the * folio and leave attached buffers clean, because the buffers' dirty state is * "definitive". We cannot just set the buffers dirty or jbddirty because all * the journalling code will explode. So what we do is to mark the folio * "pending dirty" and next time ext4_writepages() is called, attach buffers * to the transaction appropriately. */ static bool ext4_journalled_dirty_folio(struct address_space *mapping, struct folio *folio) { WARN_ON_ONCE(!folio_buffers(folio)); if (folio_maybe_dma_pinned(folio)) folio_set_checked(folio); return filemap_dirty_folio(mapping, folio); } static bool ext4_dirty_folio(struct address_space *mapping, struct folio *folio) { WARN_ON_ONCE(!folio_test_locked(folio) && !folio_test_dirty(folio)); WARN_ON_ONCE(!folio_buffers(folio)); return block_dirty_folio(mapping, folio); } static int ext4_iomap_swap_activate(struct swap_info_struct *sis, struct file *file, sector_t *span) { return iomap_swapfile_activate(sis, file, span, &ext4_iomap_report_ops); } static const struct address_space_operations ext4_aops = { .read_folio = ext4_read_folio, .readahead = ext4_readahead, .writepages = ext4_writepages, .write_begin = ext4_write_begin, .write_end = ext4_write_end, .dirty_folio = ext4_dirty_folio, .bmap = ext4_bmap, .invalidate_folio = ext4_invalidate_folio, .release_folio = ext4_release_folio, .migrate_folio = buffer_migrate_folio, .is_partially_uptodate = block_is_partially_uptodate, .error_remove_folio = generic_error_remove_folio, .swap_activate = ext4_iomap_swap_activate, }; static const struct address_space_operations ext4_journalled_aops = { .read_folio = ext4_read_folio, .readahead = ext4_readahead, .writepages = ext4_writepages, .write_begin = ext4_write_begin, .write_end = ext4_journalled_write_end, .dirty_folio = ext4_journalled_dirty_folio, .bmap = ext4_bmap, .invalidate_folio = ext4_journalled_invalidate_folio, .release_folio = ext4_release_folio, .migrate_folio = buffer_migrate_folio_norefs, .is_partially_uptodate = block_is_partially_uptodate, .error_remove_folio = generic_error_remove_folio, .swap_activate = ext4_iomap_swap_activate, }; static const struct address_space_operations ext4_da_aops = { .read_folio = ext4_read_folio, .readahead = ext4_readahead, .writepages = ext4_writepages, .write_begin = ext4_da_write_begin, .write_end = ext4_da_write_end, .dirty_folio = ext4_dirty_folio, .bmap = ext4_bmap, .invalidate_folio = ext4_invalidate_folio, .release_folio = ext4_release_folio, .migrate_folio = buffer_migrate_folio, .is_partially_uptodate = block_is_partially_uptodate, .error_remove_folio = generic_error_remove_folio, .swap_activate = ext4_iomap_swap_activate, }; static const struct address_space_operations ext4_dax_aops = { .writepages = ext4_dax_writepages, .dirty_folio = noop_dirty_folio, .bmap = ext4_bmap, .swap_activate = ext4_iomap_swap_activate, }; void ext4_set_aops(struct inode *inode) { switch (ext4_inode_journal_mode(inode)) { case EXT4_INODE_ORDERED_DATA_MODE: case EXT4_INODE_WRITEBACK_DATA_MODE: break; case EXT4_INODE_JOURNAL_DATA_MODE: inode->i_mapping->a_ops = &ext4_journalled_aops; return; default: BUG(); } if (IS_DAX(inode)) inode->i_mapping->a_ops = &ext4_dax_aops; else if (test_opt(inode->i_sb, DELALLOC)) inode->i_mapping->a_ops = &ext4_da_aops; else inode->i_mapping->a_ops = &ext4_aops; } /* * Here we can't skip an unwritten buffer even though it usually reads zero * because it might have data in 
pagecache (eg, if called from ext4_zero_range, * ext4_punch_hole, etc) which needs to be properly zeroed out. Otherwise a * racing writeback can come later and flush the stale pagecache to disk. */ static int __ext4_block_zero_page_range(handle_t *handle, struct address_space *mapping, loff_t from, loff_t length) { unsigned int offset, blocksize, pos; ext4_lblk_t iblock; struct inode *inode = mapping->host; struct buffer_head *bh; struct folio *folio; int err = 0; folio = __filemap_get_folio(mapping, from >> PAGE_SHIFT, FGP_LOCK | FGP_ACCESSED | FGP_CREAT, mapping_gfp_constraint(mapping, ~__GFP_FS)); if (IS_ERR(folio)) return PTR_ERR(folio); blocksize = inode->i_sb->s_blocksize; iblock = EXT4_PG_TO_LBLK(inode, folio->index); bh = folio_buffers(folio); if (!bh) bh = create_empty_buffers(folio, blocksize, 0); /* Find the buffer that contains "offset" */ offset = offset_in_folio(folio, from); pos = blocksize; while (offset >= pos) { bh = bh->b_this_page; iblock++; pos += blocksize; } if (buffer_freed(bh)) { BUFFER_TRACE(bh, "freed: skip"); goto unlock; } if (!buffer_mapped(bh)) { BUFFER_TRACE(bh, "unmapped"); ext4_get_block(inode, iblock, bh, 0); /* unmapped? It's a hole - nothing to do */ if (!buffer_mapped(bh)) { BUFFER_TRACE(bh, "still unmapped"); goto unlock; } } /* Ok, it's mapped. Make sure it's up-to-date */ if (folio_test_uptodate(folio)) set_buffer_uptodate(bh); if (!buffer_uptodate(bh)) { err = ext4_read_bh_lock(bh, 0, true); if (err) goto unlock; if (fscrypt_inode_uses_fs_layer_crypto(inode)) { /* We expect the key to be set. */ BUG_ON(!fscrypt_has_encryption_key(inode)); err = fscrypt_decrypt_pagecache_blocks(folio, blocksize, bh_offset(bh)); if (err) { clear_buffer_uptodate(bh); goto unlock; } } } if (ext4_should_journal_data(inode)) { BUFFER_TRACE(bh, "get write access"); err = ext4_journal_get_write_access(handle, inode->i_sb, bh, EXT4_JTR_NONE); if (err) goto unlock; } folio_zero_range(folio, offset, length); BUFFER_TRACE(bh, "zeroed end of block"); if (ext4_should_journal_data(inode)) { err = ext4_dirty_journalled_data(handle, bh); } else { err = 0; mark_buffer_dirty(bh); if (ext4_should_order_data(inode)) err = ext4_jbd2_inode_add_write(handle, inode, from, length); } unlock: folio_unlock(folio); folio_put(folio); return err; } /* * ext4_block_zero_page_range() zeros out a mapping of length 'length' * starting from file offset 'from'. The range to be zero'd must * be contained with in one block. If the specified range exceeds * the end of the block it will be shortened to end of the block * that corresponds to 'from' */ static int ext4_block_zero_page_range(handle_t *handle, struct address_space *mapping, loff_t from, loff_t length) { struct inode *inode = mapping->host; unsigned blocksize = inode->i_sb->s_blocksize; unsigned int max = blocksize - (from & (blocksize - 1)); /* * correct length if it does not fall between * 'from' and the end of the block */ if (length > max || length < 0) length = max; if (IS_DAX(inode)) { return dax_zero_range(inode, from, length, NULL, &ext4_iomap_ops); } return __ext4_block_zero_page_range(handle, mapping, from, length); } /* * ext4_block_truncate_page() zeroes out a mapping from file offset `from' * up to the end of the block which corresponds to `from'. * This required during truncate. We need to physically zero the tail end * of that block so it doesn't yield old data if the file is later grown. 
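 *
 * (Worked example, assuming a 4096-byte block size: for i_size == 10000,
 * 'from' is 10000, the in-block offset is 10000 & 4095 == 1808, and the
 * zeroed length is 4096 - 1808 == 2288 bytes, i.e. bytes 10000..12287,
 * the tail of the block containing EOF.)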
*/ static int ext4_block_truncate_page(handle_t *handle, struct address_space *mapping, loff_t from) { unsigned length; unsigned blocksize; struct inode *inode = mapping->host; /* If we are processing an encrypted inode during orphan list handling */ if (IS_ENCRYPTED(inode) && !fscrypt_has_encryption_key(inode)) return 0; blocksize = i_blocksize(inode); length = blocksize - (from & (blocksize - 1)); return ext4_block_zero_page_range(handle, mapping, from, length); } int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode, loff_t lstart, loff_t length) { struct super_block *sb = inode->i_sb; struct address_space *mapping = inode->i_mapping; unsigned partial_start, partial_end; ext4_fsblk_t start, end; loff_t byte_end = (lstart + length - 1); int err = 0; partial_start = lstart & (sb->s_blocksize - 1); partial_end = byte_end & (sb->s_blocksize - 1); start = lstart >> sb->s_blocksize_bits; end = byte_end >> sb->s_blocksize_bits; /* Handle partial zero within the single block */ if (start == end && (partial_start || (partial_end != sb->s_blocksize - 1))) { err = ext4_block_zero_page_range(handle, mapping, lstart, length); return err; } /* Handle partial zero out on the start of the range */ if (partial_start) { err = ext4_block_zero_page_range(handle, mapping, lstart, sb->s_blocksize); if (err) return err; } /* Handle partial zero out on the end of the range */ if (partial_end != sb->s_blocksize - 1) err = ext4_block_zero_page_range(handle, mapping, byte_end - partial_end, partial_end + 1); return err; } int ext4_can_truncate(struct inode *inode) { if (S_ISREG(inode->i_mode)) return 1; if (S_ISDIR(inode->i_mode)) return 1; if (S_ISLNK(inode->i_mode)) return !ext4_inode_is_fast_symlink(inode); return 0; } /* * We have to make sure i_disksize gets properly updated before we truncate * page cache due to hole punching or zero range. Otherwise i_disksize update * can get lost as it may have been postponed to submission of writeback but * that will never happen if we remove the folio containing i_size from the * page cache. Also if we punch hole within i_size but above i_disksize, * following ext4_page_mkwrite() may mistakenly allocate written blocks over * the hole and thus introduce allocated blocks beyond i_disksize which is * not allowed (e2fsck would complain in case of crash). */ int ext4_update_disksize_before_punch(struct inode *inode, loff_t offset, loff_t len) { handle_t *handle; int ret; loff_t size = i_size_read(inode); WARN_ON(!inode_is_locked(inode)); if (offset > size) return 0; if (offset + len < size) size = offset + len; if (EXT4_I(inode)->i_disksize >= size) return 0; handle = ext4_journal_start(inode, EXT4_HT_MISC, 1); if (IS_ERR(handle)) return PTR_ERR(handle); ext4_update_i_disksize(inode, size); ret = ext4_mark_inode_dirty(handle, inode); ext4_journal_stop(handle); return ret; } static inline void ext4_truncate_folio(struct inode *inode, loff_t start, loff_t end) { unsigned long blocksize = i_blocksize(inode); struct folio *folio; /* Nothing to be done if no complete block needs to be truncated. 
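 *
 * (Illustrative numbers, assuming a 1024-byte block size: for the range
 * [100, 800) we get round_up(100) == 1024 >= round_down(800) == 0, so no
 * complete block is covered and we return early; for [100, 2500) we get
 * 1024 < 2048, i.e. block [1024, 2048) lies entirely inside the range.)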
*/ if (round_up(start, blocksize) >= round_down(end, blocksize)) return; folio = filemap_lock_folio(inode->i_mapping, start >> PAGE_SHIFT); if (IS_ERR(folio)) return; if (folio_mkclean(folio)) folio_mark_dirty(folio); folio_unlock(folio); folio_put(folio); } int ext4_truncate_page_cache_block_range(struct inode *inode, loff_t start, loff_t end) { unsigned long blocksize = i_blocksize(inode); int ret; /* * For journalled data we need to write (and checkpoint) pages * before discarding page cache to avoid inconsitent data on disk * in case of crash before freeing or unwritten converting trans * is committed. */ if (ext4_should_journal_data(inode)) { ret = filemap_write_and_wait_range(inode->i_mapping, start, end - 1); if (ret) return ret; goto truncate_pagecache; } /* * If the block size is less than the page size, the file's mapped * blocks within one page could be freed or converted to unwritten. * So it's necessary to remove writable userspace mappings, and then * ext4_page_mkwrite() can be called during subsequent write access * to these partial folios. */ if (!IS_ALIGNED(start | end, PAGE_SIZE) && blocksize < PAGE_SIZE && start < inode->i_size) { loff_t page_boundary = round_up(start, PAGE_SIZE); ext4_truncate_folio(inode, start, min(page_boundary, end)); if (end > page_boundary) ext4_truncate_folio(inode, round_down(end, PAGE_SIZE), end); } truncate_pagecache: truncate_pagecache_range(inode, start, end - 1); return 0; } static void ext4_wait_dax_page(struct inode *inode) { filemap_invalidate_unlock(inode->i_mapping); schedule(); filemap_invalidate_lock(inode->i_mapping); } int ext4_break_layouts(struct inode *inode) { if (WARN_ON_ONCE(!rwsem_is_locked(&inode->i_mapping->invalidate_lock))) return -EINVAL; return dax_break_layout_inode(inode, ext4_wait_dax_page); } /* * ext4_punch_hole: punches a hole in a file by releasing the blocks * associated with the given offset and length * * @inode: File inode * @offset: The offset where the hole will begin * @len: The length of the hole * * Returns: 0 on success or negative on failure */ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length) { struct inode *inode = file_inode(file); struct super_block *sb = inode->i_sb; ext4_lblk_t start_lblk, end_lblk; loff_t max_end = sb->s_maxbytes; loff_t end = offset + length; handle_t *handle; unsigned int credits; int ret; trace_ext4_punch_hole(inode, offset, length, 0); WARN_ON_ONCE(!inode_is_locked(inode)); /* * For indirect-block based inodes, make sure that the hole within * one block before last range. */ if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) max_end = EXT4_SB(sb)->s_bitmap_maxbytes - sb->s_blocksize; /* No need to punch hole beyond i_size */ if (offset >= inode->i_size || offset >= max_end) return 0; /* * If the hole extends beyond i_size, set the hole to end after * the block that contains i_size to save pointless tail block zeroing. */ if (end >= inode->i_size) end = round_up(inode->i_size, sb->s_blocksize); if (end > max_end) end = max_end; length = end - offset; /* * Attach jinode to inode for jbd2 if we do any zeroing of partial * block. 
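 *
 * (Example, assuming 4096-byte blocks: punching [8192, 16384) leaves
 * offset | end with no low bits set, so the range is block-aligned and no
 * partial-block zeroing is needed; punching [5000, 16384) is unaligned,
 * so the head of the range must be zeroed and a jinode is required.)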
*/ if (!IS_ALIGNED(offset | end, sb->s_blocksize)) { ret = ext4_inode_attach_jinode(inode); if (ret < 0) return ret; } ret = ext4_update_disksize_before_punch(inode, offset, length); if (ret) return ret; /* Now release the pages and zero block aligned part of pages*/ ret = ext4_truncate_page_cache_block_range(inode, offset, end); if (ret) return ret; if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) credits = ext4_chunk_trans_extent(inode, 2); else credits = ext4_blocks_for_truncate(inode); handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits); if (IS_ERR(handle)) { ret = PTR_ERR(handle); ext4_std_error(sb, ret); return ret; } ret = ext4_zero_partial_blocks(handle, inode, offset, length); if (ret) goto out_handle; /* If there are blocks to remove, do it */ start_lblk = EXT4_B_TO_LBLK(inode, offset); end_lblk = end >> inode->i_blkbits; if (end_lblk > start_lblk) { ext4_lblk_t hole_len = end_lblk - start_lblk; ext4_fc_track_inode(handle, inode); ext4_check_map_extents_env(inode); down_write(&EXT4_I(inode)->i_data_sem); ext4_discard_preallocations(inode); ext4_es_remove_extent(inode, start_lblk, hole_len); if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) ret = ext4_ext_remove_space(inode, start_lblk, end_lblk - 1); else ret = ext4_ind_remove_space(handle, inode, start_lblk, end_lblk); if (ret) { up_write(&EXT4_I(inode)->i_data_sem); goto out_handle; } ext4_es_insert_extent(inode, start_lblk, hole_len, ~0, EXTENT_STATUS_HOLE, 0); up_write(&EXT4_I(inode)->i_data_sem); } ext4_fc_track_range(handle, inode, start_lblk, end_lblk); ret = ext4_mark_inode_dirty(handle, inode); if (unlikely(ret)) goto out_handle; ext4_update_inode_fsync_trans(handle, inode, 1); if (IS_SYNC(inode)) ext4_handle_sync(handle); out_handle: ext4_journal_stop(handle); return ret; } int ext4_inode_attach_jinode(struct inode *inode) { struct ext4_inode_info *ei = EXT4_I(inode); struct jbd2_inode *jinode; if (ei->jinode || !EXT4_SB(inode->i_sb)->s_journal) return 0; jinode = jbd2_alloc_inode(GFP_KERNEL); spin_lock(&inode->i_lock); if (!ei->jinode) { if (!jinode) { spin_unlock(&inode->i_lock); return -ENOMEM; } ei->jinode = jinode; jbd2_journal_init_jbd_inode(ei->jinode, inode); jinode = NULL; } spin_unlock(&inode->i_lock); if (unlikely(jinode != NULL)) jbd2_free_inode(jinode); return 0; } /* * ext4_truncate() * * We block out ext4_get_block() block instantiations across the entire * transaction, and VFS/VM ensures that ext4_truncate() cannot run * simultaneously on behalf of the same inode. * * As we work through the truncate and commit bits of it to the journal there * is one core, guiding principle: the file's tree must always be consistent on * disk. We must be able to restart the truncate after a crash. * * The file's tree may be transiently inconsistent in memory (although it * probably isn't), but whenever we close off and commit a journal transaction, * the contents of (the filesystem + the journal) must be consistent and * restartable. It's pretty simple, really: bottom up, right to left (although * left-to-right works OK too). * * Note that at recovery time, journal replay occurs *before* the restart of * truncate against the orphan inode list. * * The committed inode has the new, desired i_size (which is the same as * i_disksize in this case). After a crash, ext4_orphan_cleanup() will see * that this inode's truncate did not complete and it will again call * ext4_truncate() to have another go. So there will be instantiated blocks * to the right of the truncation point in a crashed ext4 filesystem. 
But * that's fine - as long as they are linked from the inode, the post-crash * ext4_truncate() run will find them and release them. */ int ext4_truncate(struct inode *inode) { struct ext4_inode_info *ei = EXT4_I(inode); unsigned int credits; int err = 0, err2; handle_t *handle; struct address_space *mapping = inode->i_mapping; /* * There is a possibility that we're either freeing the inode * or it's a completely new inode. In those cases we might not * have i_rwsem locked because it's not necessary. */ if (!(inode_state_read_once(inode) & (I_NEW | I_FREEING))) WARN_ON(!inode_is_locked(inode)); trace_ext4_truncate_enter(inode); if (!ext4_can_truncate(inode)) goto out_trace; if (inode->i_size == 0 && !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC)) ext4_set_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE); if (ext4_has_inline_data(inode)) { int has_inline = 1; err = ext4_inline_data_truncate(inode, &has_inline); if (err || has_inline) goto out_trace; } /* If we zero-out tail of the page, we have to create jinode for jbd2 */ if (inode->i_size & (inode->i_sb->s_blocksize - 1)) { err = ext4_inode_attach_jinode(inode); if (err) goto out_trace; } if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) credits = ext4_chunk_trans_extent(inode, 1); else credits = ext4_blocks_for_truncate(inode); handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits); if (IS_ERR(handle)) { err = PTR_ERR(handle); goto out_trace; } if (inode->i_size & (inode->i_sb->s_blocksize - 1)) ext4_block_truncate_page(handle, mapping, inode->i_size); /* * We add the inode to the orphan list, so that if this * truncate spans multiple transactions, and we crash, we will * resume the truncate when the filesystem recovers. It also * marks the inode dirty, to catch the new size. * * Implication: the file must always be in a sane, consistent * truncatable state while each transaction commits. */ err = ext4_orphan_add(handle, inode); if (err) goto out_stop; ext4_fc_track_inode(handle, inode); ext4_check_map_extents_env(inode); down_write(&EXT4_I(inode)->i_data_sem); ext4_discard_preallocations(inode); if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) err = ext4_ext_truncate(handle, inode); else ext4_ind_truncate(handle, inode); up_write(&ei->i_data_sem); if (err) goto out_stop; if (IS_SYNC(inode)) ext4_handle_sync(handle); out_stop: /* * If this was a simple ftruncate() and the file will remain alive, * then we need to clear up the orphan record which we created above. * However, if this was a real unlink then we were called by * ext4_evict_inode(), and we allow that function to clean up the * orphan info for us. 
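 *
 * (Concretely: after ftruncate() the inode still has i_nlink > 0, so the
 * orphan record is removed right here; after the last unlink() i_nlink is
 * zero and ext4_evict_inode() performs that cleanup instead.)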
*/ if (inode->i_nlink) ext4_orphan_del(handle, inode); inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); err2 = ext4_mark_inode_dirty(handle, inode); if (unlikely(err2 && !err)) err = err2; ext4_journal_stop(handle); out_trace: trace_ext4_truncate_exit(inode); return err; } static inline u64 ext4_inode_peek_iversion(const struct inode *inode) { if (unlikely(EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL)) return inode_peek_iversion_raw(inode); else return inode_peek_iversion(inode); } static int ext4_inode_blocks_set(struct ext4_inode *raw_inode, struct ext4_inode_info *ei) { struct inode *inode = &(ei->vfs_inode); u64 i_blocks = READ_ONCE(inode->i_blocks); struct super_block *sb = inode->i_sb; if (i_blocks <= ~0U) { /* * i_blocks can be represented in a 32 bit variable * as multiple of 512 bytes */ raw_inode->i_blocks_lo = cpu_to_le32(i_blocks); raw_inode->i_blocks_high = 0; ext4_clear_inode_flag(inode, EXT4_INODE_HUGE_FILE); return 0; } /* * This should never happen since sb->s_maxbytes should not have * allowed this, sb->s_maxbytes was set according to the huge_file * feature in ext4_fill_super(). */ if (!ext4_has_feature_huge_file(sb)) return -EFSCORRUPTED; if (i_blocks <= 0xffffffffffffULL) { /* * i_blocks can be represented in a 48 bit variable * as multiple of 512 bytes */ raw_inode->i_blocks_lo = cpu_to_le32(i_blocks); raw_inode->i_blocks_high = cpu_to_le16(i_blocks >> 32); ext4_clear_inode_flag(inode, EXT4_INODE_HUGE_FILE); } else { ext4_set_inode_flag(inode, EXT4_INODE_HUGE_FILE); /* i_block is stored in file system block size */ i_blocks = i_blocks >> (inode->i_blkbits - 9); raw_inode->i_blocks_lo = cpu_to_le32(i_blocks); raw_inode->i_blocks_high = cpu_to_le16(i_blocks >> 32); } return 0; } static int ext4_fill_raw_inode(struct inode *inode, struct ext4_inode *raw_inode) { struct ext4_inode_info *ei = EXT4_I(inode); uid_t i_uid; gid_t i_gid; projid_t i_projid; int block; int err; err = ext4_inode_blocks_set(raw_inode, ei); raw_inode->i_mode = cpu_to_le16(inode->i_mode); i_uid = i_uid_read(inode); i_gid = i_gid_read(inode); i_projid = from_kprojid(&init_user_ns, ei->i_projid); if (!(test_opt(inode->i_sb, NO_UID32))) { raw_inode->i_uid_low = cpu_to_le16(low_16_bits(i_uid)); raw_inode->i_gid_low = cpu_to_le16(low_16_bits(i_gid)); /* * Fix up interoperability with old kernels. Otherwise, * old inodes get re-used with the upper 16 bits of the * uid/gid intact. 
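 *
 * (Example: uid 100000 is 0x186a0, stored as i_uid_low == 0x86a0 (34464)
 * with i_uid_high == 0x1. A pre-uid32 kernel reads and rewrites only
 * i_uid_low, so a deleted inode it later re-uses would keep the stale
 * high bits - hence they are cleared below for dead, untracked inodes.)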
*/ if (ei->i_dtime && !ext4_inode_orphan_tracked(inode)) { raw_inode->i_uid_high = 0; raw_inode->i_gid_high = 0; } else { raw_inode->i_uid_high = cpu_to_le16(high_16_bits(i_uid)); raw_inode->i_gid_high = cpu_to_le16(high_16_bits(i_gid)); } } else { raw_inode->i_uid_low = cpu_to_le16(fs_high2lowuid(i_uid)); raw_inode->i_gid_low = cpu_to_le16(fs_high2lowgid(i_gid)); raw_inode->i_uid_high = 0; raw_inode->i_gid_high = 0; } raw_inode->i_links_count = cpu_to_le16(inode->i_nlink); EXT4_INODE_SET_CTIME(inode, raw_inode); EXT4_INODE_SET_MTIME(inode, raw_inode); EXT4_INODE_SET_ATIME(inode, raw_inode); EXT4_EINODE_SET_XTIME(i_crtime, ei, raw_inode); raw_inode->i_dtime = cpu_to_le32(ei->i_dtime); raw_inode->i_flags = cpu_to_le32(ei->i_flags & 0xFFFFFFFF); if (likely(!test_opt2(inode->i_sb, HURD_COMPAT))) raw_inode->i_file_acl_high = cpu_to_le16(ei->i_file_acl >> 32); raw_inode->i_file_acl_lo = cpu_to_le32(ei->i_file_acl); ext4_isize_set(raw_inode, ei->i_disksize); raw_inode->i_generation = cpu_to_le32(inode->i_generation); if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) { if (old_valid_dev(inode->i_rdev)) { raw_inode->i_block[0] = cpu_to_le32(old_encode_dev(inode->i_rdev)); raw_inode->i_block[1] = 0; } else { raw_inode->i_block[0] = 0; raw_inode->i_block[1] = cpu_to_le32(new_encode_dev(inode->i_rdev)); raw_inode->i_block[2] = 0; } } else if (!ext4_has_inline_data(inode)) { for (block = 0; block < EXT4_N_BLOCKS; block++) raw_inode->i_block[block] = ei->i_data[block]; } if (likely(!test_opt2(inode->i_sb, HURD_COMPAT))) { u64 ivers = ext4_inode_peek_iversion(inode); raw_inode->i_disk_version = cpu_to_le32(ivers); if (ei->i_extra_isize) { if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi)) raw_inode->i_version_hi = cpu_to_le32(ivers >> 32); raw_inode->i_extra_isize = cpu_to_le16(ei->i_extra_isize); } } if (i_projid != EXT4_DEF_PROJID && !ext4_has_feature_project(inode->i_sb)) err = err ?: -EFSCORRUPTED; if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE && EXT4_FITS_IN_INODE(raw_inode, ei, i_projid)) raw_inode->i_projid = cpu_to_le32(i_projid); ext4_inode_csum_set(inode, raw_inode, ei); return err; } /* * ext4_get_inode_loc returns with an extra refcount against the inode's * underlying buffer_head on success. If we pass 'inode' and it does not * have in-inode xattr, we have all inode data in memory that is needed * to recreate the on-disk version of this inode. 
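 *
 * (Worked example with assumed geometry - 8192 inodes per group,
 * 4096-byte blocks and 256-byte inodes, i.e. 16 inodes per block:
 * inode 8200 lives in group (8200 - 1) / 8192 == 1, at index
 * (8200 - 1) % 8192 == 7 within that group, which is block 7 / 16 == 0 of
 * the group's inode table at byte offset (7 % 16) * 256 == 1792. The same
 * arithmetic is performed below.)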
*/ static int __ext4_get_inode_loc(struct super_block *sb, unsigned long ino, struct inode *inode, struct ext4_iloc *iloc, ext4_fsblk_t *ret_block) { struct ext4_group_desc *gdp; struct buffer_head *bh; ext4_fsblk_t block; struct blk_plug plug; int inodes_per_block, inode_offset; iloc->bh = NULL; if (ino < EXT4_ROOT_INO || ino > le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count)) return -EFSCORRUPTED; iloc->block_group = (ino - 1) / EXT4_INODES_PER_GROUP(sb); gdp = ext4_get_group_desc(sb, iloc->block_group, NULL); if (!gdp) return -EIO; /* * Figure out the offset within the block group inode table */ inodes_per_block = EXT4_SB(sb)->s_inodes_per_block; inode_offset = ((ino - 1) % EXT4_INODES_PER_GROUP(sb)); iloc->offset = (inode_offset % inodes_per_block) * EXT4_INODE_SIZE(sb); block = ext4_inode_table(sb, gdp); if ((block <= le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block)) || (block >= ext4_blocks_count(EXT4_SB(sb)->s_es))) { ext4_error(sb, "Invalid inode table block %llu in " "block_group %u", block, iloc->block_group); return -EFSCORRUPTED; } block += (inode_offset / inodes_per_block); bh = sb_getblk(sb, block); if (unlikely(!bh)) return -ENOMEM; if (ext4_buffer_uptodate(bh)) goto has_buffer; lock_buffer(bh); if (ext4_buffer_uptodate(bh)) { /* Someone brought it uptodate while we waited */ unlock_buffer(bh); goto has_buffer; } /* * If we have all information of the inode in memory and this * is the only valid inode in the block, we need not read the * block. */ if (inode && !ext4_test_inode_state(inode, EXT4_STATE_XATTR)) { struct buffer_head *bitmap_bh; int i, start; start = inode_offset & ~(inodes_per_block - 1); /* Is the inode bitmap in cache? */ bitmap_bh = sb_getblk(sb, ext4_inode_bitmap(sb, gdp)); if (unlikely(!bitmap_bh)) goto make_io; /* * If the inode bitmap isn't in cache then the * optimisation may end up performing two reads instead * of one, so skip it. */ if (!buffer_uptodate(bitmap_bh)) { brelse(bitmap_bh); goto make_io; } for (i = start; i < start + inodes_per_block; i++) { if (i == inode_offset) continue; if (ext4_test_bit(i, bitmap_bh->b_data)) break; } brelse(bitmap_bh); if (i == start + inodes_per_block) { struct ext4_inode *raw_inode = (struct ext4_inode *) (bh->b_data + iloc->offset); /* all other inodes are free, so skip I/O */ memset(bh->b_data, 0, bh->b_size); if (!ext4_test_inode_state(inode, EXT4_STATE_NEW)) ext4_fill_raw_inode(inode, raw_inode); set_buffer_uptodate(bh); unlock_buffer(bh); goto has_buffer; } } make_io: /* * If we need to do any I/O, try to pre-readahead extra * blocks from the inode table. */ blk_start_plug(&plug); if (EXT4_SB(sb)->s_inode_readahead_blks) { ext4_fsblk_t b, end, table; unsigned num; __u32 ra_blks = EXT4_SB(sb)->s_inode_readahead_blks; table = ext4_inode_table(sb, gdp); /* s_inode_readahead_blks is always a power of 2 */ b = block & ~((ext4_fsblk_t) ra_blks - 1); if (table > b) b = table; end = b + ra_blks; num = EXT4_INODES_PER_GROUP(sb); if (ext4_has_group_desc_csum(sb)) num -= ext4_itable_unused_count(sb, gdp); table += num / inodes_per_block; if (end > table) end = table; while (b <= end) ext4_sb_breadahead_unmovable(sb, b++); } /* * There are other valid inodes in the buffer, this inode * has in-inode xattrs, or we don't have this inode in memory. * Read the block from disk. 
*/ trace_ext4_load_inode(sb, ino); ext4_read_bh_nowait(bh, REQ_META | REQ_PRIO, NULL, ext4_simulate_fail(sb, EXT4_SIM_INODE_EIO)); blk_finish_plug(&plug); wait_on_buffer(bh); if (!buffer_uptodate(bh)) { if (ret_block) *ret_block = block; brelse(bh); return -EIO; } has_buffer: iloc->bh = bh; return 0; } static int __ext4_get_inode_loc_noinmem(struct inode *inode, struct ext4_iloc *iloc) { ext4_fsblk_t err_blk = 0; int ret; ret = __ext4_get_inode_loc(inode->i_sb, inode->i_ino, NULL, iloc, &err_blk); if (ret == -EIO) ext4_error_inode_block(inode, err_blk, EIO, "unable to read itable block"); return ret; } int ext4_get_inode_loc(struct inode *inode, struct ext4_iloc *iloc) { ext4_fsblk_t err_blk = 0; int ret; ret = __ext4_get_inode_loc(inode->i_sb, inode->i_ino, inode, iloc, &err_blk); if (ret == -EIO) ext4_error_inode_block(inode, err_blk, EIO, "unable to read itable block"); return ret; } int ext4_get_fc_inode_loc(struct super_block *sb, unsigned long ino, struct ext4_iloc *iloc) { return __ext4_get_inode_loc(sb, ino, NULL, iloc, NULL); } static bool ext4_should_enable_dax(struct inode *inode) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); if (test_opt2(inode->i_sb, DAX_NEVER)) return false; if (!S_ISREG(inode->i_mode)) return false; if (ext4_should_journal_data(inode)) return false; if (ext4_has_inline_data(inode)) return false; if (ext4_test_inode_flag(inode, EXT4_INODE_ENCRYPT)) return false; if (ext4_test_inode_flag(inode, EXT4_INODE_VERITY)) return false; if (!test_bit(EXT4_FLAGS_BDEV_IS_DAX, &sbi->s_ext4_flags)) return false; if (test_opt(inode->i_sb, DAX_ALWAYS)) return true; return ext4_test_inode_flag(inode, EXT4_INODE_DAX); } void ext4_set_inode_flags(struct inode *inode, bool init) { unsigned int flags = EXT4_I(inode)->i_flags; unsigned int new_fl = 0; WARN_ON_ONCE(IS_DAX(inode) && init); if (flags & EXT4_SYNC_FL) new_fl |= S_SYNC; if (flags & EXT4_APPEND_FL) new_fl |= S_APPEND; if (flags & EXT4_IMMUTABLE_FL) new_fl |= S_IMMUTABLE; if (flags & EXT4_NOATIME_FL) new_fl |= S_NOATIME; if (flags & EXT4_DIRSYNC_FL) new_fl |= S_DIRSYNC; /* Because of the way inode_set_flags() works we must preserve S_DAX * here if already set. 
*/ new_fl |= (inode->i_flags & S_DAX); if (init && ext4_should_enable_dax(inode)) new_fl |= S_DAX; if (flags & EXT4_ENCRYPT_FL) new_fl |= S_ENCRYPTED; if (flags & EXT4_CASEFOLD_FL) new_fl |= S_CASEFOLD; if (flags & EXT4_VERITY_FL) new_fl |= S_VERITY; inode_set_flags(inode, new_fl, S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC|S_DAX| S_ENCRYPTED|S_CASEFOLD|S_VERITY); } static blkcnt_t ext4_inode_blocks(struct ext4_inode *raw_inode, struct ext4_inode_info *ei) { blkcnt_t i_blocks ; struct inode *inode = &(ei->vfs_inode); struct super_block *sb = inode->i_sb; if (ext4_has_feature_huge_file(sb)) { /* we are using combined 48 bit field */ i_blocks = ((u64)le16_to_cpu(raw_inode->i_blocks_high)) << 32 | le32_to_cpu(raw_inode->i_blocks_lo); if (ext4_test_inode_flag(inode, EXT4_INODE_HUGE_FILE)) { /* i_blocks represent file system block size */ return i_blocks << (inode->i_blkbits - 9); } else { return i_blocks; } } else { return le32_to_cpu(raw_inode->i_blocks_lo); } } static inline int ext4_iget_extra_inode(struct inode *inode, struct ext4_inode *raw_inode, struct ext4_inode_info *ei) { __le32 *magic = (void *)raw_inode + EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize; if (EXT4_INODE_HAS_XATTR_SPACE(inode) && *magic == cpu_to_le32(EXT4_XATTR_MAGIC)) { int err; err = xattr_check_inode(inode, IHDR(inode, raw_inode), ITAIL(inode, raw_inode)); if (err) return err; ext4_set_inode_state(inode, EXT4_STATE_XATTR); err = ext4_find_inline_data_nolock(inode); if (!err && ext4_has_inline_data(inode)) ext4_set_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA); return err; } else EXT4_I(inode)->i_inline_off = 0; return 0; } int ext4_get_projid(struct inode *inode, kprojid_t *projid) { if (!ext4_has_feature_project(inode->i_sb)) return -EOPNOTSUPP; *projid = EXT4_I(inode)->i_projid; return 0; } /* * ext4 has self-managed i_version for ea inodes, it stores the lower 32bit of * refcount in i_version, so use raw values if inode has EXT4_EA_INODE_FL flag * set. 
*/ static inline void ext4_inode_set_iversion_queried(struct inode *inode, u64 val) { if (unlikely(EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL)) inode_set_iversion_raw(inode, val); else inode_set_iversion_queried(inode, val); } static int check_igot_inode(struct inode *inode, ext4_iget_flags flags, const char *function, unsigned int line) { const char *err_str; if (flags & EXT4_IGET_EA_INODE) { if (!(EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL)) { err_str = "missing EA_INODE flag"; goto error; } if (ext4_test_inode_state(inode, EXT4_STATE_XATTR) || EXT4_I(inode)->i_file_acl) { err_str = "ea_inode with extended attributes"; goto error; } } else { if ((EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL)) { /* * open_by_handle_at() could provide an old inode number * that has since been reused for an ea_inode; this does * not indicate filesystem corruption */ if (flags & EXT4_IGET_HANDLE) return -ESTALE; err_str = "unexpected EA_INODE flag"; goto error; } } if (is_bad_inode(inode) && !(flags & EXT4_IGET_BAD)) { err_str = "unexpected bad inode w/o EXT4_IGET_BAD"; goto error; } return 0; error: ext4_error_inode(inode, function, line, 0, "%s", err_str); return -EFSCORRUPTED; } void ext4_set_inode_mapping_order(struct inode *inode) { struct super_block *sb = inode->i_sb; u16 min_order, max_order; max_order = EXT4_SB(sb)->s_max_folio_order; if (!max_order) return; min_order = EXT4_SB(sb)->s_min_folio_order; if (!min_order && !S_ISREG(inode->i_mode)) return; if (ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA)) max_order = min_order; mapping_set_folio_order_range(inode->i_mapping, min_order, max_order); } struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, ext4_iget_flags flags, const char *function, unsigned int line) { struct ext4_iloc iloc; struct ext4_inode *raw_inode; struct ext4_inode_info *ei; struct ext4_super_block *es = EXT4_SB(sb)->s_es; struct inode *inode; journal_t *journal = EXT4_SB(sb)->s_journal; long ret; loff_t size; int block; uid_t i_uid; gid_t i_gid; projid_t i_projid; if ((!(flags & EXT4_IGET_SPECIAL) && is_special_ino(sb, ino)) || (ino < EXT4_ROOT_INO) || (ino > le32_to_cpu(es->s_inodes_count))) { if (flags & EXT4_IGET_HANDLE) return ERR_PTR(-ESTALE); __ext4_error(sb, function, line, false, EFSCORRUPTED, 0, "inode #%lu: comm %s: iget: illegal inode #", ino, current->comm); return ERR_PTR(-EFSCORRUPTED); } inode = iget_locked(sb, ino); if (!inode) return ERR_PTR(-ENOMEM); if (!(inode_state_read_once(inode) & I_NEW)) { ret = check_igot_inode(inode, flags, function, line); if (ret) { iput(inode); return ERR_PTR(ret); } return inode; } ei = EXT4_I(inode); iloc.bh = NULL; ret = __ext4_get_inode_loc_noinmem(inode, &iloc); if (ret < 0) goto bad_inode; raw_inode = ext4_raw_inode(&iloc); if ((flags & EXT4_IGET_HANDLE) && (raw_inode->i_links_count == 0) && (raw_inode->i_mode == 0)) { ret = -ESTALE; goto bad_inode; } if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) { ei->i_extra_isize = le16_to_cpu(raw_inode->i_extra_isize); if (EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize > EXT4_INODE_SIZE(inode->i_sb) || (ei->i_extra_isize & 3)) { ext4_error_inode(inode, function, line, 0, "iget: bad extra_isize %u " "(inode size %u)", ei->i_extra_isize, EXT4_INODE_SIZE(inode->i_sb)); ret = -EFSCORRUPTED; goto bad_inode; } } else ei->i_extra_isize = 0; /* Precompute checksum seed for inode metadata */ if (ext4_has_feature_metadata_csum(sb)) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); __u32 csum; __le32 inum = cpu_to_le32(inode->i_ino); __le32 gen = raw_inode->i_generation; 
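		/*
		 * The per-inode checksum seed folds the inode number and
		 * generation into the filesystem-wide seed, i.e. roughly
		 * crc32c(crc32c(s_csum_seed, inum), generation), so identical
		 * raw bytes checksum differently in different inodes.
		 */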
csum = ext4_chksum(sbi->s_csum_seed, (__u8 *)&inum, sizeof(inum)); ei->i_csum_seed = ext4_chksum(csum, (__u8 *)&gen, sizeof(gen)); } if ((!ext4_inode_csum_verify(inode, raw_inode, ei) || ext4_simulate_fail(sb, EXT4_SIM_INODE_CRC)) && (!(EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY))) { ext4_error_inode_err(inode, function, line, 0, EFSBADCRC, "iget: checksum invalid"); ret = -EFSBADCRC; goto bad_inode; } inode->i_mode = le16_to_cpu(raw_inode->i_mode); i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low); i_gid = (gid_t)le16_to_cpu(raw_inode->i_gid_low); if (ext4_has_feature_project(sb) && EXT4_INODE_SIZE(sb) > EXT4_GOOD_OLD_INODE_SIZE && EXT4_FITS_IN_INODE(raw_inode, ei, i_projid)) i_projid = (projid_t)le32_to_cpu(raw_inode->i_projid); else i_projid = EXT4_DEF_PROJID; if (!(test_opt(inode->i_sb, NO_UID32))) { i_uid |= le16_to_cpu(raw_inode->i_uid_high) << 16; i_gid |= le16_to_cpu(raw_inode->i_gid_high) << 16; } i_uid_write(inode, i_uid); i_gid_write(inode, i_gid); ei->i_projid = make_kprojid(&init_user_ns, i_projid); set_nlink(inode, le16_to_cpu(raw_inode->i_links_count)); ei->i_inline_off = 0; ei->i_dir_start_lookup = 0; ei->i_dtime = le32_to_cpu(raw_inode->i_dtime); /* We now have enough fields to check if the inode was active or not. * This is needed because nfsd might try to access dead inodes * the test is that same one that e2fsck uses * NeilBrown 1999oct15 */ if (inode->i_nlink == 0) { if ((inode->i_mode == 0 || flags & EXT4_IGET_SPECIAL || !(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_ORPHAN_FS)) && ino != EXT4_BOOT_LOADER_INO) { /* this inode is deleted or unallocated */ if (flags & EXT4_IGET_SPECIAL) { ext4_error_inode(inode, function, line, 0, "iget: special inode unallocated"); ret = -EFSCORRUPTED; } else ret = -ESTALE; goto bad_inode; } /* The only unlinked inodes we let through here have * valid i_mode and are being read by the orphan * recovery code: that's fine, we're about to complete * the process of deleting those. * OR it is the EXT4_BOOT_LOADER_INO which is * not initialized on a new filesystem. */ } ei->i_flags = le32_to_cpu(raw_inode->i_flags); ext4_set_inode_flags(inode, true); /* Detect invalid flag combination - can't have both inline data and extents */ if (ext4_test_inode_flag(inode, EXT4_INODE_INLINE_DATA) && ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { ext4_error_inode(inode, function, line, 0, "inode has both inline data and extents flags"); ret = -EFSCORRUPTED; goto bad_inode; } inode->i_blocks = ext4_inode_blocks(raw_inode, ei); ei->i_file_acl = le32_to_cpu(raw_inode->i_file_acl_lo); if (ext4_has_feature_64bit(sb)) ei->i_file_acl |= ((__u64)le16_to_cpu(raw_inode->i_file_acl_high)) << 32; inode->i_size = ext4_isize(sb, raw_inode); size = i_size_read(inode); if (size < 0 || size > ext4_get_maxbytes(inode)) { ext4_error_inode(inode, function, line, 0, "iget: bad i_size value: %lld", size); ret = -EFSCORRUPTED; goto bad_inode; } /* * If dir_index is not enabled but there's dir with INDEX flag set, * we'd normally treat htree data as empty space. But with metadata * checksumming that corrupts checksums so forbid that. 
*/ if (!ext4_has_feature_dir_index(sb) && ext4_has_feature_metadata_csum(sb) && ext4_test_inode_flag(inode, EXT4_INODE_INDEX)) { ext4_error_inode(inode, function, line, 0, "iget: Dir with htree data on filesystem without dir_index feature."); ret = -EFSCORRUPTED; goto bad_inode; } ei->i_disksize = inode->i_size; #ifdef CONFIG_QUOTA ei->i_reserved_quota = 0; #endif inode->i_generation = le32_to_cpu(raw_inode->i_generation); ei->i_block_group = iloc.block_group; ei->i_last_alloc_group = ~0; /* * NOTE! The in-memory inode i_data array is in little-endian order * even on big-endian machines: we do NOT byteswap the block numbers! */ for (block = 0; block < EXT4_N_BLOCKS; block++) ei->i_data[block] = raw_inode->i_block[block]; INIT_LIST_HEAD(&ei->i_orphan); ext4_fc_init_inode(&ei->vfs_inode); /* * Set transaction id's of transactions that have to be committed * to finish f[data]sync. We set them to currently running transaction * as we cannot be sure that the inode or some of its metadata isn't * part of the transaction - the inode could have been reclaimed and * now it is reread from disk. */ if (journal) { transaction_t *transaction; tid_t tid; read_lock(&journal->j_state_lock); if (journal->j_running_transaction) transaction = journal->j_running_transaction; else transaction = journal->j_committing_transaction; if (transaction) tid = transaction->t_tid; else tid = journal->j_commit_sequence; read_unlock(&journal->j_state_lock); ei->i_sync_tid = tid; ei->i_datasync_tid = tid; } if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) { if (ei->i_extra_isize == 0) { /* The extra space is currently unused. Use it. */ BUILD_BUG_ON(sizeof(struct ext4_inode) & 3); ei->i_extra_isize = sizeof(struct ext4_inode) - EXT4_GOOD_OLD_INODE_SIZE; } else { ret = ext4_iget_extra_inode(inode, raw_inode, ei); if (ret) goto bad_inode; } } EXT4_INODE_GET_CTIME(inode, raw_inode); EXT4_INODE_GET_ATIME(inode, raw_inode); EXT4_INODE_GET_MTIME(inode, raw_inode); EXT4_EINODE_GET_XTIME(i_crtime, ei, raw_inode); if (likely(!test_opt2(inode->i_sb, HURD_COMPAT))) { u64 ivers = le32_to_cpu(raw_inode->i_disk_version); if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) { if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi)) ivers |= (__u64)(le32_to_cpu(raw_inode->i_version_hi)) << 32; } ext4_inode_set_iversion_queried(inode, ivers); } ret = 0; if (ei->i_file_acl && !ext4_inode_block_valid(inode, ei->i_file_acl, 1)) { ext4_error_inode(inode, function, line, 0, "iget: bad extended attribute block %llu", ei->i_file_acl); ret = -EFSCORRUPTED; goto bad_inode; } else if (!ext4_has_inline_data(inode)) { /* validate the block references in the inode */ if (!(EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY) && (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || (S_ISLNK(inode->i_mode) && !ext4_inode_is_fast_symlink(inode)))) { if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) ret = ext4_ext_check_inode(inode); else ret = ext4_ind_check_inode(inode); } } if (ret) goto bad_inode; if (S_ISREG(inode->i_mode)) { inode->i_op = &ext4_file_inode_operations; inode->i_fop = &ext4_file_operations; ext4_set_aops(inode); } else if (S_ISDIR(inode->i_mode)) { inode->i_op = &ext4_dir_inode_operations; inode->i_fop = &ext4_dir_operations; } else if (S_ISLNK(inode->i_mode)) { /* VFS does not allow setting these so must be corruption */ if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) { ext4_error_inode(inode, function, line, 0, "iget: immutable or append flags " "not allowed on symlinks"); ret = -EFSCORRUPTED; goto bad_inode; } if 
(IS_ENCRYPTED(inode)) { inode->i_op = &ext4_encrypted_symlink_inode_operations; } else if (ext4_inode_is_fast_symlink(inode)) { inode->i_op = &ext4_fast_symlink_inode_operations; if (inode->i_size == 0 || inode->i_size >= sizeof(ei->i_data) || strnlen((char *)ei->i_data, inode->i_size + 1) != inode->i_size) { ext4_error_inode(inode, function, line, 0, "invalid fast symlink length %llu", (unsigned long long)inode->i_size); ret = -EFSCORRUPTED; goto bad_inode; } inode_set_cached_link(inode, (char *)ei->i_data, inode->i_size); } else { inode->i_op = &ext4_symlink_inode_operations; } } else if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode) || S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) { inode->i_op = &ext4_special_inode_operations; if (raw_inode->i_block[0]) init_special_inode(inode, inode->i_mode, old_decode_dev(le32_to_cpu(raw_inode->i_block[0]))); else init_special_inode(inode, inode->i_mode, new_decode_dev(le32_to_cpu(raw_inode->i_block[1]))); } else if (ino == EXT4_BOOT_LOADER_INO) { make_bad_inode(inode); } else { ret = -EFSCORRUPTED; ext4_error_inode(inode, function, line, 0, "iget: bogus i_mode (%o)", inode->i_mode); goto bad_inode; } if (IS_CASEFOLDED(inode) && !ext4_has_feature_casefold(inode->i_sb)) { ext4_error_inode(inode, function, line, 0, "casefold flag without casefold feature"); ret = -EFSCORRUPTED; goto bad_inode; } ext4_set_inode_mapping_order(inode); ret = check_igot_inode(inode, flags, function, line); /* * -ESTALE here means there is nothing inherently wrong with the inode, * it's just not an inode we can return for an fhandle lookup. */ if (ret == -ESTALE) { brelse(iloc.bh); unlock_new_inode(inode); iput(inode); return ERR_PTR(-ESTALE); } if (ret) goto bad_inode; brelse(iloc.bh); /* Initialize the "no ACL's" state for the simple cases */ if (!ext4_test_inode_state(inode, EXT4_STATE_XATTR) && !ei->i_file_acl) cache_no_acl(inode); unlock_new_inode(inode); return inode; bad_inode: brelse(iloc.bh); iget_failed(inode); return ERR_PTR(ret); } static void __ext4_update_other_inode_time(struct super_block *sb, unsigned long orig_ino, unsigned long ino, struct ext4_inode *raw_inode) { struct inode *inode; inode = find_inode_by_ino_rcu(sb, ino); if (!inode) return; if (!inode_is_dirtytime_only(inode)) return; spin_lock(&inode->i_lock); if (inode_is_dirtytime_only(inode)) { struct ext4_inode_info *ei = EXT4_I(inode); inode_state_clear(inode, I_DIRTY_TIME); spin_unlock(&inode->i_lock); spin_lock(&ei->i_raw_lock); EXT4_INODE_SET_CTIME(inode, raw_inode); EXT4_INODE_SET_MTIME(inode, raw_inode); EXT4_INODE_SET_ATIME(inode, raw_inode); ext4_inode_csum_set(inode, raw_inode, ei); spin_unlock(&ei->i_raw_lock); trace_ext4_other_inode_update_time(inode, orig_ino); return; } spin_unlock(&inode->i_lock); } /* * Opportunistically update the other time fields for other inodes in * the same inode table block. */ static void ext4_update_other_inodes_time(struct super_block *sb, unsigned long orig_ino, char *buf) { unsigned long ino; int i, inodes_per_block = EXT4_SB(sb)->s_inodes_per_block; int inode_size = EXT4_INODE_SIZE(sb); /* * Calculate the first inode in the inode table block. Inode * numbers are one-based. That is, the first inode in a block * (assuming 4k blocks and 256 byte inodes) is (n*16 + 1). 
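 *
 * (Example with 16 inodes per block: for orig_ino == 35 the computation
 * below yields ((35 - 1) & ~15) + 1 == 33, so the block holds inodes
 * 33..48 and we walk all of them, skipping 35 itself.)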
/*
 * Post the struct inode info into an on-disk inode location in the
 * buffer-cache.  This gobbles the caller's reference to the
 * buffer_head in the inode location struct.
 *
 * The caller must have write access to iloc->bh.
 */
static int ext4_do_update_inode(handle_t *handle,
				struct inode *inode,
				struct ext4_iloc *iloc)
{
	struct ext4_inode *raw_inode = ext4_raw_inode(iloc);
	struct ext4_inode_info *ei = EXT4_I(inode);
	struct buffer_head *bh = iloc->bh;
	struct super_block *sb = inode->i_sb;
	int err;
	int need_datasync = 0, set_large_file = 0;

	spin_lock(&ei->i_raw_lock);

	/*
	 * For fields not tracked in the in-memory inode, initialise them
	 * to zero for new inodes.
	 */
	if (ext4_test_inode_state(inode, EXT4_STATE_NEW))
		memset(raw_inode, 0, EXT4_SB(inode->i_sb)->s_inode_size);

	if (READ_ONCE(ei->i_disksize) != ext4_isize(inode->i_sb, raw_inode))
		need_datasync = 1;
	if (ei->i_disksize > 0x7fffffffULL) {
		if (!ext4_has_feature_large_file(sb) ||
		    EXT4_SB(sb)->s_es->s_rev_level == cpu_to_le32(EXT4_GOOD_OLD_REV))
			set_large_file = 1;
	}

	err = ext4_fill_raw_inode(inode, raw_inode);
	spin_unlock(&ei->i_raw_lock);
	if (err) {
		EXT4_ERROR_INODE(inode, "corrupted inode contents");
		goto out_brelse;
	}

	if (inode->i_sb->s_flags & SB_LAZYTIME)
		ext4_update_other_inodes_time(inode->i_sb, inode->i_ino,
					      bh->b_data);

	BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
	err = ext4_handle_dirty_metadata(handle, NULL, bh);
	if (err)
		goto out_error;
	ext4_clear_inode_state(inode, EXT4_STATE_NEW);
	if (set_large_file) {
		BUFFER_TRACE(EXT4_SB(sb)->s_sbh, "get write access");
		err = ext4_journal_get_write_access(handle, sb,
						    EXT4_SB(sb)->s_sbh,
						    EXT4_JTR_NONE);
		if (err)
			goto out_error;
		lock_buffer(EXT4_SB(sb)->s_sbh);
		ext4_set_feature_large_file(sb);
		ext4_superblock_csum_set(sb);
		unlock_buffer(EXT4_SB(sb)->s_sbh);
		ext4_handle_sync(handle);
		err = ext4_handle_dirty_metadata(handle, NULL,
						 EXT4_SB(sb)->s_sbh);
	}
	ext4_update_inode_fsync_trans(handle, inode, need_datasync);
out_error:
	ext4_std_error(inode->i_sb, err);
out_brelse:
	brelse(bh);
	return err;
}
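/*
 * Illustrative example for set_large_file above: an i_disksize of, say,
 * 3 GiB exceeds 0x7fffffff bytes, so on a filesystem that does not yet
 * advertise the large_file feature (or is still at the GOOD_OLD revision)
 * the superblock is updated, checksummed and journalled in the same handle
 * as the inode, and the handle is marked synchronous.
 */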
/*
 * ext4_write_inode()
 *
 * We are called from a few places:
 *
 * - Within generic_file_aio_write() -> generic_write_sync() for O_SYNC files.
 *   Here, there will be no transaction running. We wait for any running
 *   transaction to commit.
 *
 * - Within flush work (sys_sync(), kupdate and such).
 *   We wait on commit, if told to.
 *
 * - Within iput_final() -> write_inode_now()
 *   We wait on commit, if told to.
 *
 * In all cases it is actually safe for us to return without doing anything,
 * because the inode has been copied into a raw inode buffer in
 * ext4_mark_inode_dirty().  This is a correctness thing for WB_SYNC_ALL
 * writeback.
 *
 * Note that we are absolutely dependent upon all inode dirtiers doing the
 * right thing: they *must* call mark_inode_dirty() after dirtying info in
 * which we are interested.
 *
 * It would be a bug for them to not do this.  The code:
 *
 *	mark_inode_dirty(inode)
 *	stuff();
 *	inode->i_size = expr;
 *
 * is in error because write_inode() could occur while `stuff()' is running,
 * and the new i_size will be lost.  Plus the inode will no longer be on the
 * superblock's dirty inode list.
 */
int ext4_write_inode(struct inode *inode, struct writeback_control *wbc)
{
	int err;

	if (WARN_ON_ONCE(current->flags & PF_MEMALLOC))
		return 0;

	err = ext4_emergency_state(inode->i_sb);
	if (unlikely(err))
		return err;

	if (EXT4_SB(inode->i_sb)->s_journal) {
		if (ext4_journal_current_handle()) {
			ext4_debug("called recursively, non-PF_MEMALLOC!\n");
			dump_stack();
			return -EIO;
		}

		/*
		 * No need to force transaction in WB_SYNC_NONE mode. Also
		 * ext4_sync_fs() will force the commit after everything is
		 * written.
		 */
		if (wbc->sync_mode != WB_SYNC_ALL || wbc->for_sync)
			return 0;

		err = ext4_fc_commit(EXT4_SB(inode->i_sb)->s_journal,
						EXT4_I(inode)->i_sync_tid);
	} else {
		struct ext4_iloc iloc;

		err = __ext4_get_inode_loc_noinmem(inode, &iloc);
		if (err)
			return err;
		/*
		 * sync(2) will flush the whole buffer cache. No need to do
		 * it here separately for each inode.
		 */
		if (wbc->sync_mode == WB_SYNC_ALL && !wbc->for_sync)
			sync_dirty_buffer(iloc.bh);
		if (buffer_req(iloc.bh) && !buffer_uptodate(iloc.bh)) {
			ext4_error_inode_block(inode, iloc.bh->b_blocknr, EIO,
					       "IO error syncing inode");
			err = -EIO;
		}
		brelse(iloc.bh);
	}
	return err;
}

/*
 * In data=journal mode ext4_journalled_invalidate_folio() may fail to
 * invalidate buffers that are attached to a folio straddling i_size and are
 * undergoing commit.  In that case we have to wait for commit to finish and
 * try again.
 */
static void ext4_wait_for_tail_page_commit(struct inode *inode)
{
	unsigned offset;
	journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;
	tid_t commit_tid;
	int ret;
	bool has_transaction;

	offset = inode->i_size & (PAGE_SIZE - 1);
	/*
	 * If the folio is fully truncated, we don't need to wait for any commit
	 * (and we even should not as __ext4_journalled_invalidate_folio() may
	 * strip all buffers from the folio but keep the folio dirty which can
	 * then confuse e.g. concurrent ext4_writepages() seeing dirty folio
	 * without buffers).  Also we don't need to wait for any commit if all
	 * buffers in the folio remain valid.  This is most beneficial for the
	 * common case of blocksize == PAGESIZE.
	 */
	if (!offset || offset > (PAGE_SIZE - i_blocksize(inode)))
		return;
	while (1) {
		struct folio *folio = filemap_lock_folio(inode->i_mapping,
				      inode->i_size >> PAGE_SHIFT);

		if (IS_ERR(folio))
			return;
		ret = __ext4_journalled_invalidate_folio(folio, offset,
						folio_size(folio) - offset);
		folio_unlock(folio);
		folio_put(folio);
		if (ret != -EBUSY)
			return;
		has_transaction = false;
		read_lock(&journal->j_state_lock);
		if (journal->j_committing_transaction) {
			commit_tid = journal->j_committing_transaction->t_tid;
			has_transaction = true;
		}
		read_unlock(&journal->j_state_lock);
		if (has_transaction)
			jbd2_log_wait_commit(journal, commit_tid);
	}
}
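/*
 * Worked example for the early-return test above (assuming 4096-byte pages
 * and 1024-byte blocks): offset == i_size & 4095, and the function bails
 * out when offset == 0 or offset > 3072.  With offset in 3073..4095 every
 * buffer of the final folio still holds in-file data, so nothing needs
 * invalidating; only offsets 1..3072 can leave whole buffers beyond EOF and
 * therefore need the commit wait.
 */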
/*
 * ext4_setattr()
 *
 * Called from notify_change.
 *
 * We want to trap VFS attempts to truncate the file as soon as
 * possible.  In particular, we want to make sure that when the VFS
 * shrinks i_size, we put the inode on the orphan list and modify
 * i_disksize immediately, so that during the subsequent flushing of
 * dirty pages and freeing of disk blocks, we can guarantee that any
 * commit will leave the blocks being flushed in an unused state on
 * disk.  (On recovery, the inode will get truncated and the blocks will
 * be freed, so we have a strong guarantee that no future commit will
 * leave these blocks visible to the user.)
 *
 * Another thing we have to assure is that if we are in ordered mode
 * and inode is still attached to the committing transaction, we must
 * start writeout of all the dirty pages which are being truncated.
 * This way we are sure that all the data written in the previous
 * transaction are already on disk (truncate waits for pages under
 * writeback).
 *
 * Called with inode->i_rwsem down.
 */
int ext4_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
		 struct iattr *attr)
{
	struct inode *inode = d_inode(dentry);
	int error, rc = 0;
	int orphan = 0;
	const unsigned int ia_valid = attr->ia_valid;
	bool inc_ivers = true;

	error = ext4_emergency_state(inode->i_sb);
	if (unlikely(error))
		return error;

	if (unlikely(IS_IMMUTABLE(inode)))
		return -EPERM;

	if (unlikely(IS_APPEND(inode) &&
		     (ia_valid & (ATTR_MODE | ATTR_UID |
				  ATTR_GID | ATTR_TIMES_SET))))
		return -EPERM;

	error = setattr_prepare(idmap, dentry, attr);
	if (error)
		return error;

	error = fscrypt_prepare_setattr(dentry, attr);
	if (error)
		return error;

	error = fsverity_prepare_setattr(dentry, attr);
	if (error)
		return error;

	if (is_quota_modification(idmap, inode, attr)) {
		error = dquot_initialize(inode);
		if (error)
			return error;
	}

	if (i_uid_needs_update(idmap, attr, inode) ||
	    i_gid_needs_update(idmap, attr, inode)) {
		handle_t *handle;

		/* (user+group)*(old+new) structure, inode write (sb,
		 * inode block, ? - but truncate inode update has it) */
		handle = ext4_journal_start(inode, EXT4_HT_QUOTA,
			(EXT4_MAXQUOTAS_INIT_BLOCKS(inode->i_sb) +
			 EXT4_MAXQUOTAS_DEL_BLOCKS(inode->i_sb)) + 3);
		if (IS_ERR(handle)) {
			error = PTR_ERR(handle);
			goto err_out;
		}

		/* dquot_transfer() calls back ext4_get_inode_usage() which
		 * counts xattr inode references.
		 */
		down_read(&EXT4_I(inode)->xattr_sem);
		error = dquot_transfer(idmap, inode, attr);
		up_read(&EXT4_I(inode)->xattr_sem);

		if (error) {
			ext4_journal_stop(handle);
			return error;
		}
		/* Update corresponding info in inode so that everything is in
		 * one transaction */
		i_uid_update(idmap, attr, inode);
		i_gid_update(idmap, attr, inode);
		error = ext4_mark_inode_dirty(handle, inode);
		ext4_journal_stop(handle);
		if (unlikely(error)) {
			return error;
		}
	}

	if (attr->ia_valid & ATTR_SIZE) {
		handle_t *handle;
		loff_t oldsize = inode->i_size;
		loff_t old_disksize;
		int shrink = (attr->ia_size < inode->i_size);

		if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {
			struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);

			if (attr->ia_size > sbi->s_bitmap_maxbytes) {
				return -EFBIG;
			}
		}
		if (!S_ISREG(inode->i_mode)) {
			return -EINVAL;
		}

		if (attr->ia_size == inode->i_size)
			inc_ivers = false;

		if (shrink) {
			if (ext4_should_order_data(inode)) {
				error = ext4_begin_ordered_truncate(inode,
							    attr->ia_size);
				if (error)
					goto err_out;
			}
			/*
			 * Blocks are going to be removed from the inode. Wait
			 * for dio in flight.
			 */
			inode_dio_wait(inode);
		}

		filemap_invalidate_lock(inode->i_mapping);

		rc = ext4_break_layouts(inode);
		if (rc) {
			filemap_invalidate_unlock(inode->i_mapping);
			goto err_out;
		}

		if (attr->ia_size != inode->i_size) {
			/* attach jbd2 jinode for EOF folio tail zeroing */
			if (attr->ia_size & (inode->i_sb->s_blocksize - 1) ||
			    oldsize & (inode->i_sb->s_blocksize - 1)) {
				error = ext4_inode_attach_jinode(inode);
				if (error)
					goto out_mmap_sem;
			}

			handle = ext4_journal_start(inode, EXT4_HT_INODE, 3);
			if (IS_ERR(handle)) {
				error = PTR_ERR(handle);
				goto out_mmap_sem;
			}
			if (ext4_handle_valid(handle) && shrink) {
				error = ext4_orphan_add(handle, inode);
				orphan = 1;
			}
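			/*
			 * Worked example for the ext4_inode_attach_jinode()
			 * test earlier in this branch (assuming a 4096-byte
			 * block size): truncating to ia_size == 10000 gives
			 * 10000 & 4095 == 1808, a partial final block, so a
			 * jbd2 jinode is attached for EOF folio tail zeroing;
			 * a block-aligned new size (with an aligned oldsize)
			 * skips it.
			 */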
			/*
			 * Update c/mtime and tail zero the EOF folio on
			 * truncate up.  ext4_truncate() handles the shrink
			 * case below.
			 */
			if (!shrink) {
				inode_set_mtime_to_ts(inode,
					inode_set_ctime_current(inode));
				if (oldsize & (inode->i_sb->s_blocksize - 1))
					ext4_block_truncate_page(handle,
						inode->i_mapping, oldsize);
			}

			if (shrink)
				ext4_fc_track_range(handle, inode,
					(attr->ia_size > 0 ? attr->ia_size - 1 : 0) >>
					inode->i_sb->s_blocksize_bits,
					EXT_MAX_BLOCKS - 1);
			else
				ext4_fc_track_range(
					handle, inode,
					(oldsize > 0 ? oldsize - 1 : oldsize) >>
					inode->i_sb->s_blocksize_bits,
					(attr->ia_size > 0 ? attr->ia_size - 1 : 0) >>
					inode->i_sb->s_blocksize_bits);

			down_write(&EXT4_I(inode)->i_data_sem);
			old_disksize = EXT4_I(inode)->i_disksize;
			EXT4_I(inode)->i_disksize = attr->ia_size;

			/*
			 * We have to update i_size under i_data_sem together
			 * with i_disksize to avoid races with writeback code
			 * running ext4_wb_update_i_disksize().
			 */
			if (!error)
				i_size_write(inode, attr->ia_size);
			else
				EXT4_I(inode)->i_disksize = old_disksize;
			up_write(&EXT4_I(inode)->i_data_sem);
			rc = ext4_mark_inode_dirty(handle, inode);
			if (!error)
				error = rc;
			ext4_journal_stop(handle);
			if (error)
				goto out_mmap_sem;
			if (!shrink) {
				pagecache_isize_extended(inode, oldsize,
							 inode->i_size);
			} else if (ext4_should_journal_data(inode)) {
				ext4_wait_for_tail_page_commit(inode);
			}
		}
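		/*
		 * Worked example for the ext4_fc_track_range() calls above
		 * (assuming s_blocksize_bits == 12, i.e. 4 KiB blocks):
		 * shrinking to ia_size == 10000 starts the tracked range at
		 * logical block (10000 - 1) >> 12 == 2 and extends it to
		 * EXT_MAX_BLOCKS - 1, so fast commit records everything from
		 * the new EOF block onward as modified.
		 */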