Total coverage: 328208 (18%)of 1851547
377 4 11 6 1 1 1 20406 2 20145 20 3721 1 8563 17577 20284 666 11 1155 20 692 9 14 80 150 5 5 19421 15010 15072 8 2 2 1137 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 // SPDX-License-Identifier: GPL-2.0 // Generated by scripts/atomic/gen-atomic-fallback.sh // DO NOT MODIFY THIS FILE DIRECTLY #ifndef _LINUX_ATOMIC_FALLBACK_H #define _LINUX_ATOMIC_FALLBACK_H #include <linux/compiler.h> #if defined(arch_xchg) #define raw_xchg arch_xchg #elif defined(arch_xchg_relaxed) #define raw_xchg(...) \ __atomic_op_fence(arch_xchg, __VA_ARGS__) #else extern void raw_xchg_not_implemented(void); #define raw_xchg(...) raw_xchg_not_implemented() #endif #if defined(arch_xchg_acquire) #define raw_xchg_acquire arch_xchg_acquire #elif defined(arch_xchg_relaxed) #define raw_xchg_acquire(...) \ __atomic_op_acquire(arch_xchg, __VA_ARGS__) #elif defined(arch_xchg) #define raw_xchg_acquire arch_xchg #else extern void raw_xchg_acquire_not_implemented(void); #define raw_xchg_acquire(...) raw_xchg_acquire_not_implemented() #endif #if defined(arch_xchg_release) #define raw_xchg_release arch_xchg_release #elif defined(arch_xchg_relaxed) #define raw_xchg_release(...) \ __atomic_op_release(arch_xchg, __VA_ARGS__) #elif defined(arch_xchg) #define raw_xchg_release arch_xchg #else extern void raw_xchg_release_not_implemented(void); #define raw_xchg_release(...) raw_xchg_release_not_implemented() #endif #if defined(arch_xchg_relaxed) #define raw_xchg_relaxed arch_xchg_relaxed #elif defined(arch_xchg) #define raw_xchg_relaxed arch_xchg #else extern void raw_xchg_relaxed_not_implemented(void); #define raw_xchg_relaxed(...) raw_xchg_relaxed_not_implemented() #endif #if defined(arch_cmpxchg) #define raw_cmpxchg arch_cmpxchg #elif defined(arch_cmpxchg_relaxed) #define raw_cmpxchg(...) \ __atomic_op_fence(arch_cmpxchg, __VA_ARGS__) #else extern void raw_cmpxchg_not_implemented(void); #define raw_cmpxchg(...) raw_cmpxchg_not_implemented() #endif #if defined(arch_cmpxchg_acquire) #define raw_cmpxchg_acquire arch_cmpxchg_acquire #elif defined(arch_cmpxchg_relaxed) #define raw_cmpxchg_acquire(...) \ __atomic_op_acquire(arch_cmpxchg, __VA_ARGS__) #elif defined(arch_cmpxchg) #define raw_cmpxchg_acquire arch_cmpxchg #else extern void raw_cmpxchg_acquire_not_implemented(void); #define raw_cmpxchg_acquire(...) raw_cmpxchg_acquire_not_implemented() #endif #if defined(arch_cmpxchg_release) #define raw_cmpxchg_release arch_cmpxchg_release #elif defined(arch_cmpxchg_relaxed) #define raw_cmpxchg_release(...) \ __atomic_op_release(arch_cmpxchg, __VA_ARGS__) #elif defined(arch_cmpxchg) #define raw_cmpxchg_release arch_cmpxchg #else extern void raw_cmpxchg_release_not_implemented(void); #define raw_cmpxchg_release(...) raw_cmpxchg_release_not_implemented() #endif #if defined(arch_cmpxchg_relaxed) #define raw_cmpxchg_relaxed arch_cmpxchg_relaxed #elif defined(arch_cmpxchg) #define raw_cmpxchg_relaxed arch_cmpxchg #else extern void raw_cmpxchg_relaxed_not_implemented(void); #define raw_cmpxchg_relaxed(...) raw_cmpxchg_relaxed_not_implemented() #endif #if defined(arch_cmpxchg64) #define raw_cmpxchg64 arch_cmpxchg64 #elif defined(arch_cmpxchg64_relaxed) #define raw_cmpxchg64(...) \ __atomic_op_fence(arch_cmpxchg64, __VA_ARGS__) #else extern void raw_cmpxchg64_not_implemented(void); #define raw_cmpxchg64(...) raw_cmpxchg64_not_implemented() #endif #if defined(arch_cmpxchg64_acquire) #define raw_cmpxchg64_acquire arch_cmpxchg64_acquire #elif defined(arch_cmpxchg64_relaxed) #define raw_cmpxchg64_acquire(...) \ __atomic_op_acquire(arch_cmpxchg64, __VA_ARGS__) #elif defined(arch_cmpxchg64) #define raw_cmpxchg64_acquire arch_cmpxchg64 #else extern void raw_cmpxchg64_acquire_not_implemented(void); #define raw_cmpxchg64_acquire(...) raw_cmpxchg64_acquire_not_implemented() #endif #if defined(arch_cmpxchg64_release) #define raw_cmpxchg64_release arch_cmpxchg64_release #elif defined(arch_cmpxchg64_relaxed) #define raw_cmpxchg64_release(...) \ __atomic_op_release(arch_cmpxchg64, __VA_ARGS__) #elif defined(arch_cmpxchg64) #define raw_cmpxchg64_release arch_cmpxchg64 #else extern void raw_cmpxchg64_release_not_implemented(void); #define raw_cmpxchg64_release(...) raw_cmpxchg64_release_not_implemented() #endif #if defined(arch_cmpxchg64_relaxed) #define raw_cmpxchg64_relaxed arch_cmpxchg64_relaxed #elif defined(arch_cmpxchg64) #define raw_cmpxchg64_relaxed arch_cmpxchg64 #else extern void raw_cmpxchg64_relaxed_not_implemented(void); #define raw_cmpxchg64_relaxed(...) raw_cmpxchg64_relaxed_not_implemented() #endif #if defined(arch_cmpxchg128) #define raw_cmpxchg128 arch_cmpxchg128 #elif defined(arch_cmpxchg128_relaxed) #define raw_cmpxchg128(...) \ __atomic_op_fence(arch_cmpxchg128, __VA_ARGS__) #else extern void raw_cmpxchg128_not_implemented(void); #define raw_cmpxchg128(...) raw_cmpxchg128_not_implemented() #endif #if defined(arch_cmpxchg128_acquire) #define raw_cmpxchg128_acquire arch_cmpxchg128_acquire #elif defined(arch_cmpxchg128_relaxed) #define raw_cmpxchg128_acquire(...) \ __atomic_op_acquire(arch_cmpxchg128, __VA_ARGS__) #elif defined(arch_cmpxchg128) #define raw_cmpxchg128_acquire arch_cmpxchg128 #else extern void raw_cmpxchg128_acquire_not_implemented(void); #define raw_cmpxchg128_acquire(...) raw_cmpxchg128_acquire_not_implemented() #endif #if defined(arch_cmpxchg128_release) #define raw_cmpxchg128_release arch_cmpxchg128_release #elif defined(arch_cmpxchg128_relaxed) #define raw_cmpxchg128_release(...) \ __atomic_op_release(arch_cmpxchg128, __VA_ARGS__) #elif defined(arch_cmpxchg128) #define raw_cmpxchg128_release arch_cmpxchg128 #else extern void raw_cmpxchg128_release_not_implemented(void); #define raw_cmpxchg128_release(...) raw_cmpxchg128_release_not_implemented() #endif #if defined(arch_cmpxchg128_relaxed) #define raw_cmpxchg128_relaxed arch_cmpxchg128_relaxed #elif defined(arch_cmpxchg128) #define raw_cmpxchg128_relaxed arch_cmpxchg128 #else extern void raw_cmpxchg128_relaxed_not_implemented(void); #define raw_cmpxchg128_relaxed(...) raw_cmpxchg128_relaxed_not_implemented() #endif #if defined(arch_try_cmpxchg) #define raw_try_cmpxchg arch_try_cmpxchg #elif defined(arch_try_cmpxchg_relaxed) #define raw_try_cmpxchg(...) \ __atomic_op_fence(arch_try_cmpxchg, __VA_ARGS__) #else #define raw_try_cmpxchg(_ptr, _oldp, _new) \ ({ \ typeof(*(_ptr)) *___op = (_oldp), ___o = *___op, ___r; \ ___r = raw_cmpxchg((_ptr), ___o, (_new)); \ if (unlikely(___r != ___o)) \ *___op = ___r; \ likely(___r == ___o); \ }) #endif #if defined(arch_try_cmpxchg_acquire) #define raw_try_cmpxchg_acquire arch_try_cmpxchg_acquire #elif defined(arch_try_cmpxchg_relaxed) #define raw_try_cmpxchg_acquire(...) \ __atomic_op_acquire(arch_try_cmpxchg, __VA_ARGS__) #elif defined(arch_try_cmpxchg) #define raw_try_cmpxchg_acquire arch_try_cmpxchg #else #define raw_try_cmpxchg_acquire(_ptr, _oldp, _new) \ ({ \ typeof(*(_ptr)) *___op = (_oldp), ___o = *___op, ___r; \ ___r = raw_cmpxchg_acquire((_ptr), ___o, (_new)); \ if (unlikely(___r != ___o)) \ *___op = ___r; \ likely(___r == ___o); \ }) #endif #if defined(arch_try_cmpxchg_release) #define raw_try_cmpxchg_release arch_try_cmpxchg_release #elif defined(arch_try_cmpxchg_relaxed) #define raw_try_cmpxchg_release(...) \ __atomic_op_release(arch_try_cmpxchg, __VA_ARGS__) #elif defined(arch_try_cmpxchg) #define raw_try_cmpxchg_release arch_try_cmpxchg #else #define raw_try_cmpxchg_release(_ptr, _oldp, _new) \ ({ \ typeof(*(_ptr)) *___op = (_oldp), ___o = *___op, ___r; \ ___r = raw_cmpxchg_release((_ptr), ___o, (_new)); \ if (unlikely(___r != ___o)) \ *___op = ___r; \ likely(___r == ___o); \ }) #endif #if defined(arch_try_cmpxchg_relaxed) #define raw_try_cmpxchg_relaxed arch_try_cmpxchg_relaxed #elif defined(arch_try_cmpxchg) #define raw_try_cmpxchg_relaxed arch_try_cmpxchg #else #define raw_try_cmpxchg_relaxed(_ptr, _oldp, _new) \ ({ \ typeof(*(_ptr)) *___op = (_oldp), ___o = *___op, ___r; \ ___r = raw_cmpxchg_relaxed((_ptr), ___o, (_new)); \ if (unlikely(___r != ___o)) \ *___op = ___r; \ likely(___r == ___o); \ }) #endif #if defined(arch_try_cmpxchg64) #define raw_try_cmpxchg64 arch_try_cmpxchg64 #elif defined(arch_try_cmpxchg64_relaxed) #define raw_try_cmpxchg64(...) \ __atomic_op_fence(arch_try_cmpxchg64, __VA_ARGS__) #else #define raw_try_cmpxchg64(_ptr, _oldp, _new) \ ({ \ typeof(*(_ptr)) *___op = (_oldp), ___o = *___op, ___r; \ ___r = raw_cmpxchg64((_ptr), ___o, (_new)); \ if (unlikely(___r != ___o)) \ *___op = ___r; \ likely(___r == ___o); \ }) #endif #if defined(arch_try_cmpxchg64_acquire) #define raw_try_cmpxchg64_acquire arch_try_cmpxchg64_acquire #elif defined(arch_try_cmpxchg64_relaxed) #define raw_try_cmpxchg64_acquire(...) \ __atomic_op_acquire(arch_try_cmpxchg64, __VA_ARGS__) #elif defined(arch_try_cmpxchg64) #define raw_try_cmpxchg64_acquire arch_try_cmpxchg64 #else #define raw_try_cmpxchg64_acquire(_ptr, _oldp, _new) \ ({ \ typeof(*(_ptr)) *___op = (_oldp), ___o = *___op, ___r; \ ___r = raw_cmpxchg64_acquire((_ptr), ___o, (_new)); \ if (unlikely(___r != ___o)) \ *___op = ___r; \ likely(___r == ___o); \ }) #endif #if defined(arch_try_cmpxchg64_release) #define raw_try_cmpxchg64_release arch_try_cmpxchg64_release #elif defined(arch_try_cmpxchg64_relaxed) #define raw_try_cmpxchg64_release(...) \ __atomic_op_release(arch_try_cmpxchg64, __VA_ARGS__) #elif defined(arch_try_cmpxchg64) #define raw_try_cmpxchg64_release arch_try_cmpxchg64 #else #define raw_try_cmpxchg64_release(_ptr, _oldp, _new) \ ({ \ typeof(*(_ptr)) *___op = (_oldp), ___o = *___op, ___r; \ ___r = raw_cmpxchg64_release((_ptr), ___o, (_new)); \ if (unlikely(___r != ___o)) \ *___op = ___r; \ likely(___r == ___o); \ }) #endif #if defined(arch_try_cmpxchg64_relaxed) #define raw_try_cmpxchg64_relaxed arch_try_cmpxchg64_relaxed #elif defined(arch_try_cmpxchg64) #define raw_try_cmpxchg64_relaxed arch_try_cmpxchg64 #else #define raw_try_cmpxchg64_relaxed(_ptr, _oldp, _new) \ ({ \ typeof(*(_ptr)) *___op = (_oldp), ___o = *___op, ___r; \ ___r = raw_cmpxchg64_relaxed((_ptr), ___o, (_new)); \ if (unlikely(___r != ___o)) \ *___op = ___r; \ likely(___r == ___o); \ }) #endif #if defined(arch_try_cmpxchg128) #define raw_try_cmpxchg128 arch_try_cmpxchg128 #elif defined(arch_try_cmpxchg128_relaxed) #define raw_try_cmpxchg128(...) \ __atomic_op_fence(arch_try_cmpxchg128, __VA_ARGS__) #else #define raw_try_cmpxchg128(_ptr, _oldp, _new) \ ({ \ typeof(*(_ptr)) *___op = (_oldp), ___o = *___op, ___r; \ ___r = raw_cmpxchg128((_ptr), ___o, (_new)); \ if (unlikely(___r != ___o)) \ *___op = ___r; \ likely(___r == ___o); \ }) #endif #if defined(arch_try_cmpxchg128_acquire) #define raw_try_cmpxchg128_acquire arch_try_cmpxchg128_acquire #elif defined(arch_try_cmpxchg128_relaxed) #define raw_try_cmpxchg128_acquire(...) \ __atomic_op_acquire(arch_try_cmpxchg128, __VA_ARGS__) #elif defined(arch_try_cmpxchg128) #define raw_try_cmpxchg128_acquire arch_try_cmpxchg128 #else #define raw_try_cmpxchg128_acquire(_ptr, _oldp, _new) \ ({ \ typeof(*(_ptr)) *___op = (_oldp), ___o = *___op, ___r; \ ___r = raw_cmpxchg128_acquire((_ptr), ___o, (_new)); \ if (unlikely(___r != ___o)) \ *___op = ___r; \ likely(___r == ___o); \ }) #endif #if defined(arch_try_cmpxchg128_release) #define raw_try_cmpxchg128_release arch_try_cmpxchg128_release #elif defined(arch_try_cmpxchg128_relaxed) #define raw_try_cmpxchg128_release(...) \ __atomic_op_release(arch_try_cmpxchg128, __VA_ARGS__) #elif defined(arch_try_cmpxchg128) #define raw_try_cmpxchg128_release arch_try_cmpxchg128 #else #define raw_try_cmpxchg128_release(_ptr, _oldp, _new) \ ({ \ typeof(*(_ptr)) *___op = (_oldp), ___o = *___op, ___r; \ ___r = raw_cmpxchg128_release((_ptr), ___o, (_new)); \ if (unlikely(___r != ___o)) \ *___op = ___r; \ likely(___r == ___o); \ }) #endif #if defined(arch_try_cmpxchg128_relaxed) #define raw_try_cmpxchg128_relaxed arch_try_cmpxchg128_relaxed #elif defined(arch_try_cmpxchg128) #define raw_try_cmpxchg128_relaxed arch_try_cmpxchg128 #else #define raw_try_cmpxchg128_relaxed(_ptr, _oldp, _new) \ ({ \ typeof(*(_ptr)) *___op = (_oldp), ___o = *___op, ___r; \ ___r = raw_cmpxchg128_relaxed((_ptr), ___o, (_new)); \ if (unlikely(___r != ___o)) \ *___op = ___r; \ likely(___r == ___o); \ }) #endif #define raw_cmpxchg_local arch_cmpxchg_local #ifdef arch_try_cmpxchg_local #define raw_try_cmpxchg_local arch_try_cmpxchg_local #else #define raw_try_cmpxchg_local(_ptr, _oldp, _new) \ ({ \ typeof(*(_ptr)) *___op = (_oldp), ___o = *___op, ___r; \ ___r = raw_cmpxchg_local((_ptr), ___o, (_new)); \ if (unlikely(___r != ___o)) \ *___op = ___r; \ likely(___r == ___o); \ }) #endif #define raw_cmpxchg64_local arch_cmpxchg64_local #ifdef arch_try_cmpxchg64_local #define raw_try_cmpxchg64_local arch_try_cmpxchg64_local #else #define raw_try_cmpxchg64_local(_ptr, _oldp, _new) \ ({ \ typeof(*(_ptr)) *___op = (_oldp), ___o = *___op, ___r; \ ___r = raw_cmpxchg64_local((_ptr), ___o, (_new)); \ if (unlikely(___r != ___o)) \ *___op = ___r; \ likely(___r == ___o); \ }) #endif #define raw_cmpxchg128_local arch_cmpxchg128_local #ifdef arch_try_cmpxchg128_local #define raw_try_cmpxchg128_local arch_try_cmpxchg128_local #else #define raw_try_cmpxchg128_local(_ptr, _oldp, _new) \ ({ \ typeof(*(_ptr)) *___op = (_oldp), ___o = *___op, ___r; \ ___r = raw_cmpxchg128_local((_ptr), ___o, (_new)); \ if (unlikely(___r != ___o)) \ *___op = ___r; \ likely(___r == ___o); \ }) #endif #define raw_sync_cmpxchg arch_sync_cmpxchg #ifdef arch_sync_try_cmpxchg #define raw_sync_try_cmpxchg arch_sync_try_cmpxchg #else #define raw_sync_try_cmpxchg(_ptr, _oldp, _new) \ ({ \ typeof(*(_ptr)) *___op = (_oldp), ___o = *___op, ___r; \ ___r = raw_sync_cmpxchg((_ptr), ___o, (_new)); \ if (unlikely(___r != ___o)) \ *___op = ___r; \ likely(___r == ___o); \ }) #endif /** * raw_atomic_read() - atomic load with relaxed ordering * @v: pointer to atomic_t * * Atomically loads the value of @v with relaxed ordering. * * Safe to use in noinstr code; prefer atomic_read() elsewhere. * * Return: The value loaded from @v. */ static __always_inline int raw_atomic_read(const atomic_t *v) { return arch_atomic_read(v); } /** * raw_atomic_read_acquire() - atomic load with acquire ordering * @v: pointer to atomic_t * * Atomically loads the value of @v with acquire ordering. * * Safe to use in noinstr code; prefer atomic_read_acquire() elsewhere. * * Return: The value loaded from @v. */ static __always_inline int raw_atomic_read_acquire(const atomic_t *v) { #if defined(arch_atomic_read_acquire) return arch_atomic_read_acquire(v); #else int ret; if (__native_word(atomic_t)) { ret = smp_load_acquire(&(v)->counter); } else { ret = raw_atomic_read(v); __atomic_acquire_fence(); } return ret; #endif } /** * raw_atomic_set() - atomic set with relaxed ordering * @v: pointer to atomic_t * @i: int value to assign * * Atomically sets @v to @i with relaxed ordering. * * Safe to use in noinstr code; prefer atomic_set() elsewhere. * * Return: Nothing. */ static __always_inline void raw_atomic_set(atomic_t *v, int i) { arch_atomic_set(v, i); } /** * raw_atomic_set_release() - atomic set with release ordering * @v: pointer to atomic_t * @i: int value to assign * * Atomically sets @v to @i with release ordering. * * Safe to use in noinstr code; prefer atomic_set_release() elsewhere. * * Return: Nothing. */ static __always_inline void raw_atomic_set_release(atomic_t *v, int i) { #if defined(arch_atomic_set_release) arch_atomic_set_release(v, i); #else if (__native_word(atomic_t)) { smp_store_release(&(v)->counter, i); } else { __atomic_release_fence(); raw_atomic_set(v, i); } #endif } /** * raw_atomic_add() - atomic add with relaxed ordering * @i: int value to add * @v: pointer to atomic_t * * Atomically updates @v to (@v + @i) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic_add() elsewhere. * * Return: Nothing. */ static __always_inline void raw_atomic_add(int i, atomic_t *v) { arch_atomic_add(i, v); } /** * raw_atomic_add_return() - atomic add with full ordering * @i: int value to add * @v: pointer to atomic_t * * Atomically updates @v to (@v + @i) with full ordering. * * Safe to use in noinstr code; prefer atomic_add_return() elsewhere. * * Return: The updated value of @v. */ static __always_inline int raw_atomic_add_return(int i, atomic_t *v) { #if defined(arch_atomic_add_return) return arch_atomic_add_return(i, v); #elif defined(arch_atomic_add_return_relaxed) int ret; __atomic_pre_full_fence(); ret = arch_atomic_add_return_relaxed(i, v); __atomic_post_full_fence(); return ret; #else #error "Unable to define raw_atomic_add_return" #endif } /** * raw_atomic_add_return_acquire() - atomic add with acquire ordering * @i: int value to add * @v: pointer to atomic_t * * Atomically updates @v to (@v + @i) with acquire ordering. * * Safe to use in noinstr code; prefer atomic_add_return_acquire() elsewhere. * * Return: The updated value of @v. */ static __always_inline int raw_atomic_add_return_acquire(int i, atomic_t *v) { #if defined(arch_atomic_add_return_acquire) return arch_atomic_add_return_acquire(i, v); #elif defined(arch_atomic_add_return_relaxed) int ret = arch_atomic_add_return_relaxed(i, v); __atomic_acquire_fence(); return ret; #elif defined(arch_atomic_add_return) return arch_atomic_add_return(i, v); #else #error "Unable to define raw_atomic_add_return_acquire" #endif } /** * raw_atomic_add_return_release() - atomic add with release ordering * @i: int value to add * @v: pointer to atomic_t * * Atomically updates @v to (@v + @i) with release ordering. * * Safe to use in noinstr code; prefer atomic_add_return_release() elsewhere. * * Return: The updated value of @v. */ static __always_inline int raw_atomic_add_return_release(int i, atomic_t *v) { #if defined(arch_atomic_add_return_release) return arch_atomic_add_return_release(i, v); #elif defined(arch_atomic_add_return_relaxed) __atomic_release_fence(); return arch_atomic_add_return_relaxed(i, v); #elif defined(arch_atomic_add_return) return arch_atomic_add_return(i, v); #else #error "Unable to define raw_atomic_add_return_release" #endif } /** * raw_atomic_add_return_relaxed() - atomic add with relaxed ordering * @i: int value to add * @v: pointer to atomic_t * * Atomically updates @v to (@v + @i) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic_add_return_relaxed() elsewhere. * * Return: The updated value of @v. */ static __always_inline int raw_atomic_add_return_relaxed(int i, atomic_t *v) { #if defined(arch_atomic_add_return_relaxed) return arch_atomic_add_return_relaxed(i, v); #elif defined(arch_atomic_add_return) return arch_atomic_add_return(i, v); #else #error "Unable to define raw_atomic_add_return_relaxed" #endif } /** * raw_atomic_fetch_add() - atomic add with full ordering * @i: int value to add * @v: pointer to atomic_t * * Atomically updates @v to (@v + @i) with full ordering. * * Safe to use in noinstr code; prefer atomic_fetch_add() elsewhere. * * Return: The original value of @v. */ static __always_inline int raw_atomic_fetch_add(int i, atomic_t *v) { #if defined(arch_atomic_fetch_add) return arch_atomic_fetch_add(i, v); #elif defined(arch_atomic_fetch_add_relaxed) int ret; __atomic_pre_full_fence(); ret = arch_atomic_fetch_add_relaxed(i, v); __atomic_post_full_fence(); return ret; #else #error "Unable to define raw_atomic_fetch_add" #endif } /** * raw_atomic_fetch_add_acquire() - atomic add with acquire ordering * @i: int value to add * @v: pointer to atomic_t * * Atomically updates @v to (@v + @i) with acquire ordering. * * Safe to use in noinstr code; prefer atomic_fetch_add_acquire() elsewhere. * * Return: The original value of @v. */ static __always_inline int raw_atomic_fetch_add_acquire(int i, atomic_t *v) { #if defined(arch_atomic_fetch_add_acquire) return arch_atomic_fetch_add_acquire(i, v); #elif defined(arch_atomic_fetch_add_relaxed) int ret = arch_atomic_fetch_add_relaxed(i, v); __atomic_acquire_fence(); return ret; #elif defined(arch_atomic_fetch_add) return arch_atomic_fetch_add(i, v); #else #error "Unable to define raw_atomic_fetch_add_acquire" #endif } /** * raw_atomic_fetch_add_release() - atomic add with release ordering * @i: int value to add * @v: pointer to atomic_t * * Atomically updates @v to (@v + @i) with release ordering. * * Safe to use in noinstr code; prefer atomic_fetch_add_release() elsewhere. * * Return: The original value of @v. */ static __always_inline int raw_atomic_fetch_add_release(int i, atomic_t *v) { #if defined(arch_atomic_fetch_add_release) return arch_atomic_fetch_add_release(i, v); #elif defined(arch_atomic_fetch_add_relaxed) __atomic_release_fence(); return arch_atomic_fetch_add_relaxed(i, v); #elif defined(arch_atomic_fetch_add) return arch_atomic_fetch_add(i, v); #else #error "Unable to define raw_atomic_fetch_add_release" #endif } /** * raw_atomic_fetch_add_relaxed() - atomic add with relaxed ordering * @i: int value to add * @v: pointer to atomic_t * * Atomically updates @v to (@v + @i) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic_fetch_add_relaxed() elsewhere. * * Return: The original value of @v. */ static __always_inline int raw_atomic_fetch_add_relaxed(int i, atomic_t *v) { #if defined(arch_atomic_fetch_add_relaxed) return arch_atomic_fetch_add_relaxed(i, v); #elif defined(arch_atomic_fetch_add) return arch_atomic_fetch_add(i, v); #else #error "Unable to define raw_atomic_fetch_add_relaxed" #endif } /** * raw_atomic_sub() - atomic subtract with relaxed ordering * @i: int value to subtract * @v: pointer to atomic_t * * Atomically updates @v to (@v - @i) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic_sub() elsewhere. * * Return: Nothing. */ static __always_inline void raw_atomic_sub(int i, atomic_t *v) { arch_atomic_sub(i, v); } /** * raw_atomic_sub_return() - atomic subtract with full ordering * @i: int value to subtract * @v: pointer to atomic_t * * Atomically updates @v to (@v - @i) with full ordering. * * Safe to use in noinstr code; prefer atomic_sub_return() elsewhere. * * Return: The updated value of @v. */ static __always_inline int raw_atomic_sub_return(int i, atomic_t *v) { #if defined(arch_atomic_sub_return) return arch_atomic_sub_return(i, v); #elif defined(arch_atomic_sub_return_relaxed) int ret; __atomic_pre_full_fence(); ret = arch_atomic_sub_return_relaxed(i, v); __atomic_post_full_fence(); return ret; #else #error "Unable to define raw_atomic_sub_return" #endif } /** * raw_atomic_sub_return_acquire() - atomic subtract with acquire ordering * @i: int value to subtract * @v: pointer to atomic_t * * Atomically updates @v to (@v - @i) with acquire ordering. * * Safe to use in noinstr code; prefer atomic_sub_return_acquire() elsewhere. * * Return: The updated value of @v. */ static __always_inline int raw_atomic_sub_return_acquire(int i, atomic_t *v) { #if defined(arch_atomic_sub_return_acquire) return arch_atomic_sub_return_acquire(i, v); #elif defined(arch_atomic_sub_return_relaxed) int ret = arch_atomic_sub_return_relaxed(i, v); __atomic_acquire_fence(); return ret; #elif defined(arch_atomic_sub_return) return arch_atomic_sub_return(i, v); #else #error "Unable to define raw_atomic_sub_return_acquire" #endif } /** * raw_atomic_sub_return_release() - atomic subtract with release ordering * @i: int value to subtract * @v: pointer to atomic_t * * Atomically updates @v to (@v - @i) with release ordering. * * Safe to use in noinstr code; prefer atomic_sub_return_release() elsewhere. * * Return: The updated value of @v. */ static __always_inline int raw_atomic_sub_return_release(int i, atomic_t *v) { #if defined(arch_atomic_sub_return_release) return arch_atomic_sub_return_release(i, v); #elif defined(arch_atomic_sub_return_relaxed) __atomic_release_fence(); return arch_atomic_sub_return_relaxed(i, v); #elif defined(arch_atomic_sub_return) return arch_atomic_sub_return(i, v); #else #error "Unable to define raw_atomic_sub_return_release" #endif } /** * raw_atomic_sub_return_relaxed() - atomic subtract with relaxed ordering * @i: int value to subtract * @v: pointer to atomic_t * * Atomically updates @v to (@v - @i) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic_sub_return_relaxed() elsewhere. * * Return: The updated value of @v. */ static __always_inline int raw_atomic_sub_return_relaxed(int i, atomic_t *v) { #if defined(arch_atomic_sub_return_relaxed) return arch_atomic_sub_return_relaxed(i, v); #elif defined(arch_atomic_sub_return) return arch_atomic_sub_return(i, v); #else #error "Unable to define raw_atomic_sub_return_relaxed" #endif } /** * raw_atomic_fetch_sub() - atomic subtract with full ordering * @i: int value to subtract * @v: pointer to atomic_t * * Atomically updates @v to (@v - @i) with full ordering. * * Safe to use in noinstr code; prefer atomic_fetch_sub() elsewhere. * * Return: The original value of @v. */ static __always_inline int raw_atomic_fetch_sub(int i, atomic_t *v) { #if defined(arch_atomic_fetch_sub) return arch_atomic_fetch_sub(i, v); #elif defined(arch_atomic_fetch_sub_relaxed) int ret; __atomic_pre_full_fence(); ret = arch_atomic_fetch_sub_relaxed(i, v); __atomic_post_full_fence(); return ret; #else #error "Unable to define raw_atomic_fetch_sub" #endif } /** * raw_atomic_fetch_sub_acquire() - atomic subtract with acquire ordering * @i: int value to subtract * @v: pointer to atomic_t * * Atomically updates @v to (@v - @i) with acquire ordering. * * Safe to use in noinstr code; prefer atomic_fetch_sub_acquire() elsewhere. * * Return: The original value of @v. */ static __always_inline int raw_atomic_fetch_sub_acquire(int i, atomic_t *v) { #if defined(arch_atomic_fetch_sub_acquire) return arch_atomic_fetch_sub_acquire(i, v); #elif defined(arch_atomic_fetch_sub_relaxed) int ret = arch_atomic_fetch_sub_relaxed(i, v); __atomic_acquire_fence(); return ret; #elif defined(arch_atomic_fetch_sub) return arch_atomic_fetch_sub(i, v); #else #error "Unable to define raw_atomic_fetch_sub_acquire" #endif } /** * raw_atomic_fetch_sub_release() - atomic subtract with release ordering * @i: int value to subtract * @v: pointer to atomic_t * * Atomically updates @v to (@v - @i) with release ordering. * * Safe to use in noinstr code; prefer atomic_fetch_sub_release() elsewhere. * * Return: The original value of @v. */ static __always_inline int raw_atomic_fetch_sub_release(int i, atomic_t *v) { #if defined(arch_atomic_fetch_sub_release) return arch_atomic_fetch_sub_release(i, v); #elif defined(arch_atomic_fetch_sub_relaxed) __atomic_release_fence(); return arch_atomic_fetch_sub_relaxed(i, v); #elif defined(arch_atomic_fetch_sub) return arch_atomic_fetch_sub(i, v); #else #error "Unable to define raw_atomic_fetch_sub_release" #endif } /** * raw_atomic_fetch_sub_relaxed() - atomic subtract with relaxed ordering * @i: int value to subtract * @v: pointer to atomic_t * * Atomically updates @v to (@v - @i) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic_fetch_sub_relaxed() elsewhere. * * Return: The original value of @v. */ static __always_inline int raw_atomic_fetch_sub_relaxed(int i, atomic_t *v) { #if defined(arch_atomic_fetch_sub_relaxed) return arch_atomic_fetch_sub_relaxed(i, v); #elif defined(arch_atomic_fetch_sub) return arch_atomic_fetch_sub(i, v); #else #error "Unable to define raw_atomic_fetch_sub_relaxed" #endif } /** * raw_atomic_inc() - atomic increment with relaxed ordering * @v: pointer to atomic_t * * Atomically updates @v to (@v + 1) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic_inc() elsewhere. * * Return: Nothing. */ static __always_inline void raw_atomic_inc(atomic_t *v) { #if defined(arch_atomic_inc) arch_atomic_inc(v); #else raw_atomic_add(1, v); #endif } /** * raw_atomic_inc_return() - atomic increment with full ordering * @v: pointer to atomic_t * * Atomically updates @v to (@v + 1) with full ordering. * * Safe to use in noinstr code; prefer atomic_inc_return() elsewhere. * * Return: The updated value of @v. */ static __always_inline int raw_atomic_inc_return(atomic_t *v) { #if defined(arch_atomic_inc_return) return arch_atomic_inc_return(v); #elif defined(arch_atomic_inc_return_relaxed) int ret; __atomic_pre_full_fence(); ret = arch_atomic_inc_return_relaxed(v); __atomic_post_full_fence(); return ret; #else return raw_atomic_add_return(1, v); #endif } /** * raw_atomic_inc_return_acquire() - atomic increment with acquire ordering * @v: pointer to atomic_t * * Atomically updates @v to (@v + 1) with acquire ordering. * * Safe to use in noinstr code; prefer atomic_inc_return_acquire() elsewhere. * * Return: The updated value of @v. */ static __always_inline int raw_atomic_inc_return_acquire(atomic_t *v) { #if defined(arch_atomic_inc_return_acquire) return arch_atomic_inc_return_acquire(v); #elif defined(arch_atomic_inc_return_relaxed) int ret = arch_atomic_inc_return_relaxed(v); __atomic_acquire_fence(); return ret; #elif defined(arch_atomic_inc_return) return arch_atomic_inc_return(v); #else return raw_atomic_add_return_acquire(1, v); #endif } /** * raw_atomic_inc_return_release() - atomic increment with release ordering * @v: pointer to atomic_t * * Atomically updates @v to (@v + 1) with release ordering. * * Safe to use in noinstr code; prefer atomic_inc_return_release() elsewhere. * * Return: The updated value of @v. */ static __always_inline int raw_atomic_inc_return_release(atomic_t *v) { #if defined(arch_atomic_inc_return_release) return arch_atomic_inc_return_release(v); #elif defined(arch_atomic_inc_return_relaxed) __atomic_release_fence(); return arch_atomic_inc_return_relaxed(v); #elif defined(arch_atomic_inc_return) return arch_atomic_inc_return(v); #else return raw_atomic_add_return_release(1, v); #endif } /** * raw_atomic_inc_return_relaxed() - atomic increment with relaxed ordering * @v: pointer to atomic_t * * Atomically updates @v to (@v + 1) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic_inc_return_relaxed() elsewhere. * * Return: The updated value of @v. */ static __always_inline int raw_atomic_inc_return_relaxed(atomic_t *v) { #if defined(arch_atomic_inc_return_relaxed) return arch_atomic_inc_return_relaxed(v); #elif defined(arch_atomic_inc_return) return arch_atomic_inc_return(v); #else return raw_atomic_add_return_relaxed(1, v); #endif } /** * raw_atomic_fetch_inc() - atomic increment with full ordering * @v: pointer to atomic_t * * Atomically updates @v to (@v + 1) with full ordering. * * Safe to use in noinstr code; prefer atomic_fetch_inc() elsewhere. * * Return: The original value of @v. */ static __always_inline int raw_atomic_fetch_inc(atomic_t *v) { #if defined(arch_atomic_fetch_inc) return arch_atomic_fetch_inc(v); #elif defined(arch_atomic_fetch_inc_relaxed) int ret; __atomic_pre_full_fence(); ret = arch_atomic_fetch_inc_relaxed(v); __atomic_post_full_fence(); return ret; #else return raw_atomic_fetch_add(1, v); #endif } /** * raw_atomic_fetch_inc_acquire() - atomic increment with acquire ordering * @v: pointer to atomic_t * * Atomically updates @v to (@v + 1) with acquire ordering. * * Safe to use in noinstr code; prefer atomic_fetch_inc_acquire() elsewhere. * * Return: The original value of @v. */ static __always_inline int raw_atomic_fetch_inc_acquire(atomic_t *v) { #if defined(arch_atomic_fetch_inc_acquire) return arch_atomic_fetch_inc_acquire(v); #elif defined(arch_atomic_fetch_inc_relaxed) int ret = arch_atomic_fetch_inc_relaxed(v); __atomic_acquire_fence(); return ret; #elif defined(arch_atomic_fetch_inc) return arch_atomic_fetch_inc(v); #else return raw_atomic_fetch_add_acquire(1, v); #endif } /** * raw_atomic_fetch_inc_release() - atomic increment with release ordering * @v: pointer to atomic_t * * Atomically updates @v to (@v + 1) with release ordering. * * Safe to use in noinstr code; prefer atomic_fetch_inc_release() elsewhere. * * Return: The original value of @v. */ static __always_inline int raw_atomic_fetch_inc_release(atomic_t *v) { #if defined(arch_atomic_fetch_inc_release) return arch_atomic_fetch_inc_release(v); #elif defined(arch_atomic_fetch_inc_relaxed) __atomic_release_fence(); return arch_atomic_fetch_inc_relaxed(v); #elif defined(arch_atomic_fetch_inc) return arch_atomic_fetch_inc(v); #else return raw_atomic_fetch_add_release(1, v); #endif } /** * raw_atomic_fetch_inc_relaxed() - atomic increment with relaxed ordering * @v: pointer to atomic_t * * Atomically updates @v to (@v + 1) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic_fetch_inc_relaxed() elsewhere. * * Return: The original value of @v. */ static __always_inline int raw_atomic_fetch_inc_relaxed(atomic_t *v) { #if defined(arch_atomic_fetch_inc_relaxed) return arch_atomic_fetch_inc_relaxed(v); #elif defined(arch_atomic_fetch_inc) return arch_atomic_fetch_inc(v); #else return raw_atomic_fetch_add_relaxed(1, v); #endif } /** * raw_atomic_dec() - atomic decrement with relaxed ordering * @v: pointer to atomic_t * * Atomically updates @v to (@v - 1) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic_dec() elsewhere. * * Return: Nothing. */ static __always_inline void raw_atomic_dec(atomic_t *v) { #if defined(arch_atomic_dec) arch_atomic_dec(v); #else raw_atomic_sub(1, v); #endif } /** * raw_atomic_dec_return() - atomic decrement with full ordering * @v: pointer to atomic_t * * Atomically updates @v to (@v - 1) with full ordering. * * Safe to use in noinstr code; prefer atomic_dec_return() elsewhere. * * Return: The updated value of @v. */ static __always_inline int raw_atomic_dec_return(atomic_t *v) { #if defined(arch_atomic_dec_return) return arch_atomic_dec_return(v); #elif defined(arch_atomic_dec_return_relaxed) int ret; __atomic_pre_full_fence(); ret = arch_atomic_dec_return_relaxed(v); __atomic_post_full_fence(); return ret; #else return raw_atomic_sub_return(1, v); #endif } /** * raw_atomic_dec_return_acquire() - atomic decrement with acquire ordering * @v: pointer to atomic_t * * Atomically updates @v to (@v - 1) with acquire ordering. * * Safe to use in noinstr code; prefer atomic_dec_return_acquire() elsewhere. * * Return: The updated value of @v. */ static __always_inline int raw_atomic_dec_return_acquire(atomic_t *v) { #if defined(arch_atomic_dec_return_acquire) return arch_atomic_dec_return_acquire(v); #elif defined(arch_atomic_dec_return_relaxed) int ret = arch_atomic_dec_return_relaxed(v); __atomic_acquire_fence(); return ret; #elif defined(arch_atomic_dec_return) return arch_atomic_dec_return(v); #else return raw_atomic_sub_return_acquire(1, v); #endif } /** * raw_atomic_dec_return_release() - atomic decrement with release ordering * @v: pointer to atomic_t * * Atomically updates @v to (@v - 1) with release ordering. * * Safe to use in noinstr code; prefer atomic_dec_return_release() elsewhere. * * Return: The updated value of @v. */ static __always_inline int raw_atomic_dec_return_release(atomic_t *v) { #if defined(arch_atomic_dec_return_release) return arch_atomic_dec_return_release(v); #elif defined(arch_atomic_dec_return_relaxed) __atomic_release_fence(); return arch_atomic_dec_return_relaxed(v); #elif defined(arch_atomic_dec_return) return arch_atomic_dec_return(v); #else return raw_atomic_sub_return_release(1, v); #endif } /** * raw_atomic_dec_return_relaxed() - atomic decrement with relaxed ordering * @v: pointer to atomic_t * * Atomically updates @v to (@v - 1) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic_dec_return_relaxed() elsewhere. * * Return: The updated value of @v. */ static __always_inline int raw_atomic_dec_return_relaxed(atomic_t *v) { #if defined(arch_atomic_dec_return_relaxed) return arch_atomic_dec_return_relaxed(v); #elif defined(arch_atomic_dec_return) return arch_atomic_dec_return(v); #else return raw_atomic_sub_return_relaxed(1, v); #endif } /** * raw_atomic_fetch_dec() - atomic decrement with full ordering * @v: pointer to atomic_t * * Atomically updates @v to (@v - 1) with full ordering. * * Safe to use in noinstr code; prefer atomic_fetch_dec() elsewhere. * * Return: The original value of @v. */ static __always_inline int raw_atomic_fetch_dec(atomic_t *v) { #if defined(arch_atomic_fetch_dec) return arch_atomic_fetch_dec(v); #elif defined(arch_atomic_fetch_dec_relaxed) int ret; __atomic_pre_full_fence(); ret = arch_atomic_fetch_dec_relaxed(v); __atomic_post_full_fence(); return ret; #else return raw_atomic_fetch_sub(1, v); #endif } /** * raw_atomic_fetch_dec_acquire() - atomic decrement with acquire ordering * @v: pointer to atomic_t * * Atomically updates @v to (@v - 1) with acquire ordering. * * Safe to use in noinstr code; prefer atomic_fetch_dec_acquire() elsewhere. * * Return: The original value of @v. */ static __always_inline int raw_atomic_fetch_dec_acquire(atomic_t *v) { #if defined(arch_atomic_fetch_dec_acquire) return arch_atomic_fetch_dec_acquire(v); #elif defined(arch_atomic_fetch_dec_relaxed) int ret = arch_atomic_fetch_dec_relaxed(v); __atomic_acquire_fence(); return ret; #elif defined(arch_atomic_fetch_dec) return arch_atomic_fetch_dec(v); #else return raw_atomic_fetch_sub_acquire(1, v); #endif } /** * raw_atomic_fetch_dec_release() - atomic decrement with release ordering * @v: pointer to atomic_t * * Atomically updates @v to (@v - 1) with release ordering. * * Safe to use in noinstr code; prefer atomic_fetch_dec_release() elsewhere. * * Return: The original value of @v. */ static __always_inline int raw_atomic_fetch_dec_release(atomic_t *v) { #if defined(arch_atomic_fetch_dec_release) return arch_atomic_fetch_dec_release(v); #elif defined(arch_atomic_fetch_dec_relaxed) __atomic_release_fence(); return arch_atomic_fetch_dec_relaxed(v); #elif defined(arch_atomic_fetch_dec) return arch_atomic_fetch_dec(v); #else return raw_atomic_fetch_sub_release(1, v); #endif } /** * raw_atomic_fetch_dec_relaxed() - atomic decrement with relaxed ordering * @v: pointer to atomic_t * * Atomically updates @v to (@v - 1) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic_fetch_dec_relaxed() elsewhere. * * Return: The original value of @v. */ static __always_inline int raw_atomic_fetch_dec_relaxed(atomic_t *v) { #if defined(arch_atomic_fetch_dec_relaxed) return arch_atomic_fetch_dec_relaxed(v); #elif defined(arch_atomic_fetch_dec) return arch_atomic_fetch_dec(v); #else return raw_atomic_fetch_sub_relaxed(1, v); #endif } /** * raw_atomic_and() - atomic bitwise AND with relaxed ordering * @i: int value * @v: pointer to atomic_t * * Atomically updates @v to (@v & @i) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic_and() elsewhere. * * Return: Nothing. */ static __always_inline void raw_atomic_and(int i, atomic_t *v) { arch_atomic_and(i, v); } /** * raw_atomic_fetch_and() - atomic bitwise AND with full ordering * @i: int value * @v: pointer to atomic_t * * Atomically updates @v to (@v & @i) with full ordering. * * Safe to use in noinstr code; prefer atomic_fetch_and() elsewhere. * * Return: The original value of @v. */ static __always_inline int raw_atomic_fetch_and(int i, atomic_t *v) { #if defined(arch_atomic_fetch_and) return arch_atomic_fetch_and(i, v); #elif defined(arch_atomic_fetch_and_relaxed) int ret; __atomic_pre_full_fence(); ret = arch_atomic_fetch_and_relaxed(i, v); __atomic_post_full_fence(); return ret; #else #error "Unable to define raw_atomic_fetch_and" #endif } /** * raw_atomic_fetch_and_acquire() - atomic bitwise AND with acquire ordering * @i: int value * @v: pointer to atomic_t * * Atomically updates @v to (@v & @i) with acquire ordering. * * Safe to use in noinstr code; prefer atomic_fetch_and_acquire() elsewhere. * * Return: The original value of @v. */ static __always_inline int raw_atomic_fetch_and_acquire(int i, atomic_t *v) { #if defined(arch_atomic_fetch_and_acquire) return arch_atomic_fetch_and_acquire(i, v); #elif defined(arch_atomic_fetch_and_relaxed) int ret = arch_atomic_fetch_and_relaxed(i, v); __atomic_acquire_fence(); return ret; #elif defined(arch_atomic_fetch_and) return arch_atomic_fetch_and(i, v); #else #error "Unable to define raw_atomic_fetch_and_acquire" #endif } /** * raw_atomic_fetch_and_release() - atomic bitwise AND with release ordering * @i: int value * @v: pointer to atomic_t * * Atomically updates @v to (@v & @i) with release ordering. * * Safe to use in noinstr code; prefer atomic_fetch_and_release() elsewhere. * * Return: The original value of @v. */ static __always_inline int raw_atomic_fetch_and_release(int i, atomic_t *v) { #if defined(arch_atomic_fetch_and_release) return arch_atomic_fetch_and_release(i, v); #elif defined(arch_atomic_fetch_and_relaxed) __atomic_release_fence(); return arch_atomic_fetch_and_relaxed(i, v); #elif defined(arch_atomic_fetch_and) return arch_atomic_fetch_and(i, v); #else #error "Unable to define raw_atomic_fetch_and_release" #endif } /** * raw_atomic_fetch_and_relaxed() - atomic bitwise AND with relaxed ordering * @i: int value * @v: pointer to atomic_t * * Atomically updates @v to (@v & @i) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic_fetch_and_relaxed() elsewhere. * * Return: The original value of @v. */ static __always_inline int raw_atomic_fetch_and_relaxed(int i, atomic_t *v) { #if defined(arch_atomic_fetch_and_relaxed) return arch_atomic_fetch_and_relaxed(i, v); #elif defined(arch_atomic_fetch_and) return arch_atomic_fetch_and(i, v); #else #error "Unable to define raw_atomic_fetch_and_relaxed" #endif } /** * raw_atomic_andnot() - atomic bitwise AND NOT with relaxed ordering * @i: int value * @v: pointer to atomic_t * * Atomically updates @v to (@v & ~@i) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic_andnot() elsewhere. * * Return: Nothing. */ static __always_inline void raw_atomic_andnot(int i, atomic_t *v) { #if defined(arch_atomic_andnot) arch_atomic_andnot(i, v); #else raw_atomic_and(~i, v); #endif } /** * raw_atomic_fetch_andnot() - atomic bitwise AND NOT with full ordering * @i: int value * @v: pointer to atomic_t * * Atomically updates @v to (@v & ~@i) with full ordering. * * Safe to use in noinstr code; prefer atomic_fetch_andnot() elsewhere. * * Return: The original value of @v. */ static __always_inline int raw_atomic_fetch_andnot(int i, atomic_t *v) { #if defined(arch_atomic_fetch_andnot) return arch_atomic_fetch_andnot(i, v); #elif defined(arch_atomic_fetch_andnot_relaxed) int ret; __atomic_pre_full_fence(); ret = arch_atomic_fetch_andnot_relaxed(i, v); __atomic_post_full_fence(); return ret; #else return raw_atomic_fetch_and(~i, v); #endif } /** * raw_atomic_fetch_andnot_acquire() - atomic bitwise AND NOT with acquire ordering * @i: int value * @v: pointer to atomic_t * * Atomically updates @v to (@v & ~@i) with acquire ordering. * * Safe to use in noinstr code; prefer atomic_fetch_andnot_acquire() elsewhere. * * Return: The original value of @v. */ static __always_inline int raw_atomic_fetch_andnot_acquire(int i, atomic_t *v) { #if defined(arch_atomic_fetch_andnot_acquire) return arch_atomic_fetch_andnot_acquire(i, v); #elif defined(arch_atomic_fetch_andnot_relaxed) int ret = arch_atomic_fetch_andnot_relaxed(i, v); __atomic_acquire_fence(); return ret; #elif defined(arch_atomic_fetch_andnot) return arch_atomic_fetch_andnot(i, v); #else return raw_atomic_fetch_and_acquire(~i, v); #endif } /** * raw_atomic_fetch_andnot_release() - atomic bitwise AND NOT with release ordering * @i: int value * @v: pointer to atomic_t * * Atomically updates @v to (@v & ~@i) with release ordering. * * Safe to use in noinstr code; prefer atomic_fetch_andnot_release() elsewhere. * * Return: The original value of @v. */ static __always_inline int raw_atomic_fetch_andnot_release(int i, atomic_t *v) { #if defined(arch_atomic_fetch_andnot_release) return arch_atomic_fetch_andnot_release(i, v); #elif defined(arch_atomic_fetch_andnot_relaxed) __atomic_release_fence(); return arch_atomic_fetch_andnot_relaxed(i, v); #elif defined(arch_atomic_fetch_andnot) return arch_atomic_fetch_andnot(i, v); #else return raw_atomic_fetch_and_release(~i, v); #endif } /** * raw_atomic_fetch_andnot_relaxed() - atomic bitwise AND NOT with relaxed ordering * @i: int value * @v: pointer to atomic_t * * Atomically updates @v to (@v & ~@i) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic_fetch_andnot_relaxed() elsewhere. * * Return: The original value of @v. */ static __always_inline int raw_atomic_fetch_andnot_relaxed(int i, atomic_t *v) { #if defined(arch_atomic_fetch_andnot_relaxed) return arch_atomic_fetch_andnot_relaxed(i, v); #elif defined(arch_atomic_fetch_andnot) return arch_atomic_fetch_andnot(i, v); #else return raw_atomic_fetch_and_relaxed(~i, v); #endif } /** * raw_atomic_or() - atomic bitwise OR with relaxed ordering * @i: int value * @v: pointer to atomic_t * * Atomically updates @v to (@v | @i) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic_or() elsewhere. * * Return: Nothing. */ static __always_inline void raw_atomic_or(int i, atomic_t *v) { arch_atomic_or(i, v); } /** * raw_atomic_fetch_or() - atomic bitwise OR with full ordering * @i: int value * @v: pointer to atomic_t * * Atomically updates @v to (@v | @i) with full ordering. * * Safe to use in noinstr code; prefer atomic_fetch_or() elsewhere. * * Return: The original value of @v. */ static __always_inline int raw_atomic_fetch_or(int i, atomic_t *v) { #if defined(arch_atomic_fetch_or) return arch_atomic_fetch_or(i, v); #elif defined(arch_atomic_fetch_or_relaxed) int ret; __atomic_pre_full_fence(); ret = arch_atomic_fetch_or_relaxed(i, v); __atomic_post_full_fence(); return ret; #else #error "Unable to define raw_atomic_fetch_or" #endif } /** * raw_atomic_fetch_or_acquire() - atomic bitwise OR with acquire ordering * @i: int value * @v: pointer to atomic_t * * Atomically updates @v to (@v | @i) with acquire ordering. * * Safe to use in noinstr code; prefer atomic_fetch_or_acquire() elsewhere. * * Return: The original value of @v. */ static __always_inline int raw_atomic_fetch_or_acquire(int i, atomic_t *v) { #if defined(arch_atomic_fetch_or_acquire) return arch_atomic_fetch_or_acquire(i, v); #elif defined(arch_atomic_fetch_or_relaxed) int ret = arch_atomic_fetch_or_relaxed(i, v); __atomic_acquire_fence(); return ret; #elif defined(arch_atomic_fetch_or) return arch_atomic_fetch_or(i, v); #else #error "Unable to define raw_atomic_fetch_or_acquire" #endif } /** * raw_atomic_fetch_or_release() - atomic bitwise OR with release ordering * @i: int value * @v: pointer to atomic_t * * Atomically updates @v to (@v | @i) with release ordering. * * Safe to use in noinstr code; prefer atomic_fetch_or_release() elsewhere. * * Return: The original value of @v. */ static __always_inline int raw_atomic_fetch_or_release(int i, atomic_t *v) { #if defined(arch_atomic_fetch_or_release) return arch_atomic_fetch_or_release(i, v); #elif defined(arch_atomic_fetch_or_relaxed) __atomic_release_fence(); return arch_atomic_fetch_or_relaxed(i, v); #elif defined(arch_atomic_fetch_or) return arch_atomic_fetch_or(i, v); #else #error "Unable to define raw_atomic_fetch_or_release" #endif } /** * raw_atomic_fetch_or_relaxed() - atomic bitwise OR with relaxed ordering * @i: int value * @v: pointer to atomic_t * * Atomically updates @v to (@v | @i) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic_fetch_or_relaxed() elsewhere. * * Return: The original value of @v. */ static __always_inline int raw_atomic_fetch_or_relaxed(int i, atomic_t *v) { #if defined(arch_atomic_fetch_or_relaxed) return arch_atomic_fetch_or_relaxed(i, v); #elif defined(arch_atomic_fetch_or) return arch_atomic_fetch_or(i, v); #else #error "Unable to define raw_atomic_fetch_or_relaxed" #endif } /** * raw_atomic_xor() - atomic bitwise XOR with relaxed ordering * @i: int value * @v: pointer to atomic_t * * Atomically updates @v to (@v ^ @i) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic_xor() elsewhere. * * Return: Nothing. */ static __always_inline void raw_atomic_xor(int i, atomic_t *v) { arch_atomic_xor(i, v); } /** * raw_atomic_fetch_xor() - atomic bitwise XOR with full ordering * @i: int value * @v: pointer to atomic_t * * Atomically updates @v to (@v ^ @i) with full ordering. * * Safe to use in noinstr code; prefer atomic_fetch_xor() elsewhere. * * Return: The original value of @v. */ static __always_inline int raw_atomic_fetch_xor(int i, atomic_t *v) { #if defined(arch_atomic_fetch_xor) return arch_atomic_fetch_xor(i, v); #elif defined(arch_atomic_fetch_xor_relaxed) int ret; __atomic_pre_full_fence(); ret = arch_atomic_fetch_xor_relaxed(i, v); __atomic_post_full_fence(); return ret; #else #error "Unable to define raw_atomic_fetch_xor" #endif } /** * raw_atomic_fetch_xor_acquire() - atomic bitwise XOR with acquire ordering * @i: int value * @v: pointer to atomic_t * * Atomically updates @v to (@v ^ @i) with acquire ordering. * * Safe to use in noinstr code; prefer atomic_fetch_xor_acquire() elsewhere. * * Return: The original value of @v. */ static __always_inline int raw_atomic_fetch_xor_acquire(int i, atomic_t *v) { #if defined(arch_atomic_fetch_xor_acquire) return arch_atomic_fetch_xor_acquire(i, v); #elif defined(arch_atomic_fetch_xor_relaxed) int ret = arch_atomic_fetch_xor_relaxed(i, v); __atomic_acquire_fence(); return ret; #elif defined(arch_atomic_fetch_xor) return arch_atomic_fetch_xor(i, v); #else #error "Unable to define raw_atomic_fetch_xor_acquire" #endif } /** * raw_atomic_fetch_xor_release() - atomic bitwise XOR with release ordering * @i: int value * @v: pointer to atomic_t * * Atomically updates @v to (@v ^ @i) with release ordering. * * Safe to use in noinstr code; prefer atomic_fetch_xor_release() elsewhere. * * Return: The original value of @v. */ static __always_inline int raw_atomic_fetch_xor_release(int i, atomic_t *v) { #if defined(arch_atomic_fetch_xor_release) return arch_atomic_fetch_xor_release(i, v); #elif defined(arch_atomic_fetch_xor_relaxed) __atomic_release_fence(); return arch_atomic_fetch_xor_relaxed(i, v); #elif defined(arch_atomic_fetch_xor) return arch_atomic_fetch_xor(i, v); #else #error "Unable to define raw_atomic_fetch_xor_release" #endif } /** * raw_atomic_fetch_xor_relaxed() - atomic bitwise XOR with relaxed ordering * @i: int value * @v: pointer to atomic_t * * Atomically updates @v to (@v ^ @i) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic_fetch_xor_relaxed() elsewhere. * * Return: The original value of @v. */ static __always_inline int raw_atomic_fetch_xor_relaxed(int i, atomic_t *v) { #if defined(arch_atomic_fetch_xor_relaxed) return arch_atomic_fetch_xor_relaxed(i, v); #elif defined(arch_atomic_fetch_xor) return arch_atomic_fetch_xor(i, v); #else #error "Unable to define raw_atomic_fetch_xor_relaxed" #endif } /** * raw_atomic_xchg() - atomic exchange with full ordering * @v: pointer to atomic_t * @new: int value to assign * * Atomically updates @v to @new with full ordering. * * Safe to use in noinstr code; prefer atomic_xchg() elsewhere. * * Return: The original value of @v. */ static __always_inline int raw_atomic_xchg(atomic_t *v, int new) { #if defined(arch_atomic_xchg) return arch_atomic_xchg(v, new); #elif defined(arch_atomic_xchg_relaxed) int ret; __atomic_pre_full_fence(); ret = arch_atomic_xchg_relaxed(v, new); __atomic_post_full_fence(); return ret; #else return raw_xchg(&v->counter, new); #endif } /** * raw_atomic_xchg_acquire() - atomic exchange with acquire ordering * @v: pointer to atomic_t * @new: int value to assign * * Atomically updates @v to @new with acquire ordering. * * Safe to use in noinstr code; prefer atomic_xchg_acquire() elsewhere. * * Return: The original value of @v. */ static __always_inline int raw_atomic_xchg_acquire(atomic_t *v, int new) { #if defined(arch_atomic_xchg_acquire) return arch_atomic_xchg_acquire(v, new); #elif defined(arch_atomic_xchg_relaxed) int ret = arch_atomic_xchg_relaxed(v, new); __atomic_acquire_fence(); return ret; #elif defined(arch_atomic_xchg) return arch_atomic_xchg(v, new); #else return raw_xchg_acquire(&v->counter, new); #endif } /** * raw_atomic_xchg_release() - atomic exchange with release ordering * @v: pointer to atomic_t * @new: int value to assign * * Atomically updates @v to @new with release ordering. * * Safe to use in noinstr code; prefer atomic_xchg_release() elsewhere. * * Return: The original value of @v. */ static __always_inline int raw_atomic_xchg_release(atomic_t *v, int new) { #if defined(arch_atomic_xchg_release) return arch_atomic_xchg_release(v, new); #elif defined(arch_atomic_xchg_relaxed) __atomic_release_fence(); return arch_atomic_xchg_relaxed(v, new); #elif defined(arch_atomic_xchg) return arch_atomic_xchg(v, new); #else return raw_xchg_release(&v->counter, new); #endif } /** * raw_atomic_xchg_relaxed() - atomic exchange with relaxed ordering * @v: pointer to atomic_t * @new: int value to assign * * Atomically updates @v to @new with relaxed ordering. * * Safe to use in noinstr code; prefer atomic_xchg_relaxed() elsewhere. * * Return: The original value of @v. */ static __always_inline int raw_atomic_xchg_relaxed(atomic_t *v, int new) { #if defined(arch_atomic_xchg_relaxed) return arch_atomic_xchg_relaxed(v, new); #elif defined(arch_atomic_xchg) return arch_atomic_xchg(v, new); #else return raw_xchg_relaxed(&v->counter, new); #endif } /** * raw_atomic_cmpxchg() - atomic compare and exchange with full ordering * @v: pointer to atomic_t * @old: int value to compare with * @new: int value to assign * * If (@v == @old), atomically updates @v to @new with full ordering. * Otherwise, @v is not modified and relaxed ordering is provided. * * Safe to use in noinstr code; prefer atomic_cmpxchg() elsewhere. * * Return: The original value of @v. */ static __always_inline int raw_atomic_cmpxchg(atomic_t *v, int old, int new) { #if defined(arch_atomic_cmpxchg) return arch_atomic_cmpxchg(v, old, new); #elif defined(arch_atomic_cmpxchg_relaxed) int ret; __atomic_pre_full_fence(); ret = arch_atomic_cmpxchg_relaxed(v, old, new); __atomic_post_full_fence(); return ret; #else return raw_cmpxchg(&v->counter, old, new); #endif } /** * raw_atomic_cmpxchg_acquire() - atomic compare and exchange with acquire ordering * @v: pointer to atomic_t * @old: int value to compare with * @new: int value to assign * * If (@v == @old), atomically updates @v to @new with acquire ordering. * Otherwise, @v is not modified and relaxed ordering is provided. * * Safe to use in noinstr code; prefer atomic_cmpxchg_acquire() elsewhere. * * Return: The original value of @v. */ static __always_inline int raw_atomic_cmpxchg_acquire(atomic_t *v, int old, int new) { #if defined(arch_atomic_cmpxchg_acquire) return arch_atomic_cmpxchg_acquire(v, old, new); #elif defined(arch_atomic_cmpxchg_relaxed) int ret = arch_atomic_cmpxchg_relaxed(v, old, new); __atomic_acquire_fence(); return ret; #elif defined(arch_atomic_cmpxchg) return arch_atomic_cmpxchg(v, old, new); #else return raw_cmpxchg_acquire(&v->counter, old, new); #endif } /** * raw_atomic_cmpxchg_release() - atomic compare and exchange with release ordering * @v: pointer to atomic_t * @old: int value to compare with * @new: int value to assign * * If (@v == @old), atomically updates @v to @new with release ordering. * Otherwise, @v is not modified and relaxed ordering is provided. * * Safe to use in noinstr code; prefer atomic_cmpxchg_release() elsewhere. * * Return: The original value of @v. */ static __always_inline int raw_atomic_cmpxchg_release(atomic_t *v, int old, int new) { #if defined(arch_atomic_cmpxchg_release) return arch_atomic_cmpxchg_release(v, old, new); #elif defined(arch_atomic_cmpxchg_relaxed) __atomic_release_fence(); return arch_atomic_cmpxchg_relaxed(v, old, new); #elif defined(arch_atomic_cmpxchg) return arch_atomic_cmpxchg(v, old, new); #else return raw_cmpxchg_release(&v->counter, old, new); #endif } /** * raw_atomic_cmpxchg_relaxed() - atomic compare and exchange with relaxed ordering * @v: pointer to atomic_t * @old: int value to compare with * @new: int value to assign * * If (@v == @old), atomically updates @v to @new with relaxed ordering. * Otherwise, @v is not modified and relaxed ordering is provided. * * Safe to use in noinstr code; prefer atomic_cmpxchg_relaxed() elsewhere. * * Return: The original value of @v. */ static __always_inline int raw_atomic_cmpxchg_relaxed(atomic_t *v, int old, int new) { #if defined(arch_atomic_cmpxchg_relaxed) return arch_atomic_cmpxchg_relaxed(v, old, new); #elif defined(arch_atomic_cmpxchg) return arch_atomic_cmpxchg(v, old, new); #else return raw_cmpxchg_relaxed(&v->counter, old, new); #endif } /** * raw_atomic_try_cmpxchg() - atomic compare and exchange with full ordering * @v: pointer to atomic_t * @old: pointer to int value to compare with * @new: int value to assign * * If (@v == @old), atomically updates @v to @new with full ordering. * Otherwise, @v is not modified, @old is updated to the current value of @v, * and relaxed ordering is provided. * * Safe to use in noinstr code; prefer atomic_try_cmpxchg() elsewhere. * * Return: @true if the exchange occured, @false otherwise. */ static __always_inline bool raw_atomic_try_cmpxchg(atomic_t *v, int *old, int new) { #if defined(arch_atomic_try_cmpxchg) return arch_atomic_try_cmpxchg(v, old, new); #elif defined(arch_atomic_try_cmpxchg_relaxed) bool ret; __atomic_pre_full_fence(); ret = arch_atomic_try_cmpxchg_relaxed(v, old, new); __atomic_post_full_fence(); return ret; #else int r, o = *old; r = raw_atomic_cmpxchg(v, o, new); if (unlikely(r != o)) *old = r; return likely(r == o); #endif } /** * raw_atomic_try_cmpxchg_acquire() - atomic compare and exchange with acquire ordering * @v: pointer to atomic_t * @old: pointer to int value to compare with * @new: int value to assign * * If (@v == @old), atomically updates @v to @new with acquire ordering. * Otherwise, @v is not modified, @old is updated to the current value of @v, * and relaxed ordering is provided. * * Safe to use in noinstr code; prefer atomic_try_cmpxchg_acquire() elsewhere. * * Return: @true if the exchange occured, @false otherwise. */ static __always_inline bool raw_atomic_try_cmpxchg_acquire(atomic_t *v, int *old, int new) { #if defined(arch_atomic_try_cmpxchg_acquire) return arch_atomic_try_cmpxchg_acquire(v, old, new); #elif defined(arch_atomic_try_cmpxchg_relaxed) bool ret = arch_atomic_try_cmpxchg_relaxed(v, old, new); __atomic_acquire_fence(); return ret; #elif defined(arch_atomic_try_cmpxchg) return arch_atomic_try_cmpxchg(v, old, new); #else int r, o = *old; r = raw_atomic_cmpxchg_acquire(v, o, new); if (unlikely(r != o)) *old = r; return likely(r == o); #endif } /** * raw_atomic_try_cmpxchg_release() - atomic compare and exchange with release ordering * @v: pointer to atomic_t * @old: pointer to int value to compare with * @new: int value to assign * * If (@v == @old), atomically updates @v to @new with release ordering. * Otherwise, @v is not modified, @old is updated to the current value of @v, * and relaxed ordering is provided. * * Safe to use in noinstr code; prefer atomic_try_cmpxchg_release() elsewhere. * * Return: @true if the exchange occured, @false otherwise. */ static __always_inline bool raw_atomic_try_cmpxchg_release(atomic_t *v, int *old, int new) { #if defined(arch_atomic_try_cmpxchg_release) return arch_atomic_try_cmpxchg_release(v, old, new); #elif defined(arch_atomic_try_cmpxchg_relaxed) __atomic_release_fence(); return arch_atomic_try_cmpxchg_relaxed(v, old, new); #elif defined(arch_atomic_try_cmpxchg) return arch_atomic_try_cmpxchg(v, old, new); #else int r, o = *old; r = raw_atomic_cmpxchg_release(v, o, new); if (unlikely(r != o)) *old = r; return likely(r == o); #endif } /** * raw_atomic_try_cmpxchg_relaxed() - atomic compare and exchange with relaxed ordering * @v: pointer to atomic_t * @old: pointer to int value to compare with * @new: int value to assign * * If (@v == @old), atomically updates @v to @new with relaxed ordering. * Otherwise, @v is not modified, @old is updated to the current value of @v, * and relaxed ordering is provided. * * Safe to use in noinstr code; prefer atomic_try_cmpxchg_relaxed() elsewhere. * * Return: @true if the exchange occured, @false otherwise. */ static __always_inline bool raw_atomic_try_cmpxchg_relaxed(atomic_t *v, int *old, int new) { #if defined(arch_atomic_try_cmpxchg_relaxed) return arch_atomic_try_cmpxchg_relaxed(v, old, new); #elif defined(arch_atomic_try_cmpxchg) return arch_atomic_try_cmpxchg(v, old, new); #else int r, o = *old; r = raw_atomic_cmpxchg_relaxed(v, o, new); if (unlikely(r != o)) *old = r; return likely(r == o); #endif } /** * raw_atomic_sub_and_test() - atomic subtract and test if zero with full ordering * @i: int value to subtract * @v: pointer to atomic_t * * Atomically updates @v to (@v - @i) with full ordering. * * Safe to use in noinstr code; prefer atomic_sub_and_test() elsewhere. * * Return: @true if the resulting value of @v is zero, @false otherwise. */ static __always_inline bool raw_atomic_sub_and_test(int i, atomic_t *v) { #if defined(arch_atomic_sub_and_test) return arch_atomic_sub_and_test(i, v); #else return raw_atomic_sub_return(i, v) == 0; #endif } /** * raw_atomic_dec_and_test() - atomic decrement and test if zero with full ordering * @v: pointer to atomic_t * * Atomically updates @v to (@v - 1) with full ordering. * * Safe to use in noinstr code; prefer atomic_dec_and_test() elsewhere. * * Return: @true if the resulting value of @v is zero, @false otherwise. */ static __always_inline bool raw_atomic_dec_and_test(atomic_t *v) { #if defined(arch_atomic_dec_and_test) return arch_atomic_dec_and_test(v); #else return raw_atomic_dec_return(v) == 0; #endif } /** * raw_atomic_inc_and_test() - atomic increment and test if zero with full ordering * @v: pointer to atomic_t * * Atomically updates @v to (@v + 1) with full ordering. * * Safe to use in noinstr code; prefer atomic_inc_and_test() elsewhere. * * Return: @true if the resulting value of @v is zero, @false otherwise. */ static __always_inline bool raw_atomic_inc_and_test(atomic_t *v) { #if defined(arch_atomic_inc_and_test) return arch_atomic_inc_and_test(v); #else return raw_atomic_inc_return(v) == 0; #endif } /** * raw_atomic_add_negative() - atomic add and test if negative with full ordering * @i: int value to add * @v: pointer to atomic_t * * Atomically updates @v to (@v + @i) with full ordering. * * Safe to use in noinstr code; prefer atomic_add_negative() elsewhere. * * Return: @true if the resulting value of @v is negative, @false otherwise. */ static __always_inline bool raw_atomic_add_negative(int i, atomic_t *v) { #if defined(arch_atomic_add_negative) return arch_atomic_add_negative(i, v); #elif defined(arch_atomic_add_negative_relaxed) bool ret; __atomic_pre_full_fence(); ret = arch_atomic_add_negative_relaxed(i, v); __atomic_post_full_fence(); return ret; #else return raw_atomic_add_return(i, v) < 0; #endif } /** * raw_atomic_add_negative_acquire() - atomic add and test if negative with acquire ordering * @i: int value to add * @v: pointer to atomic_t * * Atomically updates @v to (@v + @i) with acquire ordering. * * Safe to use in noinstr code; prefer atomic_add_negative_acquire() elsewhere. * * Return: @true if the resulting value of @v is negative, @false otherwise. */ static __always_inline bool raw_atomic_add_negative_acquire(int i, atomic_t *v) { #if defined(arch_atomic_add_negative_acquire) return arch_atomic_add_negative_acquire(i, v); #elif defined(arch_atomic_add_negative_relaxed) bool ret = arch_atomic_add_negative_relaxed(i, v); __atomic_acquire_fence(); return ret; #elif defined(arch_atomic_add_negative) return arch_atomic_add_negative(i, v); #else return raw_atomic_add_return_acquire(i, v) < 0; #endif } /** * raw_atomic_add_negative_release() - atomic add and test if negative with release ordering * @i: int value to add * @v: pointer to atomic_t * * Atomically updates @v to (@v + @i) with release ordering. * * Safe to use in noinstr code; prefer atomic_add_negative_release() elsewhere. * * Return: @true if the resulting value of @v is negative, @false otherwise. */ static __always_inline bool raw_atomic_add_negative_release(int i, atomic_t *v) { #if defined(arch_atomic_add_negative_release) return arch_atomic_add_negative_release(i, v); #elif defined(arch_atomic_add_negative_relaxed) __atomic_release_fence(); return arch_atomic_add_negative_relaxed(i, v); #elif defined(arch_atomic_add_negative) return arch_atomic_add_negative(i, v); #else return raw_atomic_add_return_release(i, v) < 0; #endif } /** * raw_atomic_add_negative_relaxed() - atomic add and test if negative with relaxed ordering * @i: int value to add * @v: pointer to atomic_t * * Atomically updates @v to (@v + @i) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic_add_negative_relaxed() elsewhere. * * Return: @true if the resulting value of @v is negative, @false otherwise. */ static __always_inline bool raw_atomic_add_negative_relaxed(int i, atomic_t *v) { #if defined(arch_atomic_add_negative_relaxed) return arch_atomic_add_negative_relaxed(i, v); #elif defined(arch_atomic_add_negative) return arch_atomic_add_negative(i, v); #else return raw_atomic_add_return_relaxed(i, v) < 0; #endif } /** * raw_atomic_fetch_add_unless() - atomic add unless value with full ordering * @v: pointer to atomic_t * @a: int value to add * @u: int value to compare with * * If (@v != @u), atomically updates @v to (@v + @a) with full ordering. * Otherwise, @v is not modified and relaxed ordering is provided. * * Safe to use in noinstr code; prefer atomic_fetch_add_unless() elsewhere. * * Return: The original value of @v. */ static __always_inline int raw_atomic_fetch_add_unless(atomic_t *v, int a, int u) { #if defined(arch_atomic_fetch_add_unless) return arch_atomic_fetch_add_unless(v, a, u); #else int c = raw_atomic_read(v); do { if (unlikely(c == u)) break; } while (!raw_atomic_try_cmpxchg(v, &c, c + a)); return c; #endif } /** * raw_atomic_add_unless() - atomic add unless value with full ordering * @v: pointer to atomic_t * @a: int value to add * @u: int value to compare with * * If (@v != @u), atomically updates @v to (@v + @a) with full ordering. * Otherwise, @v is not modified and relaxed ordering is provided. * * Safe to use in noinstr code; prefer atomic_add_unless() elsewhere. * * Return: @true if @v was updated, @false otherwise. */ static __always_inline bool raw_atomic_add_unless(atomic_t *v, int a, int u) { #if defined(arch_atomic_add_unless) return arch_atomic_add_unless(v, a, u); #else return raw_atomic_fetch_add_unless(v, a, u) != u; #endif } /** * raw_atomic_inc_not_zero() - atomic increment unless zero with full ordering * @v: pointer to atomic_t * * If (@v != 0), atomically updates @v to (@v + 1) with full ordering. * Otherwise, @v is not modified and relaxed ordering is provided. * * Safe to use in noinstr code; prefer atomic_inc_not_zero() elsewhere. * * Return: @true if @v was updated, @false otherwise. */ static __always_inline bool raw_atomic_inc_not_zero(atomic_t *v) { #if defined(arch_atomic_inc_not_zero) return arch_atomic_inc_not_zero(v); #else return raw_atomic_add_unless(v, 1, 0); #endif } /** * raw_atomic_inc_unless_negative() - atomic increment unless negative with full ordering * @v: pointer to atomic_t * * If (@v >= 0), atomically updates @v to (@v + 1) with full ordering. * Otherwise, @v is not modified and relaxed ordering is provided. * * Safe to use in noinstr code; prefer atomic_inc_unless_negative() elsewhere. * * Return: @true if @v was updated, @false otherwise. */ static __always_inline bool raw_atomic_inc_unless_negative(atomic_t *v) { #if defined(arch_atomic_inc_unless_negative) return arch_atomic_inc_unless_negative(v); #else int c = raw_atomic_read(v); do { if (unlikely(c < 0)) return false; } while (!raw_atomic_try_cmpxchg(v, &c, c + 1)); return true; #endif } /** * raw_atomic_dec_unless_positive() - atomic decrement unless positive with full ordering * @v: pointer to atomic_t * * If (@v <= 0), atomically updates @v to (@v - 1) with full ordering. * Otherwise, @v is not modified and relaxed ordering is provided. * * Safe to use in noinstr code; prefer atomic_dec_unless_positive() elsewhere. * * Return: @true if @v was updated, @false otherwise. */ static __always_inline bool raw_atomic_dec_unless_positive(atomic_t *v) { #if defined(arch_atomic_dec_unless_positive) return arch_atomic_dec_unless_positive(v); #else int c = raw_atomic_read(v); do { if (unlikely(c > 0)) return false; } while (!raw_atomic_try_cmpxchg(v, &c, c - 1)); return true; #endif } /** * raw_atomic_dec_if_positive() - atomic decrement if positive with full ordering * @v: pointer to atomic_t * * If (@v > 0), atomically updates @v to (@v - 1) with full ordering. * Otherwise, @v is not modified and relaxed ordering is provided. * * Safe to use in noinstr code; prefer atomic_dec_if_positive() elsewhere. * * Return: The old value of (@v - 1), regardless of whether @v was updated. */ static __always_inline int raw_atomic_dec_if_positive(atomic_t *v) { #if defined(arch_atomic_dec_if_positive) return arch_atomic_dec_if_positive(v); #else int dec, c = raw_atomic_read(v); do { dec = c - 1; if (unlikely(dec < 0)) break; } while (!raw_atomic_try_cmpxchg(v, &c, dec)); return dec; #endif } #ifdef CONFIG_GENERIC_ATOMIC64 #include <asm-generic/atomic64.h> #endif /** * raw_atomic64_read() - atomic load with relaxed ordering * @v: pointer to atomic64_t * * Atomically loads the value of @v with relaxed ordering. * * Safe to use in noinstr code; prefer atomic64_read() elsewhere. * * Return: The value loaded from @v. */ static __always_inline s64 raw_atomic64_read(const atomic64_t *v) { return arch_atomic64_read(v); } /** * raw_atomic64_read_acquire() - atomic load with acquire ordering * @v: pointer to atomic64_t * * Atomically loads the value of @v with acquire ordering. * * Safe to use in noinstr code; prefer atomic64_read_acquire() elsewhere. * * Return: The value loaded from @v. */ static __always_inline s64 raw_atomic64_read_acquire(const atomic64_t *v) { #if defined(arch_atomic64_read_acquire) return arch_atomic64_read_acquire(v); #else s64 ret; if (__native_word(atomic64_t)) { ret = smp_load_acquire(&(v)->counter); } else { ret = raw_atomic64_read(v); __atomic_acquire_fence(); } return ret; #endif } /** * raw_atomic64_set() - atomic set with relaxed ordering * @v: pointer to atomic64_t * @i: s64 value to assign * * Atomically sets @v to @i with relaxed ordering. * * Safe to use in noinstr code; prefer atomic64_set() elsewhere. * * Return: Nothing. */ static __always_inline void raw_atomic64_set(atomic64_t *v, s64 i) { arch_atomic64_set(v, i); } /** * raw_atomic64_set_release() - atomic set with release ordering * @v: pointer to atomic64_t * @i: s64 value to assign * * Atomically sets @v to @i with release ordering. * * Safe to use in noinstr code; prefer atomic64_set_release() elsewhere. * * Return: Nothing. */ static __always_inline void raw_atomic64_set_release(atomic64_t *v, s64 i) { #if defined(arch_atomic64_set_release) arch_atomic64_set_release(v, i); #else if (__native_word(atomic64_t)) { smp_store_release(&(v)->counter, i); } else { __atomic_release_fence(); raw_atomic64_set(v, i); } #endif } /** * raw_atomic64_add() - atomic add with relaxed ordering * @i: s64 value to add * @v: pointer to atomic64_t * * Atomically updates @v to (@v + @i) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic64_add() elsewhere. * * Return: Nothing. */ static __always_inline void raw_atomic64_add(s64 i, atomic64_t *v) { arch_atomic64_add(i, v); } /** * raw_atomic64_add_return() - atomic add with full ordering * @i: s64 value to add * @v: pointer to atomic64_t * * Atomically updates @v to (@v + @i) with full ordering. * * Safe to use in noinstr code; prefer atomic64_add_return() elsewhere. * * Return: The updated value of @v. */ static __always_inline s64 raw_atomic64_add_return(s64 i, atomic64_t *v) { #if defined(arch_atomic64_add_return) return arch_atomic64_add_return(i, v); #elif defined(arch_atomic64_add_return_relaxed) s64 ret; __atomic_pre_full_fence(); ret = arch_atomic64_add_return_relaxed(i, v); __atomic_post_full_fence(); return ret; #else #error "Unable to define raw_atomic64_add_return" #endif } /** * raw_atomic64_add_return_acquire() - atomic add with acquire ordering * @i: s64 value to add * @v: pointer to atomic64_t * * Atomically updates @v to (@v + @i) with acquire ordering. * * Safe to use in noinstr code; prefer atomic64_add_return_acquire() elsewhere. * * Return: The updated value of @v. */ static __always_inline s64 raw_atomic64_add_return_acquire(s64 i, atomic64_t *v) { #if defined(arch_atomic64_add_return_acquire) return arch_atomic64_add_return_acquire(i, v); #elif defined(arch_atomic64_add_return_relaxed) s64 ret = arch_atomic64_add_return_relaxed(i, v); __atomic_acquire_fence(); return ret; #elif defined(arch_atomic64_add_return) return arch_atomic64_add_return(i, v); #else #error "Unable to define raw_atomic64_add_return_acquire" #endif } /** * raw_atomic64_add_return_release() - atomic add with release ordering * @i: s64 value to add * @v: pointer to atomic64_t * * Atomically updates @v to (@v + @i) with release ordering. * * Safe to use in noinstr code; prefer atomic64_add_return_release() elsewhere. * * Return: The updated value of @v. */ static __always_inline s64 raw_atomic64_add_return_release(s64 i, atomic64_t *v) { #if defined(arch_atomic64_add_return_release) return arch_atomic64_add_return_release(i, v); #elif defined(arch_atomic64_add_return_relaxed) __atomic_release_fence(); return arch_atomic64_add_return_relaxed(i, v); #elif defined(arch_atomic64_add_return) return arch_atomic64_add_return(i, v); #else #error "Unable to define raw_atomic64_add_return_release" #endif } /** * raw_atomic64_add_return_relaxed() - atomic add with relaxed ordering * @i: s64 value to add * @v: pointer to atomic64_t * * Atomically updates @v to (@v + @i) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic64_add_return_relaxed() elsewhere. * * Return: The updated value of @v. */ static __always_inline s64 raw_atomic64_add_return_relaxed(s64 i, atomic64_t *v) { #if defined(arch_atomic64_add_return_relaxed) return arch_atomic64_add_return_relaxed(i, v); #elif defined(arch_atomic64_add_return) return arch_atomic64_add_return(i, v); #else #error "Unable to define raw_atomic64_add_return_relaxed" #endif } /** * raw_atomic64_fetch_add() - atomic add with full ordering * @i: s64 value to add * @v: pointer to atomic64_t * * Atomically updates @v to (@v + @i) with full ordering. * * Safe to use in noinstr code; prefer atomic64_fetch_add() elsewhere. * * Return: The original value of @v. */ static __always_inline s64 raw_atomic64_fetch_add(s64 i, atomic64_t *v) { #if defined(arch_atomic64_fetch_add) return arch_atomic64_fetch_add(i, v); #elif defined(arch_atomic64_fetch_add_relaxed) s64 ret; __atomic_pre_full_fence(); ret = arch_atomic64_fetch_add_relaxed(i, v); __atomic_post_full_fence(); return ret; #else #error "Unable to define raw_atomic64_fetch_add" #endif } /** * raw_atomic64_fetch_add_acquire() - atomic add with acquire ordering * @i: s64 value to add * @v: pointer to atomic64_t * * Atomically updates @v to (@v + @i) with acquire ordering. * * Safe to use in noinstr code; prefer atomic64_fetch_add_acquire() elsewhere. * * Return: The original value of @v. */ static __always_inline s64 raw_atomic64_fetch_add_acquire(s64 i, atomic64_t *v) { #if defined(arch_atomic64_fetch_add_acquire) return arch_atomic64_fetch_add_acquire(i, v); #elif defined(arch_atomic64_fetch_add_relaxed) s64 ret = arch_atomic64_fetch_add_relaxed(i, v); __atomic_acquire_fence(); return ret; #elif defined(arch_atomic64_fetch_add) return arch_atomic64_fetch_add(i, v); #else #error "Unable to define raw_atomic64_fetch_add_acquire" #endif } /** * raw_atomic64_fetch_add_release() - atomic add with release ordering * @i: s64 value to add * @v: pointer to atomic64_t * * Atomically updates @v to (@v + @i) with release ordering. * * Safe to use in noinstr code; prefer atomic64_fetch_add_release() elsewhere. * * Return: The original value of @v. */ static __always_inline s64 raw_atomic64_fetch_add_release(s64 i, atomic64_t *v) { #if defined(arch_atomic64_fetch_add_release) return arch_atomic64_fetch_add_release(i, v); #elif defined(arch_atomic64_fetch_add_relaxed) __atomic_release_fence(); return arch_atomic64_fetch_add_relaxed(i, v); #elif defined(arch_atomic64_fetch_add) return arch_atomic64_fetch_add(i, v); #else #error "Unable to define raw_atomic64_fetch_add_release" #endif } /** * raw_atomic64_fetch_add_relaxed() - atomic add with relaxed ordering * @i: s64 value to add * @v: pointer to atomic64_t * * Atomically updates @v to (@v + @i) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic64_fetch_add_relaxed() elsewhere. * * Return: The original value of @v. */ static __always_inline s64 raw_atomic64_fetch_add_relaxed(s64 i, atomic64_t *v) { #if defined(arch_atomic64_fetch_add_relaxed) return arch_atomic64_fetch_add_relaxed(i, v); #elif defined(arch_atomic64_fetch_add) return arch_atomic64_fetch_add(i, v); #else #error "Unable to define raw_atomic64_fetch_add_relaxed" #endif } /** * raw_atomic64_sub() - atomic subtract with relaxed ordering * @i: s64 value to subtract * @v: pointer to atomic64_t * * Atomically updates @v to (@v - @i) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic64_sub() elsewhere. * * Return: Nothing. */ static __always_inline void raw_atomic64_sub(s64 i, atomic64_t *v) { arch_atomic64_sub(i, v); } /** * raw_atomic64_sub_return() - atomic subtract with full ordering * @i: s64 value to subtract * @v: pointer to atomic64_t * * Atomically updates @v to (@v - @i) with full ordering. * * Safe to use in noinstr code; prefer atomic64_sub_return() elsewhere. * * Return: The updated value of @v. */ static __always_inline s64 raw_atomic64_sub_return(s64 i, atomic64_t *v) { #if defined(arch_atomic64_sub_return) return arch_atomic64_sub_return(i, v); #elif defined(arch_atomic64_sub_return_relaxed) s64 ret; __atomic_pre_full_fence(); ret = arch_atomic64_sub_return_relaxed(i, v); __atomic_post_full_fence(); return ret; #else #error "Unable to define raw_atomic64_sub_return" #endif } /** * raw_atomic64_sub_return_acquire() - atomic subtract with acquire ordering * @i: s64 value to subtract * @v: pointer to atomic64_t * * Atomically updates @v to (@v - @i) with acquire ordering. * * Safe to use in noinstr code; prefer atomic64_sub_return_acquire() elsewhere. * * Return: The updated value of @v. */ static __always_inline s64 raw_atomic64_sub_return_acquire(s64 i, atomic64_t *v) { #if defined(arch_atomic64_sub_return_acquire) return arch_atomic64_sub_return_acquire(i, v); #elif defined(arch_atomic64_sub_return_relaxed) s64 ret = arch_atomic64_sub_return_relaxed(i, v); __atomic_acquire_fence(); return ret; #elif defined(arch_atomic64_sub_return) return arch_atomic64_sub_return(i, v); #else #error "Unable to define raw_atomic64_sub_return_acquire" #endif } /** * raw_atomic64_sub_return_release() - atomic subtract with release ordering * @i: s64 value to subtract * @v: pointer to atomic64_t * * Atomically updates @v to (@v - @i) with release ordering. * * Safe to use in noinstr code; prefer atomic64_sub_return_release() elsewhere. * * Return: The updated value of @v. */ static __always_inline s64 raw_atomic64_sub_return_release(s64 i, atomic64_t *v) { #if defined(arch_atomic64_sub_return_release) return arch_atomic64_sub_return_release(i, v); #elif defined(arch_atomic64_sub_return_relaxed) __atomic_release_fence(); return arch_atomic64_sub_return_relaxed(i, v); #elif defined(arch_atomic64_sub_return) return arch_atomic64_sub_return(i, v); #else #error "Unable to define raw_atomic64_sub_return_release" #endif } /** * raw_atomic64_sub_return_relaxed() - atomic subtract with relaxed ordering * @i: s64 value to subtract * @v: pointer to atomic64_t * * Atomically updates @v to (@v - @i) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic64_sub_return_relaxed() elsewhere. * * Return: The updated value of @v. */ static __always_inline s64 raw_atomic64_sub_return_relaxed(s64 i, atomic64_t *v) { #if defined(arch_atomic64_sub_return_relaxed) return arch_atomic64_sub_return_relaxed(i, v); #elif defined(arch_atomic64_sub_return) return arch_atomic64_sub_return(i, v); #else #error "Unable to define raw_atomic64_sub_return_relaxed" #endif } /** * raw_atomic64_fetch_sub() - atomic subtract with full ordering * @i: s64 value to subtract * @v: pointer to atomic64_t * * Atomically updates @v to (@v - @i) with full ordering. * * Safe to use in noinstr code; prefer atomic64_fetch_sub() elsewhere. * * Return: The original value of @v. */ static __always_inline s64 raw_atomic64_fetch_sub(s64 i, atomic64_t *v) { #if defined(arch_atomic64_fetch_sub) return arch_atomic64_fetch_sub(i, v); #elif defined(arch_atomic64_fetch_sub_relaxed) s64 ret; __atomic_pre_full_fence(); ret = arch_atomic64_fetch_sub_relaxed(i, v); __atomic_post_full_fence(); return ret; #else #error "Unable to define raw_atomic64_fetch_sub" #endif } /** * raw_atomic64_fetch_sub_acquire() - atomic subtract with acquire ordering * @i: s64 value to subtract * @v: pointer to atomic64_t * * Atomically updates @v to (@v - @i) with acquire ordering. * * Safe to use in noinstr code; prefer atomic64_fetch_sub_acquire() elsewhere. * * Return: The original value of @v. */ static __always_inline s64 raw_atomic64_fetch_sub_acquire(s64 i, atomic64_t *v) { #if defined(arch_atomic64_fetch_sub_acquire) return arch_atomic64_fetch_sub_acquire(i, v); #elif defined(arch_atomic64_fetch_sub_relaxed) s64 ret = arch_atomic64_fetch_sub_relaxed(i, v); __atomic_acquire_fence(); return ret; #elif defined(arch_atomic64_fetch_sub) return arch_atomic64_fetch_sub(i, v); #else #error "Unable to define raw_atomic64_fetch_sub_acquire" #endif } /** * raw_atomic64_fetch_sub_release() - atomic subtract with release ordering * @i: s64 value to subtract * @v: pointer to atomic64_t * * Atomically updates @v to (@v - @i) with release ordering. * * Safe to use in noinstr code; prefer atomic64_fetch_sub_release() elsewhere. * * Return: The original value of @v. */ static __always_inline s64 raw_atomic64_fetch_sub_release(s64 i, atomic64_t *v) { #if defined(arch_atomic64_fetch_sub_release) return arch_atomic64_fetch_sub_release(i, v); #elif defined(arch_atomic64_fetch_sub_relaxed) __atomic_release_fence(); return arch_atomic64_fetch_sub_relaxed(i, v); #elif defined(arch_atomic64_fetch_sub) return arch_atomic64_fetch_sub(i, v); #else #error "Unable to define raw_atomic64_fetch_sub_release" #endif } /** * raw_atomic64_fetch_sub_relaxed() - atomic subtract with relaxed ordering * @i: s64 value to subtract * @v: pointer to atomic64_t * * Atomically updates @v to (@v - @i) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic64_fetch_sub_relaxed() elsewhere. * * Return: The original value of @v. */ static __always_inline s64 raw_atomic64_fetch_sub_relaxed(s64 i, atomic64_t *v) { #if defined(arch_atomic64_fetch_sub_relaxed) return arch_atomic64_fetch_sub_relaxed(i, v); #elif defined(arch_atomic64_fetch_sub) return arch_atomic64_fetch_sub(i, v); #else #error "Unable to define raw_atomic64_fetch_sub_relaxed" #endif } /** * raw_atomic64_inc() - atomic increment with relaxed ordering * @v: pointer to atomic64_t * * Atomically updates @v to (@v + 1) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic64_inc() elsewhere. * * Return: Nothing. */ static __always_inline void raw_atomic64_inc(atomic64_t *v) { #if defined(arch_atomic64_inc) arch_atomic64_inc(v); #else raw_atomic64_add(1, v); #endif } /** * raw_atomic64_inc_return() - atomic increment with full ordering * @v: pointer to atomic64_t * * Atomically updates @v to (@v + 1) with full ordering. * * Safe to use in noinstr code; prefer atomic64_inc_return() elsewhere. * * Return: The updated value of @v. */ static __always_inline s64 raw_atomic64_inc_return(atomic64_t *v) { #if defined(arch_atomic64_inc_return) return arch_atomic64_inc_return(v); #elif defined(arch_atomic64_inc_return_relaxed) s64 ret; __atomic_pre_full_fence(); ret = arch_atomic64_inc_return_relaxed(v); __atomic_post_full_fence(); return ret; #else return raw_atomic64_add_return(1, v); #endif } /** * raw_atomic64_inc_return_acquire() - atomic increment with acquire ordering * @v: pointer to atomic64_t * * Atomically updates @v to (@v + 1) with acquire ordering. * * Safe to use in noinstr code; prefer atomic64_inc_return_acquire() elsewhere. * * Return: The updated value of @v. */ static __always_inline s64 raw_atomic64_inc_return_acquire(atomic64_t *v) { #if defined(arch_atomic64_inc_return_acquire) return arch_atomic64_inc_return_acquire(v); #elif defined(arch_atomic64_inc_return_relaxed) s64 ret = arch_atomic64_inc_return_relaxed(v); __atomic_acquire_fence(); return ret; #elif defined(arch_atomic64_inc_return) return arch_atomic64_inc_return(v); #else return raw_atomic64_add_return_acquire(1, v); #endif } /** * raw_atomic64_inc_return_release() - atomic increment with release ordering * @v: pointer to atomic64_t * * Atomically updates @v to (@v + 1) with release ordering. * * Safe to use in noinstr code; prefer atomic64_inc_return_release() elsewhere. * * Return: The updated value of @v. */ static __always_inline s64 raw_atomic64_inc_return_release(atomic64_t *v) { #if defined(arch_atomic64_inc_return_release) return arch_atomic64_inc_return_release(v); #elif defined(arch_atomic64_inc_return_relaxed) __atomic_release_fence(); return arch_atomic64_inc_return_relaxed(v); #elif defined(arch_atomic64_inc_return) return arch_atomic64_inc_return(v); #else return raw_atomic64_add_return_release(1, v); #endif } /** * raw_atomic64_inc_return_relaxed() - atomic increment with relaxed ordering * @v: pointer to atomic64_t * * Atomically updates @v to (@v + 1) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic64_inc_return_relaxed() elsewhere. * * Return: The updated value of @v. */ static __always_inline s64 raw_atomic64_inc_return_relaxed(atomic64_t *v) { #if defined(arch_atomic64_inc_return_relaxed) return arch_atomic64_inc_return_relaxed(v); #elif defined(arch_atomic64_inc_return) return arch_atomic64_inc_return(v); #else return raw_atomic64_add_return_relaxed(1, v); #endif } /** * raw_atomic64_fetch_inc() - atomic increment with full ordering * @v: pointer to atomic64_t * * Atomically updates @v to (@v + 1) with full ordering. * * Safe to use in noinstr code; prefer atomic64_fetch_inc() elsewhere. * * Return: The original value of @v. */ static __always_inline s64 raw_atomic64_fetch_inc(atomic64_t *v) { #if defined(arch_atomic64_fetch_inc) return arch_atomic64_fetch_inc(v); #elif defined(arch_atomic64_fetch_inc_relaxed) s64 ret; __atomic_pre_full_fence(); ret = arch_atomic64_fetch_inc_relaxed(v); __atomic_post_full_fence(); return ret; #else return raw_atomic64_fetch_add(1, v); #endif } /** * raw_atomic64_fetch_inc_acquire() - atomic increment with acquire ordering * @v: pointer to atomic64_t * * Atomically updates @v to (@v + 1) with acquire ordering. * * Safe to use in noinstr code; prefer atomic64_fetch_inc_acquire() elsewhere. * * Return: The original value of @v. */ static __always_inline s64 raw_atomic64_fetch_inc_acquire(atomic64_t *v) { #if defined(arch_atomic64_fetch_inc_acquire) return arch_atomic64_fetch_inc_acquire(v); #elif defined(arch_atomic64_fetch_inc_relaxed) s64 ret = arch_atomic64_fetch_inc_relaxed(v); __atomic_acquire_fence(); return ret; #elif defined(arch_atomic64_fetch_inc) return arch_atomic64_fetch_inc(v); #else return raw_atomic64_fetch_add_acquire(1, v); #endif } /** * raw_atomic64_fetch_inc_release() - atomic increment with release ordering * @v: pointer to atomic64_t * * Atomically updates @v to (@v + 1) with release ordering. * * Safe to use in noinstr code; prefer atomic64_fetch_inc_release() elsewhere. * * Return: The original value of @v. */ static __always_inline s64 raw_atomic64_fetch_inc_release(atomic64_t *v) { #if defined(arch_atomic64_fetch_inc_release) return arch_atomic64_fetch_inc_release(v); #elif defined(arch_atomic64_fetch_inc_relaxed) __atomic_release_fence(); return arch_atomic64_fetch_inc_relaxed(v); #elif defined(arch_atomic64_fetch_inc) return arch_atomic64_fetch_inc(v); #else return raw_atomic64_fetch_add_release(1, v); #endif } /** * raw_atomic64_fetch_inc_relaxed() - atomic increment with relaxed ordering * @v: pointer to atomic64_t * * Atomically updates @v to (@v + 1) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic64_fetch_inc_relaxed() elsewhere. * * Return: The original value of @v. */ static __always_inline s64 raw_atomic64_fetch_inc_relaxed(atomic64_t *v) { #if defined(arch_atomic64_fetch_inc_relaxed) return arch_atomic64_fetch_inc_relaxed(v); #elif defined(arch_atomic64_fetch_inc) return arch_atomic64_fetch_inc(v); #else return raw_atomic64_fetch_add_relaxed(1, v); #endif } /** * raw_atomic64_dec() - atomic decrement with relaxed ordering * @v: pointer to atomic64_t * * Atomically updates @v to (@v - 1) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic64_dec() elsewhere. * * Return: Nothing. */ static __always_inline void raw_atomic64_dec(atomic64_t *v) { #if defined(arch_atomic64_dec) arch_atomic64_dec(v); #else raw_atomic64_sub(1, v); #endif } /** * raw_atomic64_dec_return() - atomic decrement with full ordering * @v: pointer to atomic64_t * * Atomically updates @v to (@v - 1) with full ordering. * * Safe to use in noinstr code; prefer atomic64_dec_return() elsewhere. * * Return: The updated value of @v. */ static __always_inline s64 raw_atomic64_dec_return(atomic64_t *v) { #if defined(arch_atomic64_dec_return) return arch_atomic64_dec_return(v); #elif defined(arch_atomic64_dec_return_relaxed) s64 ret; __atomic_pre_full_fence(); ret = arch_atomic64_dec_return_relaxed(v); __atomic_post_full_fence(); return ret; #else return raw_atomic64_sub_return(1, v); #endif } /** * raw_atomic64_dec_return_acquire() - atomic decrement with acquire ordering * @v: pointer to atomic64_t * * Atomically updates @v to (@v - 1) with acquire ordering. * * Safe to use in noinstr code; prefer atomic64_dec_return_acquire() elsewhere. * * Return: The updated value of @v. */ static __always_inline s64 raw_atomic64_dec_return_acquire(atomic64_t *v) { #if defined(arch_atomic64_dec_return_acquire) return arch_atomic64_dec_return_acquire(v); #elif defined(arch_atomic64_dec_return_relaxed) s64 ret = arch_atomic64_dec_return_relaxed(v); __atomic_acquire_fence(); return ret; #elif defined(arch_atomic64_dec_return) return arch_atomic64_dec_return(v); #else return raw_atomic64_sub_return_acquire(1, v); #endif } /** * raw_atomic64_dec_return_release() - atomic decrement with release ordering * @v: pointer to atomic64_t * * Atomically updates @v to (@v - 1) with release ordering. * * Safe to use in noinstr code; prefer atomic64_dec_return_release() elsewhere. * * Return: The updated value of @v. */ static __always_inline s64 raw_atomic64_dec_return_release(atomic64_t *v) { #if defined(arch_atomic64_dec_return_release) return arch_atomic64_dec_return_release(v); #elif defined(arch_atomic64_dec_return_relaxed) __atomic_release_fence(); return arch_atomic64_dec_return_relaxed(v); #elif defined(arch_atomic64_dec_return) return arch_atomic64_dec_return(v); #else return raw_atomic64_sub_return_release(1, v); #endif } /** * raw_atomic64_dec_return_relaxed() - atomic decrement with relaxed ordering * @v: pointer to atomic64_t * * Atomically updates @v to (@v - 1) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic64_dec_return_relaxed() elsewhere. * * Return: The updated value of @v. */ static __always_inline s64 raw_atomic64_dec_return_relaxed(atomic64_t *v) { #if defined(arch_atomic64_dec_return_relaxed) return arch_atomic64_dec_return_relaxed(v); #elif defined(arch_atomic64_dec_return) return arch_atomic64_dec_return(v); #else return raw_atomic64_sub_return_relaxed(1, v); #endif } /** * raw_atomic64_fetch_dec() - atomic decrement with full ordering * @v: pointer to atomic64_t * * Atomically updates @v to (@v - 1) with full ordering. * * Safe to use in noinstr code; prefer atomic64_fetch_dec() elsewhere. * * Return: The original value of @v. */ static __always_inline s64 raw_atomic64_fetch_dec(atomic64_t *v) { #if defined(arch_atomic64_fetch_dec) return arch_atomic64_fetch_dec(v); #elif defined(arch_atomic64_fetch_dec_relaxed) s64 ret; __atomic_pre_full_fence(); ret = arch_atomic64_fetch_dec_relaxed(v); __atomic_post_full_fence(); return ret; #else return raw_atomic64_fetch_sub(1, v); #endif } /** * raw_atomic64_fetch_dec_acquire() - atomic decrement with acquire ordering * @v: pointer to atomic64_t * * Atomically updates @v to (@v - 1) with acquire ordering. * * Safe to use in noinstr code; prefer atomic64_fetch_dec_acquire() elsewhere. * * Return: The original value of @v. */ static __always_inline s64 raw_atomic64_fetch_dec_acquire(atomic64_t *v) { #if defined(arch_atomic64_fetch_dec_acquire) return arch_atomic64_fetch_dec_acquire(v); #elif defined(arch_atomic64_fetch_dec_relaxed) s64 ret = arch_atomic64_fetch_dec_relaxed(v); __atomic_acquire_fence(); return ret; #elif defined(arch_atomic64_fetch_dec) return arch_atomic64_fetch_dec(v); #else return raw_atomic64_fetch_sub_acquire(1, v); #endif } /** * raw_atomic64_fetch_dec_release() - atomic decrement with release ordering * @v: pointer to atomic64_t * * Atomically updates @v to (@v - 1) with release ordering. * * Safe to use in noinstr code; prefer atomic64_fetch_dec_release() elsewhere. * * Return: The original value of @v. */ static __always_inline s64 raw_atomic64_fetch_dec_release(atomic64_t *v) { #if defined(arch_atomic64_fetch_dec_release) return arch_atomic64_fetch_dec_release(v); #elif defined(arch_atomic64_fetch_dec_relaxed) __atomic_release_fence(); return arch_atomic64_fetch_dec_relaxed(v); #elif defined(arch_atomic64_fetch_dec) return arch_atomic64_fetch_dec(v); #else return raw_atomic64_fetch_sub_release(1, v); #endif } /** * raw_atomic64_fetch_dec_relaxed() - atomic decrement with relaxed ordering * @v: pointer to atomic64_t * * Atomically updates @v to (@v - 1) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic64_fetch_dec_relaxed() elsewhere. * * Return: The original value of @v. */ static __always_inline s64 raw_atomic64_fetch_dec_relaxed(atomic64_t *v) { #if defined(arch_atomic64_fetch_dec_relaxed) return arch_atomic64_fetch_dec_relaxed(v); #elif defined(arch_atomic64_fetch_dec) return arch_atomic64_fetch_dec(v); #else return raw_atomic64_fetch_sub_relaxed(1, v); #endif } /** * raw_atomic64_and() - atomic bitwise AND with relaxed ordering * @i: s64 value * @v: pointer to atomic64_t * * Atomically updates @v to (@v & @i) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic64_and() elsewhere. * * Return: Nothing. */ static __always_inline void raw_atomic64_and(s64 i, atomic64_t *v) { arch_atomic64_and(i, v); } /** * raw_atomic64_fetch_and() - atomic bitwise AND with full ordering * @i: s64 value * @v: pointer to atomic64_t * * Atomically updates @v to (@v & @i) with full ordering. * * Safe to use in noinstr code; prefer atomic64_fetch_and() elsewhere. * * Return: The original value of @v. */ static __always_inline s64 raw_atomic64_fetch_and(s64 i, atomic64_t *v) { #if defined(arch_atomic64_fetch_and) return arch_atomic64_fetch_and(i, v); #elif defined(arch_atomic64_fetch_and_relaxed) s64 ret; __atomic_pre_full_fence(); ret = arch_atomic64_fetch_and_relaxed(i, v); __atomic_post_full_fence(); return ret; #else #error "Unable to define raw_atomic64_fetch_and" #endif } /** * raw_atomic64_fetch_and_acquire() - atomic bitwise AND with acquire ordering * @i: s64 value * @v: pointer to atomic64_t * * Atomically updates @v to (@v & @i) with acquire ordering. * * Safe to use in noinstr code; prefer atomic64_fetch_and_acquire() elsewhere. * * Return: The original value of @v. */ static __always_inline s64 raw_atomic64_fetch_and_acquire(s64 i, atomic64_t *v) { #if defined(arch_atomic64_fetch_and_acquire) return arch_atomic64_fetch_and_acquire(i, v); #elif defined(arch_atomic64_fetch_and_relaxed) s64 ret = arch_atomic64_fetch_and_relaxed(i, v); __atomic_acquire_fence(); return ret; #elif defined(arch_atomic64_fetch_and) return arch_atomic64_fetch_and(i, v); #else #error "Unable to define raw_atomic64_fetch_and_acquire" #endif } /** * raw_atomic64_fetch_and_release() - atomic bitwise AND with release ordering * @i: s64 value * @v: pointer to atomic64_t * * Atomically updates @v to (@v & @i) with release ordering. * * Safe to use in noinstr code; prefer atomic64_fetch_and_release() elsewhere. * * Return: The original value of @v. */ static __always_inline s64 raw_atomic64_fetch_and_release(s64 i, atomic64_t *v) { #if defined(arch_atomic64_fetch_and_release) return arch_atomic64_fetch_and_release(i, v); #elif defined(arch_atomic64_fetch_and_relaxed) __atomic_release_fence(); return arch_atomic64_fetch_and_relaxed(i, v); #elif defined(arch_atomic64_fetch_and) return arch_atomic64_fetch_and(i, v); #else #error "Unable to define raw_atomic64_fetch_and_release" #endif } /** * raw_atomic64_fetch_and_relaxed() - atomic bitwise AND with relaxed ordering * @i: s64 value * @v: pointer to atomic64_t * * Atomically updates @v to (@v & @i) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic64_fetch_and_relaxed() elsewhere. * * Return: The original value of @v. */ static __always_inline s64 raw_atomic64_fetch_and_relaxed(s64 i, atomic64_t *v) { #if defined(arch_atomic64_fetch_and_relaxed) return arch_atomic64_fetch_and_relaxed(i, v); #elif defined(arch_atomic64_fetch_and) return arch_atomic64_fetch_and(i, v); #else #error "Unable to define raw_atomic64_fetch_and_relaxed" #endif } /** * raw_atomic64_andnot() - atomic bitwise AND NOT with relaxed ordering * @i: s64 value * @v: pointer to atomic64_t * * Atomically updates @v to (@v & ~@i) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic64_andnot() elsewhere. * * Return: Nothing. */ static __always_inline void raw_atomic64_andnot(s64 i, atomic64_t *v) { #if defined(arch_atomic64_andnot) arch_atomic64_andnot(i, v); #else raw_atomic64_and(~i, v); #endif } /** * raw_atomic64_fetch_andnot() - atomic bitwise AND NOT with full ordering * @i: s64 value * @v: pointer to atomic64_t * * Atomically updates @v to (@v & ~@i) with full ordering. * * Safe to use in noinstr code; prefer atomic64_fetch_andnot() elsewhere. * * Return: The original value of @v. */ static __always_inline s64 raw_atomic64_fetch_andnot(s64 i, atomic64_t *v) { #if defined(arch_atomic64_fetch_andnot) return arch_atomic64_fetch_andnot(i, v); #elif defined(arch_atomic64_fetch_andnot_relaxed) s64 ret; __atomic_pre_full_fence(); ret = arch_atomic64_fetch_andnot_relaxed(i, v); __atomic_post_full_fence(); return ret; #else return raw_atomic64_fetch_and(~i, v); #endif } /** * raw_atomic64_fetch_andnot_acquire() - atomic bitwise AND NOT with acquire ordering * @i: s64 value * @v: pointer to atomic64_t * * Atomically updates @v to (@v & ~@i) with acquire ordering. * * Safe to use in noinstr code; prefer atomic64_fetch_andnot_acquire() elsewhere. * * Return: The original value of @v. */ static __always_inline s64 raw_atomic64_fetch_andnot_acquire(s64 i, atomic64_t *v) { #if defined(arch_atomic64_fetch_andnot_acquire) return arch_atomic64_fetch_andnot_acquire(i, v); #elif defined(arch_atomic64_fetch_andnot_relaxed) s64 ret = arch_atomic64_fetch_andnot_relaxed(i, v); __atomic_acquire_fence(); return ret; #elif defined(arch_atomic64_fetch_andnot) return arch_atomic64_fetch_andnot(i, v); #else return raw_atomic64_fetch_and_acquire(~i, v); #endif } /** * raw_atomic64_fetch_andnot_release() - atomic bitwise AND NOT with release ordering * @i: s64 value * @v: pointer to atomic64_t * * Atomically updates @v to (@v & ~@i) with release ordering. * * Safe to use in noinstr code; prefer atomic64_fetch_andnot_release() elsewhere. * * Return: The original value of @v. */ static __always_inline s64 raw_atomic64_fetch_andnot_release(s64 i, atomic64_t *v) { #if defined(arch_atomic64_fetch_andnot_release) return arch_atomic64_fetch_andnot_release(i, v); #elif defined(arch_atomic64_fetch_andnot_relaxed) __atomic_release_fence(); return arch_atomic64_fetch_andnot_relaxed(i, v); #elif defined(arch_atomic64_fetch_andnot) return arch_atomic64_fetch_andnot(i, v); #else return raw_atomic64_fetch_and_release(~i, v); #endif } /** * raw_atomic64_fetch_andnot_relaxed() - atomic bitwise AND NOT with relaxed ordering * @i: s64 value * @v: pointer to atomic64_t * * Atomically updates @v to (@v & ~@i) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic64_fetch_andnot_relaxed() elsewhere. * * Return: The original value of @v. */ static __always_inline s64 raw_atomic64_fetch_andnot_relaxed(s64 i, atomic64_t *v) { #if defined(arch_atomic64_fetch_andnot_relaxed) return arch_atomic64_fetch_andnot_relaxed(i, v); #elif defined(arch_atomic64_fetch_andnot) return arch_atomic64_fetch_andnot(i, v); #else return raw_atomic64_fetch_and_relaxed(~i, v); #endif } /** * raw_atomic64_or() - atomic bitwise OR with relaxed ordering * @i: s64 value * @v: pointer to atomic64_t * * Atomically updates @v to (@v | @i) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic64_or() elsewhere. * * Return: Nothing. */ static __always_inline void raw_atomic64_or(s64 i, atomic64_t *v) { arch_atomic64_or(i, v); } /** * raw_atomic64_fetch_or() - atomic bitwise OR with full ordering * @i: s64 value * @v: pointer to atomic64_t * * Atomically updates @v to (@v | @i) with full ordering. * * Safe to use in noinstr code; prefer atomic64_fetch_or() elsewhere. * * Return: The original value of @v. */ static __always_inline s64 raw_atomic64_fetch_or(s64 i, atomic64_t *v) { #if defined(arch_atomic64_fetch_or) return arch_atomic64_fetch_or(i, v); #elif defined(arch_atomic64_fetch_or_relaxed) s64 ret; __atomic_pre_full_fence(); ret = arch_atomic64_fetch_or_relaxed(i, v); __atomic_post_full_fence(); return ret; #else #error "Unable to define raw_atomic64_fetch_or" #endif } /** * raw_atomic64_fetch_or_acquire() - atomic bitwise OR with acquire ordering * @i: s64 value * @v: pointer to atomic64_t * * Atomically updates @v to (@v | @i) with acquire ordering. * * Safe to use in noinstr code; prefer atomic64_fetch_or_acquire() elsewhere. * * Return: The original value of @v. */ static __always_inline s64 raw_atomic64_fetch_or_acquire(s64 i, atomic64_t *v) { #if defined(arch_atomic64_fetch_or_acquire) return arch_atomic64_fetch_or_acquire(i, v); #elif defined(arch_atomic64_fetch_or_relaxed) s64 ret = arch_atomic64_fetch_or_relaxed(i, v); __atomic_acquire_fence(); return ret; #elif defined(arch_atomic64_fetch_or) return arch_atomic64_fetch_or(i, v); #else #error "Unable to define raw_atomic64_fetch_or_acquire" #endif } /** * raw_atomic64_fetch_or_release() - atomic bitwise OR with release ordering * @i: s64 value * @v: pointer to atomic64_t * * Atomically updates @v to (@v | @i) with release ordering. * * Safe to use in noinstr code; prefer atomic64_fetch_or_release() elsewhere. * * Return: The original value of @v. */ static __always_inline s64 raw_atomic64_fetch_or_release(s64 i, atomic64_t *v) { #if defined(arch_atomic64_fetch_or_release) return arch_atomic64_fetch_or_release(i, v); #elif defined(arch_atomic64_fetch_or_relaxed) __atomic_release_fence(); return arch_atomic64_fetch_or_relaxed(i, v); #elif defined(arch_atomic64_fetch_or) return arch_atomic64_fetch_or(i, v); #else #error "Unable to define raw_atomic64_fetch_or_release" #endif } /** * raw_atomic64_fetch_or_relaxed() - atomic bitwise OR with relaxed ordering * @i: s64 value * @v: pointer to atomic64_t * * Atomically updates @v to (@v | @i) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic64_fetch_or_relaxed() elsewhere. * * Return: The original value of @v. */ static __always_inline s64 raw_atomic64_fetch_or_relaxed(s64 i, atomic64_t *v) { #if defined(arch_atomic64_fetch_or_relaxed) return arch_atomic64_fetch_or_relaxed(i, v); #elif defined(arch_atomic64_fetch_or) return arch_atomic64_fetch_or(i, v); #else #error "Unable to define raw_atomic64_fetch_or_relaxed" #endif } /** * raw_atomic64_xor() - atomic bitwise XOR with relaxed ordering * @i: s64 value * @v: pointer to atomic64_t * * Atomically updates @v to (@v ^ @i) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic64_xor() elsewhere. * * Return: Nothing. */ static __always_inline void raw_atomic64_xor(s64 i, atomic64_t *v) { arch_atomic64_xor(i, v); } /** * raw_atomic64_fetch_xor() - atomic bitwise XOR with full ordering * @i: s64 value * @v: pointer to atomic64_t * * Atomically updates @v to (@v ^ @i) with full ordering. * * Safe to use in noinstr code; prefer atomic64_fetch_xor() elsewhere. * * Return: The original value of @v. */ static __always_inline s64 raw_atomic64_fetch_xor(s64 i, atomic64_t *v) { #if defined(arch_atomic64_fetch_xor) return arch_atomic64_fetch_xor(i, v); #elif defined(arch_atomic64_fetch_xor_relaxed) s64 ret; __atomic_pre_full_fence(); ret = arch_atomic64_fetch_xor_relaxed(i, v); __atomic_post_full_fence(); return ret; #else #error "Unable to define raw_atomic64_fetch_xor" #endif } /** * raw_atomic64_fetch_xor_acquire() - atomic bitwise XOR with acquire ordering * @i: s64 value * @v: pointer to atomic64_t * * Atomically updates @v to (@v ^ @i) with acquire ordering. * * Safe to use in noinstr code; prefer atomic64_fetch_xor_acquire() elsewhere. * * Return: The original value of @v. */ static __always_inline s64 raw_atomic64_fetch_xor_acquire(s64 i, atomic64_t *v) { #if defined(arch_atomic64_fetch_xor_acquire) return arch_atomic64_fetch_xor_acquire(i, v); #elif defined(arch_atomic64_fetch_xor_relaxed) s64 ret = arch_atomic64_fetch_xor_relaxed(i, v); __atomic_acquire_fence(); return ret; #elif defined(arch_atomic64_fetch_xor) return arch_atomic64_fetch_xor(i, v); #else #error "Unable to define raw_atomic64_fetch_xor_acquire" #endif } /** * raw_atomic64_fetch_xor_release() - atomic bitwise XOR with release ordering * @i: s64 value * @v: pointer to atomic64_t * * Atomically updates @v to (@v ^ @i) with release ordering. * * Safe to use in noinstr code; prefer atomic64_fetch_xor_release() elsewhere. * * Return: The original value of @v. */ static __always_inline s64 raw_atomic64_fetch_xor_release(s64 i, atomic64_t *v) { #if defined(arch_atomic64_fetch_xor_release) return arch_atomic64_fetch_xor_release(i, v); #elif defined(arch_atomic64_fetch_xor_relaxed) __atomic_release_fence(); return arch_atomic64_fetch_xor_relaxed(i, v); #elif defined(arch_atomic64_fetch_xor) return arch_atomic64_fetch_xor(i, v); #else #error "Unable to define raw_atomic64_fetch_xor_release" #endif } /** * raw_atomic64_fetch_xor_relaxed() - atomic bitwise XOR with relaxed ordering * @i: s64 value * @v: pointer to atomic64_t * * Atomically updates @v to (@v ^ @i) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic64_fetch_xor_relaxed() elsewhere. * * Return: The original value of @v. */ static __always_inline s64 raw_atomic64_fetch_xor_relaxed(s64 i, atomic64_t *v) { #if defined(arch_atomic64_fetch_xor_relaxed) return arch_atomic64_fetch_xor_relaxed(i, v); #elif defined(arch_atomic64_fetch_xor) return arch_atomic64_fetch_xor(i, v); #else #error "Unable to define raw_atomic64_fetch_xor_relaxed" #endif } /** * raw_atomic64_xchg() - atomic exchange with full ordering * @v: pointer to atomic64_t * @new: s64 value to assign * * Atomically updates @v to @new with full ordering. * * Safe to use in noinstr code; prefer atomic64_xchg() elsewhere. * * Return: The original value of @v. */ static __always_inline s64 raw_atomic64_xchg(atomic64_t *v, s64 new) { #if defined(arch_atomic64_xchg) return arch_atomic64_xchg(v, new); #elif defined(arch_atomic64_xchg_relaxed) s64 ret; __atomic_pre_full_fence(); ret = arch_atomic64_xchg_relaxed(v, new); __atomic_post_full_fence(); return ret; #else return raw_xchg(&v->counter, new); #endif } /** * raw_atomic64_xchg_acquire() - atomic exchange with acquire ordering * @v: pointer to atomic64_t * @new: s64 value to assign * * Atomically updates @v to @new with acquire ordering. * * Safe to use in noinstr code; prefer atomic64_xchg_acquire() elsewhere. * * Return: The original value of @v. */ static __always_inline s64 raw_atomic64_xchg_acquire(atomic64_t *v, s64 new) { #if defined(arch_atomic64_xchg_acquire) return arch_atomic64_xchg_acquire(v, new); #elif defined(arch_atomic64_xchg_relaxed) s64 ret = arch_atomic64_xchg_relaxed(v, new); __atomic_acquire_fence(); return ret; #elif defined(arch_atomic64_xchg) return arch_atomic64_xchg(v, new); #else return raw_xchg_acquire(&v->counter, new); #endif } /** * raw_atomic64_xchg_release() - atomic exchange with release ordering * @v: pointer to atomic64_t * @new: s64 value to assign * * Atomically updates @v to @new with release ordering. * * Safe to use in noinstr code; prefer atomic64_xchg_release() elsewhere. * * Return: The original value of @v. */ static __always_inline s64 raw_atomic64_xchg_release(atomic64_t *v, s64 new) { #if defined(arch_atomic64_xchg_release) return arch_atomic64_xchg_release(v, new); #elif defined(arch_atomic64_xchg_relaxed) __atomic_release_fence(); return arch_atomic64_xchg_relaxed(v, new); #elif defined(arch_atomic64_xchg) return arch_atomic64_xchg(v, new); #else return raw_xchg_release(&v->counter, new); #endif } /** * raw_atomic64_xchg_relaxed() - atomic exchange with relaxed ordering * @v: pointer to atomic64_t * @new: s64 value to assign * * Atomically updates @v to @new with relaxed ordering. * * Safe to use in noinstr code; prefer atomic64_xchg_relaxed() elsewhere. * * Return: The original value of @v. */ static __always_inline s64 raw_atomic64_xchg_relaxed(atomic64_t *v, s64 new) { #if defined(arch_atomic64_xchg_relaxed) return arch_atomic64_xchg_relaxed(v, new); #elif defined(arch_atomic64_xchg) return arch_atomic64_xchg(v, new); #else return raw_xchg_relaxed(&v->counter, new); #endif } /** * raw_atomic64_cmpxchg() - atomic compare and exchange with full ordering * @v: pointer to atomic64_t * @old: s64 value to compare with * @new: s64 value to assign * * If (@v == @old), atomically updates @v to @new with full ordering. * Otherwise, @v is not modified and relaxed ordering is provided. * * Safe to use in noinstr code; prefer atomic64_cmpxchg() elsewhere. * * Return: The original value of @v. */ static __always_inline s64 raw_atomic64_cmpxchg(atomic64_t *v, s64 old, s64 new) { #if defined(arch_atomic64_cmpxchg) return arch_atomic64_cmpxchg(v, old, new); #elif defined(arch_atomic64_cmpxchg_relaxed) s64 ret; __atomic_pre_full_fence(); ret = arch_atomic64_cmpxchg_relaxed(v, old, new); __atomic_post_full_fence(); return ret; #else return raw_cmpxchg(&v->counter, old, new); #endif } /** * raw_atomic64_cmpxchg_acquire() - atomic compare and exchange with acquire ordering * @v: pointer to atomic64_t * @old: s64 value to compare with * @new: s64 value to assign * * If (@v == @old), atomically updates @v to @new with acquire ordering. * Otherwise, @v is not modified and relaxed ordering is provided. * * Safe to use in noinstr code; prefer atomic64_cmpxchg_acquire() elsewhere. * * Return: The original value of @v. */ static __always_inline s64 raw_atomic64_cmpxchg_acquire(atomic64_t *v, s64 old, s64 new) { #if defined(arch_atomic64_cmpxchg_acquire) return arch_atomic64_cmpxchg_acquire(v, old, new); #elif defined(arch_atomic64_cmpxchg_relaxed) s64 ret = arch_atomic64_cmpxchg_relaxed(v, old, new); __atomic_acquire_fence(); return ret; #elif defined(arch_atomic64_cmpxchg) return arch_atomic64_cmpxchg(v, old, new); #else return raw_cmpxchg_acquire(&v->counter, old, new); #endif } /** * raw_atomic64_cmpxchg_release() - atomic compare and exchange with release ordering * @v: pointer to atomic64_t * @old: s64 value to compare with * @new: s64 value to assign * * If (@v == @old), atomically updates @v to @new with release ordering. * Otherwise, @v is not modified and relaxed ordering is provided. * * Safe to use in noinstr code; prefer atomic64_cmpxchg_release() elsewhere. * * Return: The original value of @v. */ static __always_inline s64 raw_atomic64_cmpxchg_release(atomic64_t *v, s64 old, s64 new) { #if defined(arch_atomic64_cmpxchg_release) return arch_atomic64_cmpxchg_release(v, old, new); #elif defined(arch_atomic64_cmpxchg_relaxed) __atomic_release_fence(); return arch_atomic64_cmpxchg_relaxed(v, old, new); #elif defined(arch_atomic64_cmpxchg) return arch_atomic64_cmpxchg(v, old, new); #else return raw_cmpxchg_release(&v->counter, old, new); #endif } /** * raw_atomic64_cmpxchg_relaxed() - atomic compare and exchange with relaxed ordering * @v: pointer to atomic64_t * @old: s64 value to compare with * @new: s64 value to assign * * If (@v == @old), atomically updates @v to @new with relaxed ordering. * Otherwise, @v is not modified and relaxed ordering is provided. * * Safe to use in noinstr code; prefer atomic64_cmpxchg_relaxed() elsewhere. * * Return: The original value of @v. */ static __always_inline s64 raw_atomic64_cmpxchg_relaxed(atomic64_t *v, s64 old, s64 new) { #if defined(arch_atomic64_cmpxchg_relaxed) return arch_atomic64_cmpxchg_relaxed(v, old, new); #elif defined(arch_atomic64_cmpxchg) return arch_atomic64_cmpxchg(v, old, new); #else return raw_cmpxchg_relaxed(&v->counter, old, new); #endif } /** * raw_atomic64_try_cmpxchg() - atomic compare and exchange with full ordering * @v: pointer to atomic64_t * @old: pointer to s64 value to compare with * @new: s64 value to assign * * If (@v == @old), atomically updates @v to @new with full ordering. * Otherwise, @v is not modified, @old is updated to the current value of @v, * and relaxed ordering is provided. * * Safe to use in noinstr code; prefer atomic64_try_cmpxchg() elsewhere. * * Return: @true if the exchange occured, @false otherwise. */ static __always_inline bool raw_atomic64_try_cmpxchg(atomic64_t *v, s64 *old, s64 new) { #if defined(arch_atomic64_try_cmpxchg) return arch_atomic64_try_cmpxchg(v, old, new); #elif defined(arch_atomic64_try_cmpxchg_relaxed) bool ret; __atomic_pre_full_fence(); ret = arch_atomic64_try_cmpxchg_relaxed(v, old, new); __atomic_post_full_fence(); return ret; #else s64 r, o = *old; r = raw_atomic64_cmpxchg(v, o, new); if (unlikely(r != o)) *old = r; return likely(r == o); #endif } /** * raw_atomic64_try_cmpxchg_acquire() - atomic compare and exchange with acquire ordering * @v: pointer to atomic64_t * @old: pointer to s64 value to compare with * @new: s64 value to assign * * If (@v == @old), atomically updates @v to @new with acquire ordering. * Otherwise, @v is not modified, @old is updated to the current value of @v, * and relaxed ordering is provided. * * Safe to use in noinstr code; prefer atomic64_try_cmpxchg_acquire() elsewhere. * * Return: @true if the exchange occured, @false otherwise. */ static __always_inline bool raw_atomic64_try_cmpxchg_acquire(atomic64_t *v, s64 *old, s64 new) { #if defined(arch_atomic64_try_cmpxchg_acquire) return arch_atomic64_try_cmpxchg_acquire(v, old, new); #elif defined(arch_atomic64_try_cmpxchg_relaxed) bool ret = arch_atomic64_try_cmpxchg_relaxed(v, old, new); __atomic_acquire_fence(); return ret; #elif defined(arch_atomic64_try_cmpxchg) return arch_atomic64_try_cmpxchg(v, old, new); #else s64 r, o = *old; r = raw_atomic64_cmpxchg_acquire(v, o, new); if (unlikely(r != o)) *old = r; return likely(r == o); #endif } /** * raw_atomic64_try_cmpxchg_release() - atomic compare and exchange with release ordering * @v: pointer to atomic64_t * @old: pointer to s64 value to compare with * @new: s64 value to assign * * If (@v == @old), atomically updates @v to @new with release ordering. * Otherwise, @v is not modified, @old is updated to the current value of @v, * and relaxed ordering is provided. * * Safe to use in noinstr code; prefer atomic64_try_cmpxchg_release() elsewhere. * * Return: @true if the exchange occured, @false otherwise. */ static __always_inline bool raw_atomic64_try_cmpxchg_release(atomic64_t *v, s64 *old, s64 new) { #if defined(arch_atomic64_try_cmpxchg_release) return arch_atomic64_try_cmpxchg_release(v, old, new); #elif defined(arch_atomic64_try_cmpxchg_relaxed) __atomic_release_fence(); return arch_atomic64_try_cmpxchg_relaxed(v, old, new); #elif defined(arch_atomic64_try_cmpxchg) return arch_atomic64_try_cmpxchg(v, old, new); #else s64 r, o = *old; r = raw_atomic64_cmpxchg_release(v, o, new); if (unlikely(r != o)) *old = r; return likely(r == o); #endif } /** * raw_atomic64_try_cmpxchg_relaxed() - atomic compare and exchange with relaxed ordering * @v: pointer to atomic64_t * @old: pointer to s64 value to compare with * @new: s64 value to assign * * If (@v == @old), atomically updates @v to @new with relaxed ordering. * Otherwise, @v is not modified, @old is updated to the current value of @v, * and relaxed ordering is provided. * * Safe to use in noinstr code; prefer atomic64_try_cmpxchg_relaxed() elsewhere. * * Return: @true if the exchange occured, @false otherwise. */ static __always_inline bool raw_atomic64_try_cmpxchg_relaxed(atomic64_t *v, s64 *old, s64 new) { #if defined(arch_atomic64_try_cmpxchg_relaxed) return arch_atomic64_try_cmpxchg_relaxed(v, old, new); #elif defined(arch_atomic64_try_cmpxchg) return arch_atomic64_try_cmpxchg(v, old, new); #else s64 r, o = *old; r = raw_atomic64_cmpxchg_relaxed(v, o, new); if (unlikely(r != o)) *old = r; return likely(r == o); #endif } /** * raw_atomic64_sub_and_test() - atomic subtract and test if zero with full ordering * @i: s64 value to subtract * @v: pointer to atomic64_t * * Atomically updates @v to (@v - @i) with full ordering. * * Safe to use in noinstr code; prefer atomic64_sub_and_test() elsewhere. * * Return: @true if the resulting value of @v is zero, @false otherwise. */ static __always_inline bool raw_atomic64_sub_and_test(s64 i, atomic64_t *v) { #if defined(arch_atomic64_sub_and_test) return arch_atomic64_sub_and_test(i, v); #else return raw_atomic64_sub_return(i, v) == 0; #endif } /** * raw_atomic64_dec_and_test() - atomic decrement and test if zero with full ordering * @v: pointer to atomic64_t * * Atomically updates @v to (@v - 1) with full ordering. * * Safe to use in noinstr code; prefer atomic64_dec_and_test() elsewhere. * * Return: @true if the resulting value of @v is zero, @false otherwise. */ static __always_inline bool raw_atomic64_dec_and_test(atomic64_t *v) { #if defined(arch_atomic64_dec_and_test) return arch_atomic64_dec_and_test(v); #else return raw_atomic64_dec_return(v) == 0; #endif } /** * raw_atomic64_inc_and_test() - atomic increment and test if zero with full ordering * @v: pointer to atomic64_t * * Atomically updates @v to (@v + 1) with full ordering. * * Safe to use in noinstr code; prefer atomic64_inc_and_test() elsewhere. * * Return: @true if the resulting value of @v is zero, @false otherwise. */ static __always_inline bool raw_atomic64_inc_and_test(atomic64_t *v) { #if defined(arch_atomic64_inc_and_test) return arch_atomic64_inc_and_test(v); #else return raw_atomic64_inc_return(v) == 0; #endif } /** * raw_atomic64_add_negative() - atomic add and test if negative with full ordering * @i: s64 value to add * @v: pointer to atomic64_t * * Atomically updates @v to (@v + @i) with full ordering. * * Safe to use in noinstr code; prefer atomic64_add_negative() elsewhere. * * Return: @true if the resulting value of @v is negative, @false otherwise. */ static __always_inline bool raw_atomic64_add_negative(s64 i, atomic64_t *v) { #if defined(arch_atomic64_add_negative) return arch_atomic64_add_negative(i, v); #elif defined(arch_atomic64_add_negative_relaxed) bool ret; __atomic_pre_full_fence(); ret = arch_atomic64_add_negative_relaxed(i, v); __atomic_post_full_fence(); return ret; #else return raw_atomic64_add_return(i, v) < 0; #endif } /** * raw_atomic64_add_negative_acquire() - atomic add and test if negative with acquire ordering * @i: s64 value to add * @v: pointer to atomic64_t * * Atomically updates @v to (@v + @i) with acquire ordering. * * Safe to use in noinstr code; prefer atomic64_add_negative_acquire() elsewhere. * * Return: @true if the resulting value of @v is negative, @false otherwise. */ static __always_inline bool raw_atomic64_add_negative_acquire(s64 i, atomic64_t *v) { #if defined(arch_atomic64_add_negative_acquire) return arch_atomic64_add_negative_acquire(i, v); #elif defined(arch_atomic64_add_negative_relaxed) bool ret = arch_atomic64_add_negative_relaxed(i, v); __atomic_acquire_fence(); return ret; #elif defined(arch_atomic64_add_negative) return arch_atomic64_add_negative(i, v); #else return raw_atomic64_add_return_acquire(i, v) < 0; #endif } /** * raw_atomic64_add_negative_release() - atomic add and test if negative with release ordering * @i: s64 value to add * @v: pointer to atomic64_t * * Atomically updates @v to (@v + @i) with release ordering. * * Safe to use in noinstr code; prefer atomic64_add_negative_release() elsewhere. * * Return: @true if the resulting value of @v is negative, @false otherwise. */ static __always_inline bool raw_atomic64_add_negative_release(s64 i, atomic64_t *v) { #if defined(arch_atomic64_add_negative_release) return arch_atomic64_add_negative_release(i, v); #elif defined(arch_atomic64_add_negative_relaxed) __atomic_release_fence(); return arch_atomic64_add_negative_relaxed(i, v); #elif defined(arch_atomic64_add_negative) return arch_atomic64_add_negative(i, v); #else return raw_atomic64_add_return_release(i, v) < 0; #endif } /** * raw_atomic64_add_negative_relaxed() - atomic add and test if negative with relaxed ordering * @i: s64 value to add * @v: pointer to atomic64_t * * Atomically updates @v to (@v + @i) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic64_add_negative_relaxed() elsewhere. * * Return: @true if the resulting value of @v is negative, @false otherwise. */ static __always_inline bool raw_atomic64_add_negative_relaxed(s64 i, atomic64_t *v) { #if defined(arch_atomic64_add_negative_relaxed) return arch_atomic64_add_negative_relaxed(i, v); #elif defined(arch_atomic64_add_negative) return arch_atomic64_add_negative(i, v); #else return raw_atomic64_add_return_relaxed(i, v) < 0; #endif } /** * raw_atomic64_fetch_add_unless() - atomic add unless value with full ordering * @v: pointer to atomic64_t * @a: s64 value to add * @u: s64 value to compare with * * If (@v != @u), atomically updates @v to (@v + @a) with full ordering. * Otherwise, @v is not modified and relaxed ordering is provided. * * Safe to use in noinstr code; prefer atomic64_fetch_add_unless() elsewhere. * * Return: The original value of @v. */ static __always_inline s64 raw_atomic64_fetch_add_unless(atomic64_t *v, s64 a, s64 u) { #if defined(arch_atomic64_fetch_add_unless) return arch_atomic64_fetch_add_unless(v, a, u); #else s64 c = raw_atomic64_read(v); do { if (unlikely(c == u)) break; } while (!raw_atomic64_try_cmpxchg(v, &c, c + a)); return c; #endif } /** * raw_atomic64_add_unless() - atomic add unless value with full ordering * @v: pointer to atomic64_t * @a: s64 value to add * @u: s64 value to compare with * * If (@v != @u), atomically updates @v to (@v + @a) with full ordering. * Otherwise, @v is not modified and relaxed ordering is provided. * * Safe to use in noinstr code; prefer atomic64_add_unless() elsewhere. * * Return: @true if @v was updated, @false otherwise. */ static __always_inline bool raw_atomic64_add_unless(atomic64_t *v, s64 a, s64 u) { #if defined(arch_atomic64_add_unless) return arch_atomic64_add_unless(v, a, u); #else return raw_atomic64_fetch_add_unless(v, a, u) != u; #endif } /** * raw_atomic64_inc_not_zero() - atomic increment unless zero with full ordering * @v: pointer to atomic64_t * * If (@v != 0), atomically updates @v to (@v + 1) with full ordering. * Otherwise, @v is not modified and relaxed ordering is provided. * * Safe to use in noinstr code; prefer atomic64_inc_not_zero() elsewhere. * * Return: @true if @v was updated, @false otherwise. */ static __always_inline bool raw_atomic64_inc_not_zero(atomic64_t *v) { #if defined(arch_atomic64_inc_not_zero) return arch_atomic64_inc_not_zero(v); #else return raw_atomic64_add_unless(v, 1, 0); #endif } /** * raw_atomic64_inc_unless_negative() - atomic increment unless negative with full ordering * @v: pointer to atomic64_t * * If (@v >= 0), atomically updates @v to (@v + 1) with full ordering. * Otherwise, @v is not modified and relaxed ordering is provided. * * Safe to use in noinstr code; prefer atomic64_inc_unless_negative() elsewhere. * * Return: @true if @v was updated, @false otherwise. */ static __always_inline bool raw_atomic64_inc_unless_negative(atomic64_t *v) { #if defined(arch_atomic64_inc_unless_negative) return arch_atomic64_inc_unless_negative(v); #else s64 c = raw_atomic64_read(v); do { if (unlikely(c < 0)) return false; } while (!raw_atomic64_try_cmpxchg(v, &c, c + 1)); return true; #endif } /** * raw_atomic64_dec_unless_positive() - atomic decrement unless positive with full ordering * @v: pointer to atomic64_t * * If (@v <= 0), atomically updates @v to (@v - 1) with full ordering. * Otherwise, @v is not modified and relaxed ordering is provided. * * Safe to use in noinstr code; prefer atomic64_dec_unless_positive() elsewhere. * * Return: @true if @v was updated, @false otherwise. */ static __always_inline bool raw_atomic64_dec_unless_positive(atomic64_t *v) { #if defined(arch_atomic64_dec_unless_positive) return arch_atomic64_dec_unless_positive(v); #else s64 c = raw_atomic64_read(v); do { if (unlikely(c > 0)) return false; } while (!raw_atomic64_try_cmpxchg(v, &c, c - 1)); return true; #endif } /** * raw_atomic64_dec_if_positive() - atomic decrement if positive with full ordering * @v: pointer to atomic64_t * * If (@v > 0), atomically updates @v to (@v - 1) with full ordering. * Otherwise, @v is not modified and relaxed ordering is provided. * * Safe to use in noinstr code; prefer atomic64_dec_if_positive() elsewhere. * * Return: The old value of (@v - 1), regardless of whether @v was updated. */ static __always_inline s64 raw_atomic64_dec_if_positive(atomic64_t *v) { #if defined(arch_atomic64_dec_if_positive) return arch_atomic64_dec_if_positive(v); #else s64 dec, c = raw_atomic64_read(v); do { dec = c - 1; if (unlikely(dec < 0)) break; } while (!raw_atomic64_try_cmpxchg(v, &c, dec)); return dec; #endif } #endif /* _LINUX_ATOMIC_FALLBACK_H */ // b565db590afeeff0d7c9485ccbca5bb6e155749f
5 5 2 3 5 4 2 3 1 1 1 17 17 17 17 6 6 6 6 6 6 6 27 27 27 27 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 // SPDX-License-Identifier: GPL-2.0 /* * linux/fs/hpfs/dnode.c * * Mikulas Patocka (mikulas@artax.karlin.mff.cuni.cz), 1998-1999 * * handling directory dnode tree - adding, deleteing & searching for dirents */ #include "hpfs_fn.h" static loff_t get_pos(struct dnode *d, struct hpfs_dirent *fde) { struct hpfs_dirent *de; struct hpfs_dirent *de_end = dnode_end_de(d); int i = 1; for (de = dnode_first_de(d); de < de_end; de = de_next_de(de)) { if (de == fde) return ((loff_t) le32_to_cpu(d->self) << 4) | (loff_t)i; i++; } pr_info("%s(): not_found\n", __func__); return ((loff_t)le32_to_cpu(d->self) << 4) | (loff_t)1; } int hpfs_add_pos(struct inode *inode, loff_t *pos) { struct hpfs_inode_info *hpfs_inode = hpfs_i(inode); int i = 0; loff_t **ppos; if (hpfs_inode->i_rddir_off) for (; hpfs_inode->i_rddir_off[i]; i++) if (hpfs_inode->i_rddir_off[i] == pos) return 0; if (!(i&0x0f)) { ppos = kmalloc_array(i + 0x11, sizeof(loff_t *), GFP_NOFS); if (!ppos) { pr_err("out of memory for position list\n"); return -ENOMEM; } if (hpfs_inode->i_rddir_off) { memcpy(ppos, hpfs_inode->i_rddir_off, i * sizeof(loff_t)); kfree(hpfs_inode->i_rddir_off); } hpfs_inode->i_rddir_off = ppos; } hpfs_inode->i_rddir_off[i] = pos; hpfs_inode->i_rddir_off[i + 1] = NULL; return 0; } void hpfs_del_pos(struct inode *inode, loff_t *pos) { struct hpfs_inode_info *hpfs_inode = hpfs_i(inode); loff_t **i, **j; if (!hpfs_inode->i_rddir_off) goto not_f; for (i = hpfs_inode->i_rddir_off; *i; i++) if (*i == pos) goto fnd; goto not_f; fnd: for (j = i + 1; *j; j++) ; *i = *(j - 1); *(j - 1) = NULL; if (j - 1 == hpfs_inode->i_rddir_off) { kfree(hpfs_inode->i_rddir_off); hpfs_inode->i_rddir_off = NULL; } return; not_f: /*pr_warn("position pointer %p->%08x not found\n", pos, (int)*pos);*/ return; } static void for_all_poss(struct inode *inode, void (*f)(loff_t *, loff_t, loff_t), loff_t p1, loff_t p2) { struct hpfs_inode_info *hpfs_inode = hpfs_i(inode); loff_t **i; if (!hpfs_inode->i_rddir_off) return; for (i = hpfs_inode->i_rddir_off; *i; i++) (*f)(*i, p1, p2); return; } static void hpfs_pos_subst(loff_t *p, loff_t f, loff_t t) { if (*p == f) *p = t; } /*void hpfs_hpfs_pos_substd(loff_t *p, loff_t f, loff_t t) { if ((*p & ~0x3f) == (f & ~0x3f)) *p = (t & ~0x3f) | (*p & 0x3f); }*/ static void hpfs_pos_ins(loff_t *p, loff_t d, loff_t c) { if ((*p & ~0x3f) == (d & ~0x3f) && (*p & 0x3f) >= (d & 0x3f)) { int n = (*p & 0x3f) + c; if (n > 0x3f) pr_err("%s(): %08x + %d\n", __func__, (int)*p, (int)c >> 8); else *p = (*p & ~0x3f) | n; } } static void hpfs_pos_del(loff_t *p, loff_t d, loff_t c) { if ((*p & ~0x3f) == (d & ~0x3f) && (*p & 0x3f) >= (d & 0x3f)) { int n = (*p & 0x3f) - c; if (n < 1) pr_err("%s(): %08x - %d\n", __func__, (int)*p, (int)c >> 8); else *p = (*p & ~0x3f) | n; } } static struct hpfs_dirent *dnode_pre_last_de(struct dnode *d) { struct hpfs_dirent *de, *de_end, *dee = NULL, *deee = NULL; de_end = dnode_end_de(d); for (de = dnode_first_de(d); de < de_end; de = de_next_de(de)) { deee = dee; dee = de; } return deee; } static struct hpfs_dirent *dnode_last_de(struct dnode *d) { struct hpfs_dirent *de, *de_end, *dee = NULL; de_end = dnode_end_de(d); for (de = dnode_first_de(d); de < de_end; de = de_next_de(de)) { dee = de; } return dee; } static void set_last_pointer(struct super_block *s, struct dnode *d, dnode_secno ptr) { struct hpfs_dirent *de; if (!(de = dnode_last_de(d))) { hpfs_error(s, "set_last_pointer: empty dnode %08x", le32_to_cpu(d->self)); return; } if (hpfs_sb(s)->sb_chk) { if (de->down) { hpfs_error(s, "set_last_pointer: dnode %08x has already last pointer %08x", le32_to_cpu(d->self), de_down_pointer(de)); return; } if (le16_to_cpu(de->length) != 32) { hpfs_error(s, "set_last_pointer: bad last dirent in dnode %08x", le32_to_cpu(d->self)); return; } } if (ptr) { le32_add_cpu(&d->first_free, 4); if (le32_to_cpu(d->first_free) > 2048) { hpfs_error(s, "set_last_pointer: too long dnode %08x", le32_to_cpu(d->self)); le32_add_cpu(&d->first_free, -4); return; } de->length = cpu_to_le16(36); de->down = 1; *(__le32 *)((char *)de + 32) = cpu_to_le32(ptr); } } /* Add an entry to dnode and don't care if it grows over 2048 bytes */ struct hpfs_dirent *hpfs_add_de(struct super_block *s, struct dnode *d, const unsigned char *name, unsigned namelen, secno down_ptr) { struct hpfs_dirent *de; struct hpfs_dirent *de_end = dnode_end_de(d); unsigned d_size = de_size(namelen, down_ptr); for (de = dnode_first_de(d); de < de_end; de = de_next_de(de)) { int c = hpfs_compare_names(s, name, namelen, de->name, de->namelen, de->last); if (!c) { hpfs_error(s, "name (%c,%d) already exists in dnode %08x", *name, namelen, le32_to_cpu(d->self)); return NULL; } if (c < 0) break; } memmove((char *)de + d_size, de, (char *)de_end - (char *)de); memset(de, 0, d_size); if (down_ptr) { *(__le32 *)((char *)de + d_size - 4) = cpu_to_le32(down_ptr); de->down = 1; } de->length = cpu_to_le16(d_size); de->not_8x3 = hpfs_is_name_long(name, namelen); de->namelen = namelen; memcpy(de->name, name, namelen); le32_add_cpu(&d->first_free, d_size); return de; } /* Delete dirent and don't care about its subtree */ static void hpfs_delete_de(struct super_block *s, struct dnode *d, struct hpfs_dirent *de) { if (de->last) { hpfs_error(s, "attempt to delete last dirent in dnode %08x", le32_to_cpu(d->self)); return; } d->first_free = cpu_to_le32(le32_to_cpu(d->first_free) - le16_to_cpu(de->length)); memmove(de, de_next_de(de), le32_to_cpu(d->first_free) + (char *)d - (char *)de); } static void fix_up_ptrs(struct super_block *s, struct dnode *d) { struct hpfs_dirent *de; struct hpfs_dirent *de_end = dnode_end_de(d); dnode_secno dno = le32_to_cpu(d->self); for (de = dnode_first_de(d); de < de_end; de = de_next_de(de)) if (de->down) { struct quad_buffer_head qbh; struct dnode *dd; if ((dd = hpfs_map_dnode(s, de_down_pointer(de), &qbh))) { if (le32_to_cpu(dd->up) != dno || dd->root_dnode) { dd->up = cpu_to_le32(dno); dd->root_dnode = 0; hpfs_mark_4buffers_dirty(&qbh); } hpfs_brelse4(&qbh); } } } /* Add an entry to dnode and do dnode splitting if required */ static int hpfs_add_to_dnode(struct inode *i, dnode_secno dno, const unsigned char *name, unsigned namelen, struct hpfs_dirent *new_de, dnode_secno down_ptr) { struct quad_buffer_head qbh, qbh1, qbh2; struct dnode *d, *ad, *rd, *nd = NULL; dnode_secno adno, rdno; struct hpfs_dirent *de; struct hpfs_dirent nde; unsigned char *nname; int h; int pos; struct buffer_head *bh; struct fnode *fnode; int c1, c2 = 0; if (!(nname = kmalloc(256, GFP_NOFS))) { pr_err("out of memory, can't add to dnode\n"); return 1; } go_up: if (namelen >= 256) { hpfs_error(i->i_sb, "%s(): namelen == %d", __func__, namelen); kfree(nd); kfree(nname); return 1; } if (!(d = hpfs_map_dnode(i->i_sb, dno, &qbh))) { kfree(nd); kfree(nname); return 1; } go_up_a: if (hpfs_sb(i->i_sb)->sb_chk) if (hpfs_stop_cycles(i->i_sb, dno, &c1, &c2, "hpfs_add_to_dnode")) { hpfs_brelse4(&qbh); kfree(nd); kfree(nname); return 1; } if (le32_to_cpu(d->first_free) + de_size(namelen, down_ptr) <= 2048) { loff_t t; copy_de(de=hpfs_add_de(i->i_sb, d, name, namelen, down_ptr), new_de); t = get_pos(d, de); for_all_poss(i, hpfs_pos_ins, t, 1); for_all_poss(i, hpfs_pos_subst, 4, t); for_all_poss(i, hpfs_pos_subst, 5, t + 1); hpfs_mark_4buffers_dirty(&qbh); hpfs_brelse4(&qbh); kfree(nd); kfree(nname); return 0; } if (!nd) if (!(nd = kmalloc(0x924, GFP_NOFS))) { /* 0x924 is a max size of dnode after adding a dirent with max name length. We alloc this only once. There must not be any error while splitting dnodes, otherwise the whole directory, not only file we're adding, would be lost. */ pr_err("out of memory for dnode splitting\n"); hpfs_brelse4(&qbh); kfree(nname); return 1; } memcpy(nd, d, le32_to_cpu(d->first_free)); copy_de(de = hpfs_add_de(i->i_sb, nd, name, namelen, down_ptr), new_de); for_all_poss(i, hpfs_pos_ins, get_pos(nd, de), 1); h = ((char *)dnode_last_de(nd) - (char *)nd) / 2 + 10; if (!(ad = hpfs_alloc_dnode(i->i_sb, le32_to_cpu(d->up), &adno, &qbh1))) { hpfs_error(i->i_sb, "unable to alloc dnode - dnode tree will be corrupted"); hpfs_brelse4(&qbh); kfree(nd); kfree(nname); return 1; } i->i_size += 2048; i->i_blocks += 4; pos = 1; for (de = dnode_first_de(nd); (char *)de_next_de(de) - (char *)nd < h; de = de_next_de(de)) { copy_de(hpfs_add_de(i->i_sb, ad, de->name, de->namelen, de->down ? de_down_pointer(de) : 0), de); for_all_poss(i, hpfs_pos_subst, ((loff_t)dno << 4) | pos, ((loff_t)adno << 4) | pos); pos++; } copy_de(new_de = &nde, de); memcpy(nname, de->name, de->namelen); name = nname; namelen = de->namelen; for_all_poss(i, hpfs_pos_subst, ((loff_t)dno << 4) | pos, 4); down_ptr = adno; set_last_pointer(i->i_sb, ad, de->down ? de_down_pointer(de) : 0); de = de_next_de(de); memmove((char *)nd + 20, de, le32_to_cpu(nd->first_free) + (char *)nd - (char *)de); le32_add_cpu(&nd->first_free, -((char *)de - (char *)nd - 20)); memcpy(d, nd, le32_to_cpu(nd->first_free)); for_all_poss(i, hpfs_pos_del, (loff_t)dno << 4, pos); fix_up_ptrs(i->i_sb, ad); if (!d->root_dnode) { ad->up = d->up; dno = le32_to_cpu(ad->up); hpfs_mark_4buffers_dirty(&qbh); hpfs_brelse4(&qbh); hpfs_mark_4buffers_dirty(&qbh1); hpfs_brelse4(&qbh1); goto go_up; } if (!(rd = hpfs_alloc_dnode(i->i_sb, le32_to_cpu(d->up), &rdno, &qbh2))) { hpfs_error(i->i_sb, "unable to alloc dnode - dnode tree will be corrupted"); hpfs_brelse4(&qbh); hpfs_brelse4(&qbh1); kfree(nd); kfree(nname); return 1; } i->i_size += 2048; i->i_blocks += 4; rd->root_dnode = 1; rd->up = d->up; if (!(fnode = hpfs_map_fnode(i->i_sb, le32_to_cpu(d->up), &bh))) { hpfs_free_dnode(i->i_sb, rdno); hpfs_brelse4(&qbh); hpfs_brelse4(&qbh1); hpfs_brelse4(&qbh2); kfree(nd); kfree(nname); return 1; } fnode->u.external[0].disk_secno = cpu_to_le32(rdno); mark_buffer_dirty(bh); brelse(bh); hpfs_i(i)->i_dno = rdno; d->up = ad->up = cpu_to_le32(rdno); d->root_dnode = ad->root_dnode = 0; hpfs_mark_4buffers_dirty(&qbh); hpfs_brelse4(&qbh); hpfs_mark_4buffers_dirty(&qbh1); hpfs_brelse4(&qbh1); qbh = qbh2; set_last_pointer(i->i_sb, rd, dno); dno = rdno; d = rd; goto go_up_a; } /* * Add an entry to directory btree. * I hate such crazy directory structure. * It's easy to read but terrible to write. * I wrote this directory code 4 times. * I hope, now it's finally bug-free. */ int hpfs_add_dirent(struct inode *i, const unsigned char *name, unsigned namelen, struct hpfs_dirent *new_de) { struct hpfs_inode_info *hpfs_inode = hpfs_i(i); struct dnode *d; struct hpfs_dirent *de, *de_end; struct quad_buffer_head qbh; dnode_secno dno; int c; int c1, c2 = 0; dno = hpfs_inode->i_dno; down: if (hpfs_sb(i->i_sb)->sb_chk) if (hpfs_stop_cycles(i->i_sb, dno, &c1, &c2, "hpfs_add_dirent")) return 1; if (!(d = hpfs_map_dnode(i->i_sb, dno, &qbh))) return 1; de_end = dnode_end_de(d); for (de = dnode_first_de(d); de < de_end; de = de_next_de(de)) { if (!(c = hpfs_compare_names(i->i_sb, name, namelen, de->name, de->namelen, de->last))) { hpfs_brelse4(&qbh); return -1; } if (c < 0) { if (de->down) { dno = de_down_pointer(de); hpfs_brelse4(&qbh); goto down; } break; } } hpfs_brelse4(&qbh); if (hpfs_check_free_dnodes(i->i_sb, FREE_DNODES_ADD)) { c = 1; goto ret; } c = hpfs_add_to_dnode(i, dno, name, namelen, new_de, 0); ret: return c; } /* * Find dirent with higher name in 'from' subtree and move it to 'to' dnode. * Return the dnode we moved from (to be checked later if it's empty) */ static secno move_to_top(struct inode *i, dnode_secno from, dnode_secno to) { dnode_secno dno, ddno; dnode_secno chk_up = to; struct dnode *dnode; struct quad_buffer_head qbh; struct hpfs_dirent *de, *nde; int a; loff_t t; int c1, c2 = 0; dno = from; while (1) { if (hpfs_sb(i->i_sb)->sb_chk) if (hpfs_stop_cycles(i->i_sb, dno, &c1, &c2, "move_to_top")) return 0; if (!(dnode = hpfs_map_dnode(i->i_sb, dno, &qbh))) return 0; if (hpfs_sb(i->i_sb)->sb_chk) { if (le32_to_cpu(dnode->up) != chk_up) { hpfs_error(i->i_sb, "move_to_top: up pointer from %08x should be %08x, is %08x", dno, chk_up, le32_to_cpu(dnode->up)); hpfs_brelse4(&qbh); return 0; } chk_up = dno; } if (!(de = dnode_last_de(dnode))) { hpfs_error(i->i_sb, "move_to_top: dnode %08x has no last de", dno); hpfs_brelse4(&qbh); return 0; } if (!de->down) break; dno = de_down_pointer(de); hpfs_brelse4(&qbh); } while (!(de = dnode_pre_last_de(dnode))) { dnode_secno up = le32_to_cpu(dnode->up); hpfs_brelse4(&qbh); hpfs_free_dnode(i->i_sb, dno); i->i_size -= 2048; i->i_blocks -= 4; for_all_poss(i, hpfs_pos_subst, ((loff_t)dno << 4) | 1, 5); if (up == to) return to; if (!(dnode = hpfs_map_dnode(i->i_sb, up, &qbh))) return 0; if (dnode->root_dnode) { hpfs_error(i->i_sb, "move_to_top: got to root_dnode while moving from %08x to %08x", from, to); hpfs_brelse4(&qbh); return 0; } de = dnode_last_de(dnode); if (!de || !de->down) { hpfs_error(i->i_sb, "move_to_top: dnode %08x doesn't point down to %08x", up, dno); hpfs_brelse4(&qbh); return 0; } le32_add_cpu(&dnode->first_free, -4); le16_add_cpu(&de->length, -4); de->down = 0; hpfs_mark_4buffers_dirty(&qbh); dno = up; } t = get_pos(dnode, de); for_all_poss(i, hpfs_pos_subst, t, 4); for_all_poss(i, hpfs_pos_subst, t + 1, 5); if (!(nde = kmalloc(le16_to_cpu(de->length), GFP_NOFS))) { hpfs_error(i->i_sb, "out of memory for dirent - directory will be corrupted"); hpfs_brelse4(&qbh); return 0; } memcpy(nde, de, le16_to_cpu(de->length)); ddno = de->down ? de_down_pointer(de) : 0; hpfs_delete_de(i->i_sb, dnode, de); set_last_pointer(i->i_sb, dnode, ddno); hpfs_mark_4buffers_dirty(&qbh); hpfs_brelse4(&qbh); a = hpfs_add_to_dnode(i, to, nde->name, nde->namelen, nde, from); kfree(nde); if (a) return 0; return dno; } /* * Check if a dnode is empty and delete it from the tree * (chkdsk doesn't like empty dnodes) */ static void delete_empty_dnode(struct inode *i, dnode_secno dno) { struct hpfs_inode_info *hpfs_inode = hpfs_i(i); struct quad_buffer_head qbh; struct dnode *dnode; dnode_secno down, up, ndown; int p; struct hpfs_dirent *de; int c1, c2 = 0; try_it_again: if (hpfs_stop_cycles(i->i_sb, dno, &c1, &c2, "delete_empty_dnode")) return; if (!(dnode = hpfs_map_dnode(i->i_sb, dno, &qbh))) return; if (le32_to_cpu(dnode->first_free) > 56) goto end; if (le32_to_cpu(dnode->first_free) == 52 || le32_to_cpu(dnode->first_free) == 56) { struct hpfs_dirent *de_end; int root = dnode->root_dnode; up = le32_to_cpu(dnode->up); de = dnode_first_de(dnode); down = de->down ? de_down_pointer(de) : 0; if (hpfs_sb(i->i_sb)->sb_chk) if (root && !down) { hpfs_error(i->i_sb, "delete_empty_dnode: root dnode %08x is empty", dno); goto end; } hpfs_brelse4(&qbh); hpfs_free_dnode(i->i_sb, dno); i->i_size -= 2048; i->i_blocks -= 4; if (root) { struct fnode *fnode; struct buffer_head *bh; struct dnode *d1; struct quad_buffer_head qbh1; if (hpfs_sb(i->i_sb)->sb_chk) if (up != i->i_ino) { hpfs_error(i->i_sb, "bad pointer to fnode, dnode %08x, pointing to %08x, should be %08lx", dno, up, (unsigned long)i->i_ino); return; } if ((d1 = hpfs_map_dnode(i->i_sb, down, &qbh1))) { d1->up = cpu_to_le32(up); d1->root_dnode = 1; hpfs_mark_4buffers_dirty(&qbh1); hpfs_brelse4(&qbh1); } if ((fnode = hpfs_map_fnode(i->i_sb, up, &bh))) { fnode->u.external[0].disk_secno = cpu_to_le32(down); mark_buffer_dirty(bh); brelse(bh); } hpfs_inode->i_dno = down; for_all_poss(i, hpfs_pos_subst, ((loff_t)dno << 4) | 1, (loff_t) 12); return; } if (!(dnode = hpfs_map_dnode(i->i_sb, up, &qbh))) return; p = 1; de_end = dnode_end_de(dnode); for (de = dnode_first_de(dnode); de < de_end; de = de_next_de(de), p++) if (de->down) if (de_down_pointer(de) == dno) goto fnd; hpfs_error(i->i_sb, "delete_empty_dnode: pointer to dnode %08x not found in dnode %08x", dno, up); goto end; fnd: for_all_poss(i, hpfs_pos_subst, ((loff_t)dno << 4) | 1, ((loff_t)up << 4) | p); if (!down) { de->down = 0; le16_add_cpu(&de->length, -4); le32_add_cpu(&dnode->first_free, -4); memmove(de_next_de(de), (char *)de_next_de(de) + 4, (char *)dnode + le32_to_cpu(dnode->first_free) - (char *)de_next_de(de)); } else { struct dnode *d1; struct quad_buffer_head qbh1; *(dnode_secno *) ((void *) de + le16_to_cpu(de->length) - 4) = down; if ((d1 = hpfs_map_dnode(i->i_sb, down, &qbh1))) { d1->up = cpu_to_le32(up); hpfs_mark_4buffers_dirty(&qbh1); hpfs_brelse4(&qbh1); } } } else { hpfs_error(i->i_sb, "delete_empty_dnode: dnode %08x, first_free == %03x", dno, le32_to_cpu(dnode->first_free)); goto end; } if (!de->last) { struct hpfs_dirent *de_next = de_next_de(de); struct hpfs_dirent *de_cp; struct dnode *d1; struct quad_buffer_head qbh1; if (!de_next->down) goto endm; ndown = de_down_pointer(de_next); if (!(de_cp = kmalloc(le16_to_cpu(de->length), GFP_NOFS))) { pr_err("out of memory for dtree balancing\n"); goto endm; } memcpy(de_cp, de, le16_to_cpu(de->length)); hpfs_delete_de(i->i_sb, dnode, de); hpfs_mark_4buffers_dirty(&qbh); hpfs_brelse4(&qbh); for_all_poss(i, hpfs_pos_subst, ((loff_t)up << 4) | p, 4); for_all_poss(i, hpfs_pos_del, ((loff_t)up << 4) | p, 1); if (de_cp->down) if ((d1 = hpfs_map_dnode(i->i_sb, de_down_pointer(de_cp), &qbh1))) { d1->up = cpu_to_le32(ndown); hpfs_mark_4buffers_dirty(&qbh1); hpfs_brelse4(&qbh1); } hpfs_add_to_dnode(i, ndown, de_cp->name, de_cp->namelen, de_cp, de_cp->down ? de_down_pointer(de_cp) : 0); /*pr_info("UP-TO-DNODE: %08x (ndown = %08x, down = %08x, dno = %08x)\n", up, ndown, down, dno);*/ dno = up; kfree(de_cp); goto try_it_again; } else { struct hpfs_dirent *de_prev = dnode_pre_last_de(dnode); struct hpfs_dirent *de_cp; struct dnode *d1; struct quad_buffer_head qbh1; dnode_secno dlp; if (!de_prev) { hpfs_error(i->i_sb, "delete_empty_dnode: empty dnode %08x", up); hpfs_mark_4buffers_dirty(&qbh); hpfs_brelse4(&qbh); dno = up; goto try_it_again; } if (!de_prev->down) goto endm; ndown = de_down_pointer(de_prev); if ((d1 = hpfs_map_dnode(i->i_sb, ndown, &qbh1))) { struct hpfs_dirent *del = dnode_last_de(d1); dlp = del->down ? de_down_pointer(del) : 0; if (!dlp && down) { if (le32_to_cpu(d1->first_free) > 2044) { if (hpfs_sb(i->i_sb)->sb_chk >= 2) { pr_err("unbalanced dnode tree, see hpfs.txt 4 more info\n"); pr_err("terminating balancing operation\n"); } hpfs_brelse4(&qbh1); goto endm; } if (hpfs_sb(i->i_sb)->sb_chk >= 2) { pr_err("unbalanced dnode tree, see hpfs.txt 4 more info\n"); pr_err("goin'on\n"); } le16_add_cpu(&del->length, 4); del->down = 1; le32_add_cpu(&d1->first_free, 4); } if (dlp && !down) { le16_add_cpu(&del->length, -4); del->down = 0; le32_add_cpu(&d1->first_free, -4); } else if (down) *(__le32 *) ((void *) del + le16_to_cpu(del->length) - 4) = cpu_to_le32(down); } else goto endm; if (!(de_cp = kmalloc(le16_to_cpu(de_prev->length), GFP_NOFS))) { pr_err("out of memory for dtree balancing\n"); hpfs_brelse4(&qbh1); goto endm; } hpfs_mark_4buffers_dirty(&qbh1); hpfs_brelse4(&qbh1); memcpy(de_cp, de_prev, le16_to_cpu(de_prev->length)); hpfs_delete_de(i->i_sb, dnode, de_prev); if (!de_prev->down) { le16_add_cpu(&de_prev->length, 4); de_prev->down = 1; le32_add_cpu(&dnode->first_free, 4); } *(__le32 *) ((void *) de_prev + le16_to_cpu(de_prev->length) - 4) = cpu_to_le32(ndown); hpfs_mark_4buffers_dirty(&qbh); hpfs_brelse4(&qbh); for_all_poss(i, hpfs_pos_subst, ((loff_t)up << 4) | (p - 1), 4); for_all_poss(i, hpfs_pos_subst, ((loff_t)up << 4) | p, ((loff_t)up << 4) | (p - 1)); if (down) if ((d1 = hpfs_map_dnode(i->i_sb, de_down_pointer(de), &qbh1))) { d1->up = cpu_to_le32(ndown); hpfs_mark_4buffers_dirty(&qbh1); hpfs_brelse4(&qbh1); } hpfs_add_to_dnode(i, ndown, de_cp->name, de_cp->namelen, de_cp, dlp); dno = up; kfree(de_cp); goto try_it_again; } endm: hpfs_mark_4buffers_dirty(&qbh); end: hpfs_brelse4(&qbh); } /* Delete dirent from directory */ int hpfs_remove_dirent(struct inode *i, dnode_secno dno, struct hpfs_dirent *de, struct quad_buffer_head *qbh, int depth) { struct dnode *dnode = qbh->data; dnode_secno down = 0; loff_t t; if (de->first || de->last) { hpfs_error(i->i_sb, "hpfs_remove_dirent: attempt to delete first or last dirent in dnode %08x", dno); hpfs_brelse4(qbh); return 1; } if (de->down) down = de_down_pointer(de); if (depth && (de->down || (de == dnode_first_de(dnode) && de_next_de(de)->last))) { if (hpfs_check_free_dnodes(i->i_sb, FREE_DNODES_DEL)) { hpfs_brelse4(qbh); return 2; } } for_all_poss(i, hpfs_pos_del, (t = get_pos(dnode, de)) + 1, 1); hpfs_delete_de(i->i_sb, dnode, de); hpfs_mark_4buffers_dirty(qbh); hpfs_brelse4(qbh); if (down) { dnode_secno a = move_to_top(i, down, dno); for_all_poss(i, hpfs_pos_subst, 5, t); if (a) delete_empty_dnode(i, a); return !a; } delete_empty_dnode(i, dno); return 0; } void hpfs_count_dnodes(struct super_block *s, dnode_secno dno, int *n_dnodes, int *n_subdirs, int *n_items) { struct dnode *dnode; struct quad_buffer_head qbh; struct hpfs_dirent *de; dnode_secno ptr, odno = 0; int c1, c2 = 0; int d1, d2 = 0; go_down: if (n_dnodes) (*n_dnodes)++; if (hpfs_sb(s)->sb_chk) if (hpfs_stop_cycles(s, dno, &c1, &c2, "hpfs_count_dnodes #1")) return; ptr = 0; go_up: if (!(dnode = hpfs_map_dnode(s, dno, &qbh))) return; if (hpfs_sb(s)->sb_chk) if (odno && odno != -1 && le32_to_cpu(dnode->up) != odno) hpfs_error(s, "hpfs_count_dnodes: bad up pointer; dnode %08x, down %08x points to %08x", odno, dno, le32_to_cpu(dnode->up)); de = dnode_first_de(dnode); if (ptr) while(1) { if (de->down) if (de_down_pointer(de) == ptr) goto process_de; if (de->last) { hpfs_brelse4(&qbh); hpfs_error(s, "hpfs_count_dnodes: pointer to dnode %08x not found in dnode %08x, got here from %08x", ptr, dno, odno); return; } de = de_next_de(de); } next_de: if (de->down) { odno = dno; dno = de_down_pointer(de); hpfs_brelse4(&qbh); goto go_down; } process_de: if (!de->first && !de->last && de->directory && n_subdirs) (*n_subdirs)++; if (!de->first && !de->last && n_items) (*n_items)++; if ((de = de_next_de(de)) < dnode_end_de(dnode)) goto next_de; ptr = dno; dno = le32_to_cpu(dnode->up); if (dnode->root_dnode) { hpfs_brelse4(&qbh); return; } hpfs_brelse4(&qbh); if (hpfs_sb(s)->sb_chk) if (hpfs_stop_cycles(s, ptr, &d1, &d2, "hpfs_count_dnodes #2")) return; odno = -1; goto go_up; } static struct hpfs_dirent *map_nth_dirent(struct super_block *s, dnode_secno dno, int n, struct quad_buffer_head *qbh, struct dnode **dn) { int i; struct hpfs_dirent *de, *de_end; struct dnode *dnode; dnode = hpfs_map_dnode(s, dno, qbh); if (!dnode) return NULL; if (dn) *dn=dnode; de = dnode_first_de(dnode); de_end = dnode_end_de(dnode); for (i = 1; de < de_end; i++, de = de_next_de(de)) { if (i == n) { return de; } if (de->last) break; } hpfs_brelse4(qbh); hpfs_error(s, "map_nth_dirent: n too high; dnode = %08x, requested %08x", dno, n); return NULL; } dnode_secno hpfs_de_as_down_as_possible(struct super_block *s, dnode_secno dno) { struct quad_buffer_head qbh; dnode_secno d = dno; dnode_secno up = 0; struct hpfs_dirent *de; int c1, c2 = 0; again: if (hpfs_sb(s)->sb_chk) if (hpfs_stop_cycles(s, d, &c1, &c2, "hpfs_de_as_down_as_possible")) return d; if (!(de = map_nth_dirent(s, d, 1, &qbh, NULL))) return dno; if (hpfs_sb(s)->sb_chk) if (up && le32_to_cpu(((struct dnode *)qbh.data)->up) != up) hpfs_error(s, "hpfs_de_as_down_as_possible: bad up pointer; dnode %08x, down %08x points to %08x", up, d, le32_to_cpu(((struct dnode *)qbh.data)->up)); if (!de->down) { hpfs_brelse4(&qbh); return d; } up = d; d = de_down_pointer(de); hpfs_brelse4(&qbh); goto again; } struct hpfs_dirent *map_pos_dirent(struct inode *inode, loff_t *posp, struct quad_buffer_head *qbh) { loff_t pos; unsigned c; dnode_secno dno; struct hpfs_dirent *de, *d; struct hpfs_dirent *up_de; struct hpfs_dirent *end_up_de; struct dnode *dnode; struct dnode *up_dnode; struct quad_buffer_head qbh0; pos = *posp; dno = pos >> 6 << 2; pos &= 077; if (!(de = map_nth_dirent(inode->i_sb, dno, pos, qbh, &dnode))) goto bail; /* Going to the next dirent */ if ((d = de_next_de(de)) < dnode_end_de(dnode)) { if (!(++*posp & 077)) { hpfs_error(inode->i_sb, "map_pos_dirent: pos crossed dnode boundary; pos = %08llx", (unsigned long long)*posp); goto bail; } /* We're going down the tree */ if (d->down) { *posp = ((loff_t) hpfs_de_as_down_as_possible(inode->i_sb, de_down_pointer(d)) << 4) + 1; } return de; } /* Going up */ if (dnode->root_dnode) goto bail; if (!(up_dnode = hpfs_map_dnode(inode->i_sb, le32_to_cpu(dnode->up), &qbh0))) goto bail; end_up_de = dnode_end_de(up_dnode); c = 0; for (up_de = dnode_first_de(up_dnode); up_de < end_up_de; up_de = de_next_de(up_de)) { if (!(++c & 077)) hpfs_error(inode->i_sb, "map_pos_dirent: pos crossed dnode boundary; dnode = %08x", le32_to_cpu(dnode->up)); if (up_de->down && de_down_pointer(up_de) == dno) { *posp = ((loff_t) le32_to_cpu(dnode->up) << 4) + c; hpfs_brelse4(&qbh0); return de; } } hpfs_error(inode->i_sb, "map_pos_dirent: pointer to dnode %08x not found in parent dnode %08x", dno, le32_to_cpu(dnode->up)); hpfs_brelse4(&qbh0); bail: *posp = 12; return de; } /* Find a dirent in tree */ struct hpfs_dirent *map_dirent(struct inode *inode, dnode_secno dno, const unsigned char *name, unsigned len, dnode_secno *dd, struct quad_buffer_head *qbh) { struct dnode *dnode; struct hpfs_dirent *de; struct hpfs_dirent *de_end; int c1, c2 = 0; if (!S_ISDIR(inode->i_mode)) hpfs_error(inode->i_sb, "map_dirent: not a directory\n"); again: if (hpfs_sb(inode->i_sb)->sb_chk) if (hpfs_stop_cycles(inode->i_sb, dno, &c1, &c2, "map_dirent")) return NULL; if (!(dnode = hpfs_map_dnode(inode->i_sb, dno, qbh))) return NULL; de_end = dnode_end_de(dnode); for (de = dnode_first_de(dnode); de < de_end; de = de_next_de(de)) { int t = hpfs_compare_names(inode->i_sb, name, len, de->name, de->namelen, de->last); if (!t) { if (dd) *dd = dno; return de; } if (t < 0) { if (de->down) { dno = de_down_pointer(de); hpfs_brelse4(qbh); goto again; } break; } } hpfs_brelse4(qbh); return NULL; } /* * Remove empty directory. In normal cases it is only one dnode with two * entries, but we must handle also such obscure cases when it's a tree * of empty dnodes. */ void hpfs_remove_dtree(struct super_block *s, dnode_secno dno) { struct quad_buffer_head qbh; struct dnode *dnode; struct hpfs_dirent *de; dnode_secno d1, d2, rdno = dno; while (1) { if (!(dnode = hpfs_map_dnode(s, dno, &qbh))) return; de = dnode_first_de(dnode); if (de->last) { if (de->down) d1 = de_down_pointer(de); else goto error; hpfs_brelse4(&qbh); hpfs_free_dnode(s, dno); dno = d1; } else break; } if (!de->first) goto error; d1 = de->down ? de_down_pointer(de) : 0; de = de_next_de(de); if (!de->last) goto error; d2 = de->down ? de_down_pointer(de) : 0; hpfs_brelse4(&qbh); hpfs_free_dnode(s, dno); do { while (d1) { if (!(dnode = hpfs_map_dnode(s, dno = d1, &qbh))) return; de = dnode_first_de(dnode); if (!de->last) goto error; d1 = de->down ? de_down_pointer(de) : 0; hpfs_brelse4(&qbh); hpfs_free_dnode(s, dno); } d1 = d2; d2 = 0; } while (d1); return; error: hpfs_brelse4(&qbh); hpfs_free_dnode(s, dno); hpfs_error(s, "directory %08x is corrupted or not empty", rdno); } /* * Find dirent for specified fnode. Use truncated 15-char name in fnode as * a help for searching. */ struct hpfs_dirent *map_fnode_dirent(struct super_block *s, fnode_secno fno, struct fnode *f, struct quad_buffer_head *qbh) { unsigned char *name1; unsigned char *name2; int name1len, name2len; struct dnode *d; dnode_secno dno, downd; struct fnode *upf; struct buffer_head *bh; struct hpfs_dirent *de, *de_end; int c; int c1, c2 = 0; int d1, d2 = 0; name1 = f->name; if (!(name2 = kmalloc(256, GFP_NOFS))) { pr_err("out of memory, can't map dirent\n"); return NULL; } if (f->len <= 15) memcpy(name2, name1, name1len = name2len = f->len); else { memcpy(name2, name1, 15); memset(name2 + 15, 0xff, 256 - 15); /*name2[15] = 0xff;*/ name1len = 15; name2len = 256; } if (!(upf = hpfs_map_fnode(s, le32_to_cpu(f->up), &bh))) { kfree(name2); return NULL; } if (!fnode_is_dir(upf)) { brelse(bh); hpfs_error(s, "fnode %08x has non-directory parent %08x", fno, le32_to_cpu(f->up)); kfree(name2); return NULL; } dno = le32_to_cpu(upf->u.external[0].disk_secno); brelse(bh); go_down: downd = 0; go_up: if (!(d = hpfs_map_dnode(s, dno, qbh))) { kfree(name2); return NULL; } de_end = dnode_end_de(d); de = dnode_first_de(d); if (downd) { while (de < de_end) { if (de->down) if (de_down_pointer(de) == downd) goto f; de = de_next_de(de); } hpfs_error(s, "pointer to dnode %08x not found in dnode %08x", downd, dno); hpfs_brelse4(qbh); kfree(name2); return NULL; } next_de: if (le32_to_cpu(de->fnode) == fno) { kfree(name2); return de; } c = hpfs_compare_names(s, name1, name1len, de->name, de->namelen, de->last); if (c < 0 && de->down) { dno = de_down_pointer(de); hpfs_brelse4(qbh); if (hpfs_sb(s)->sb_chk) if (hpfs_stop_cycles(s, dno, &c1, &c2, "map_fnode_dirent #1")) { kfree(name2); return NULL; } goto go_down; } f: if (le32_to_cpu(de->fnode) == fno) { kfree(name2); return de; } c = hpfs_compare_names(s, name2, name2len, de->name, de->namelen, de->last); if (c < 0 && !de->last) goto not_found; if ((de = de_next_de(de)) < de_end) goto next_de; if (d->root_dnode) goto not_found; downd = dno; dno = le32_to_cpu(d->up); hpfs_brelse4(qbh); if (hpfs_sb(s)->sb_chk) if (hpfs_stop_cycles(s, downd, &d1, &d2, "map_fnode_dirent #2")) { kfree(name2); return NULL; } goto go_up; not_found: hpfs_brelse4(qbh); hpfs_error(s, "dirent for fnode %08x not found", fno); kfree(name2); return NULL; }
8 20 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 /* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (c) 2000-2006 Silicon Graphics, Inc. * All Rights Reserved. */ #ifndef __XFS_BMAP_H__ #define __XFS_BMAP_H__ struct getbmap; struct xfs_bmbt_irec; struct xfs_ifork; struct xfs_inode; struct xfs_mount; struct xfs_trans; struct xfs_alloc_arg; /* * Argument structure for xfs_bmap_alloc. */ struct xfs_bmalloca { struct xfs_trans *tp; /* transaction pointer */ struct xfs_inode *ip; /* incore inode pointer */ struct xfs_bmbt_irec prev; /* extent before the new one */ struct xfs_bmbt_irec got; /* extent after, or delayed */ xfs_fileoff_t offset; /* offset in file filling in */ xfs_extlen_t length; /* i/o length asked/allocated */ xfs_fsblock_t blkno; /* starting block of new extent */ struct xfs_btree_cur *cur; /* btree cursor */ struct xfs_iext_cursor icur; /* incore extent cursor */ int nallocs;/* number of extents alloc'd */ int logflags;/* flags for transaction logging */ xfs_extlen_t total; /* total blocks needed for xaction */ xfs_extlen_t minlen; /* minimum allocation size (blocks) */ xfs_extlen_t minleft; /* amount must be left after alloc */ bool eof; /* set if allocating past last extent */ bool wasdel; /* replacing a delayed allocation */ bool aeof; /* allocated space at eof */ bool conv; /* overwriting unwritten extents */ int datatype;/* data type being allocated */ uint32_t flags; }; #define XFS_BMAP_MAX_NMAP 4 /* * Flags for xfs_bmapi_* */ #define XFS_BMAPI_ENTIRE (1u << 0) /* return entire extent untrimmed */ #define XFS_BMAPI_METADATA (1u << 1) /* mapping metadata not user data */ #define XFS_BMAPI_ATTRFORK (1u << 2) /* use attribute fork not data */ #define XFS_BMAPI_PREALLOC (1u << 3) /* preallocating unwritten space */ #define XFS_BMAPI_CONTIG (1u << 4) /* must allocate only one extent */ /* * unwritten extent conversion - this needs write cache flushing and no additional * allocation alignments. When specified with XFS_BMAPI_PREALLOC it converts * from written to unwritten, otherwise convert from unwritten to written. */ #define XFS_BMAPI_CONVERT (1u << 5) /* * allocate zeroed extents - this requires all newly allocated user data extents * to be initialised to zero. It will be ignored if XFS_BMAPI_METADATA is set. * Use in conjunction with XFS_BMAPI_CONVERT to convert unwritten extents found * during the allocation range to zeroed written extents. */ #define XFS_BMAPI_ZERO (1u << 6) /* * Map the inode offset to the block given in ap->firstblock. Primarily * used for reflink. The range must be in a hole, and this flag cannot be * turned on with PREALLOC or CONVERT, and cannot be used on the attr fork. * * For bunmapi, this flag unmaps the range without adjusting quota, reducing * refcount, or freeing the blocks. */ #define XFS_BMAPI_REMAP (1u << 7) /* Map something in the CoW fork. */ #define XFS_BMAPI_COWFORK (1u << 8) /* Skip online discard of freed extents */ #define XFS_BMAPI_NODISCARD (1u << 9) /* Do not update the rmap btree. Used for reconstructing bmbt from rmapbt. */ #define XFS_BMAPI_NORMAP (1u << 10) /* Try to align allocations to the extent size hint */ #define XFS_BMAPI_EXTSZALIGN (1u << 11) #define XFS_BMAPI_FLAGS \ { XFS_BMAPI_ENTIRE, "ENTIRE" }, \ { XFS_BMAPI_METADATA, "METADATA" }, \ { XFS_BMAPI_ATTRFORK, "ATTRFORK" }, \ { XFS_BMAPI_PREALLOC, "PREALLOC" }, \ { XFS_BMAPI_CONTIG, "CONTIG" }, \ { XFS_BMAPI_CONVERT, "CONVERT" }, \ { XFS_BMAPI_ZERO, "ZERO" }, \ { XFS_BMAPI_REMAP, "REMAP" }, \ { XFS_BMAPI_COWFORK, "COWFORK" }, \ { XFS_BMAPI_NODISCARD, "NODISCARD" }, \ { XFS_BMAPI_NORMAP, "NORMAP" },\ { XFS_BMAPI_EXTSZALIGN, "EXTSZALIGN" } static inline int xfs_bmapi_aflag(int w) { return (w == XFS_ATTR_FORK ? XFS_BMAPI_ATTRFORK : (w == XFS_COW_FORK ? XFS_BMAPI_COWFORK : 0)); } static inline int xfs_bmapi_whichfork(uint32_t bmapi_flags) { if (bmapi_flags & XFS_BMAPI_COWFORK) return XFS_COW_FORK; else if (bmapi_flags & XFS_BMAPI_ATTRFORK) return XFS_ATTR_FORK; return XFS_DATA_FORK; } void xfs_bmap_alloc_account(struct xfs_bmalloca *ap); /* * Special values for xfs_bmbt_irec_t br_startblock field. */ #define DELAYSTARTBLOCK ((xfs_fsblock_t)-1LL) #define HOLESTARTBLOCK ((xfs_fsblock_t)-2LL) /* * Flags for xfs_bmap_add_extent*. */ #define BMAP_LEFT_CONTIG (1u << 0) #define BMAP_RIGHT_CONTIG (1u << 1) #define BMAP_LEFT_FILLING (1u << 2) #define BMAP_RIGHT_FILLING (1u << 3) #define BMAP_LEFT_DELAY (1u << 4) #define BMAP_RIGHT_DELAY (1u << 5) #define BMAP_LEFT_VALID (1u << 6) #define BMAP_RIGHT_VALID (1u << 7) #define BMAP_ATTRFORK (1u << 8) #define BMAP_COWFORK (1u << 9) #define XFS_BMAP_EXT_FLAGS \ { BMAP_LEFT_CONTIG, "LC" }, \ { BMAP_RIGHT_CONTIG, "RC" }, \ { BMAP_LEFT_FILLING, "LF" }, \ { BMAP_RIGHT_FILLING, "RF" }, \ { BMAP_ATTRFORK, "ATTR" }, \ { BMAP_COWFORK, "COW" } /* Return true if the extent is an allocated extent, written or not. */ static inline bool xfs_bmap_is_real_extent(const struct xfs_bmbt_irec *irec) { return irec->br_startblock != HOLESTARTBLOCK && irec->br_startblock != DELAYSTARTBLOCK && !isnullstartblock(irec->br_startblock); } /* * Return true if the extent is a real, allocated extent, or false if it is a * delayed allocation, and unwritten extent or a hole. */ static inline bool xfs_bmap_is_written_extent(const struct xfs_bmbt_irec *irec) { return xfs_bmap_is_real_extent(irec) && irec->br_state != XFS_EXT_UNWRITTEN; } /* * Check the mapping for obviously garbage allocations that could trash the * filesystem immediately. */ #define xfs_valid_startblock(ip, startblock) \ ((startblock) != 0 || XFS_IS_REALTIME_INODE(ip)) int xfs_bmap_longest_free_extent(struct xfs_perag *pag, struct xfs_trans *tp, xfs_extlen_t *blen); void xfs_trim_extent(struct xfs_bmbt_irec *irec, xfs_fileoff_t bno, xfs_filblks_t len); unsigned int xfs_bmap_compute_attr_offset(struct xfs_mount *mp); int xfs_bmap_add_attrfork(struct xfs_trans *tp, struct xfs_inode *ip, int size, int rsvd); void xfs_bmap_local_to_extents_empty(struct xfs_trans *tp, struct xfs_inode *ip, int whichfork); int xfs_bmap_local_to_extents(struct xfs_trans *tp, struct xfs_inode *ip, xfs_extlen_t total, int *logflagsp, int whichfork, void (*init_fn)(struct xfs_trans *tp, struct xfs_buf *bp, struct xfs_inode *ip, struct xfs_ifork *ifp, void *priv), void *priv); void xfs_bmap_compute_maxlevels(struct xfs_mount *mp, int whichfork); int xfs_bmap_first_unused(struct xfs_trans *tp, struct xfs_inode *ip, xfs_extlen_t len, xfs_fileoff_t *unused, int whichfork); int xfs_bmap_last_before(struct xfs_trans *tp, struct xfs_inode *ip, xfs_fileoff_t *last_block, int whichfork); int xfs_bmap_last_offset(struct xfs_inode *ip, xfs_fileoff_t *unused, int whichfork); int xfs_bmapi_read(struct xfs_inode *ip, xfs_fileoff_t bno, xfs_filblks_t len, struct xfs_bmbt_irec *mval, int *nmap, uint32_t flags); int xfs_bmapi_write(struct xfs_trans *tp, struct xfs_inode *ip, xfs_fileoff_t bno, xfs_filblks_t len, uint32_t flags, xfs_extlen_t total, struct xfs_bmbt_irec *mval, int *nmap); int xfs_bunmapi(struct xfs_trans *tp, struct xfs_inode *ip, xfs_fileoff_t bno, xfs_filblks_t len, uint32_t flags, xfs_extnum_t nexts, int *done); void xfs_bmap_del_extent_delay(struct xfs_inode *ip, int whichfork, struct xfs_iext_cursor *cur, struct xfs_bmbt_irec *got, struct xfs_bmbt_irec *del, uint32_t bflags); void xfs_bmap_del_extent_cow(struct xfs_inode *ip, struct xfs_iext_cursor *cur, struct xfs_bmbt_irec *got, struct xfs_bmbt_irec *del); uint xfs_default_attroffset(struct xfs_inode *ip); int xfs_bmap_collapse_extents(struct xfs_trans *tp, struct xfs_inode *ip, xfs_fileoff_t *next_fsb, xfs_fileoff_t offset_shift_fsb, bool *done); int xfs_bmap_can_insert_extents(struct xfs_inode *ip, xfs_fileoff_t off, xfs_fileoff_t shift); int xfs_bmap_insert_extents(struct xfs_trans *tp, struct xfs_inode *ip, xfs_fileoff_t *next_fsb, xfs_fileoff_t offset_shift_fsb, bool *done, xfs_fileoff_t stop_fsb); int xfs_bmap_split_extent(struct xfs_trans *tp, struct xfs_inode *ip, xfs_fileoff_t split_offset); int xfs_bmapi_convert_delalloc(struct xfs_inode *ip, int whichfork, xfs_off_t offset, struct iomap *iomap, unsigned int *seq); int xfs_bmap_add_extent_unwritten_real(struct xfs_trans *tp, struct xfs_inode *ip, int whichfork, struct xfs_iext_cursor *icur, struct xfs_btree_cur **curp, struct xfs_bmbt_irec *new, int *logflagsp); xfs_extlen_t xfs_bmapi_minleft(struct xfs_trans *tp, struct xfs_inode *ip, int fork); int xfs_bmap_btalloc_low_space(struct xfs_bmalloca *ap, struct xfs_alloc_arg *args); xfs_filblks_t xfs_bmap_worst_indlen(struct xfs_inode *ip, xfs_filblks_t len); enum xfs_bmap_intent_type { XFS_BMAP_MAP = 1, XFS_BMAP_UNMAP, }; #define XFS_BMAP_INTENT_STRINGS \ { XFS_BMAP_MAP, "map" }, \ { XFS_BMAP_UNMAP, "unmap" } struct xfs_bmap_intent { struct list_head bi_list; enum xfs_bmap_intent_type bi_type; int bi_whichfork; struct xfs_inode *bi_owner; struct xfs_group *bi_group; struct xfs_bmbt_irec bi_bmap; }; int xfs_bmap_finish_one(struct xfs_trans *tp, struct xfs_bmap_intent *bi); void xfs_bmap_map_extent(struct xfs_trans *tp, struct xfs_inode *ip, int whichfork, struct xfs_bmbt_irec *imap); void xfs_bmap_unmap_extent(struct xfs_trans *tp, struct xfs_inode *ip, int whichfork, struct xfs_bmbt_irec *imap); static inline uint32_t xfs_bmap_fork_to_state(int whichfork) { switch (whichfork) { case XFS_ATTR_FORK: return BMAP_ATTRFORK; case XFS_COW_FORK: return BMAP_COWFORK; default: return 0; } } xfs_failaddr_t xfs_bmap_validate_extent_raw(struct xfs_mount *mp, bool rtfile, int whichfork, struct xfs_bmbt_irec *irec); xfs_failaddr_t xfs_bmap_validate_extent(struct xfs_inode *ip, int whichfork, struct xfs_bmbt_irec *irec); int xfs_bmap_complain_bad_rec(struct xfs_inode *ip, int whichfork, xfs_failaddr_t fa, const struct xfs_bmbt_irec *irec); int xfs_bmapi_remap(struct xfs_trans *tp, struct xfs_inode *ip, xfs_fileoff_t bno, xfs_filblks_t len, xfs_fsblock_t startblock, uint32_t flags); int xfs_bunmapi_range(struct xfs_trans **tpp, struct xfs_inode *ip, uint32_t flags, xfs_fileoff_t startoff, xfs_fileoff_t endoff); extern struct kmem_cache *xfs_bmap_intent_cache; int __init xfs_bmap_intent_init_cache(void); void xfs_bmap_intent_destroy_cache(void); typedef int (*xfs_bmap_query_range_fn)( struct xfs_btree_cur *cur, struct xfs_bmbt_irec *rec, void *priv); int xfs_bmap_query_all(struct xfs_btree_cur *cur, xfs_bmap_query_range_fn fn, void *priv); xfs_extlen_t xfs_get_extsz_hint(struct xfs_inode *ip); xfs_extlen_t xfs_get_cowextsz_hint(struct xfs_inode *ip); #endif /* __XFS_BMAP_H__ */
100 99 100 99 27 79 83 39 50 20 67 83 4 37 26 36 77 20 39 6 68 30 2 41 1 2 6 5 3 2 3 2 33 33 2 20 10 4 23 12 42 31 50 10 25 9 2 33 59 83 100 44 37 58 58 100 100 99 58 1 58 59 7 100 108 107 106 75 3 3 100 100 100 9 99 100 100 7 100 100 3 3 59 58 100 108 1 6 1 100 102 100 99 2 98 115 35 83 14 99 115 9 5 8 8 1 79 125 125 1 127 127 125 5 127 79 51 5 126 79 51 91 1 81 2 93 126 30 114 5 124 9 6 113 8 7 126 2 41 7 7 7 7 7 7 7 1 7 1 7 1 1 7 7 1 1 1 1 9 9 5 5 5 10 10 6 11 11 12 11 42 42 12 32 12 32 47 42 7 47 12 2 5 2 42 47 13 13 12 10 1 1 3 7 1 9 2 2 2 2 2 5 7 3 22 3 1 22 22 30 1 3 5 11 10 1 10 1 1 3 5 1 4 1 7 1 3 3 2 7 9 10 46 1 13 15 20 8 7 19 11 8 22 9 22 22 1 22 23 20 2 22 2 21 24 6 3 3 2 2 12 5 12 10 5 1 2 1 1 3 1 7 4 10 2 5 3 3 7 3 1 1 4 2 5 2 1 3 2 2 4 2 8 13 2 12 9 109 107 3 62 41 23 192 147 52 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 // SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2007 Oracle. All rights reserved. */ #include <linux/fs.h> #include <linux/pagemap.h> #include <linux/time.h> #include <linux/init.h> #include <linux/string.h> #include <linux/backing-dev.h> #include <linux/falloc.h> #include <linux/writeback.h> #include <linux/compat.h> #include <linux/slab.h> #include <linux/btrfs.h> #include <linux/uio.h> #include <linux/iversion.h> #include <linux/fsverity.h> #include "ctree.h" #include "direct-io.h" #include "disk-io.h" #include "transaction.h" #include "btrfs_inode.h" #include "tree-log.h" #include "locking.h" #include "qgroup.h" #include "compression.h" #include "delalloc-space.h" #include "reflink.h" #include "subpage.h" #include "fs.h" #include "accessors.h" #include "extent-tree.h" #include "file-item.h" #include "ioctl.h" #include "file.h" #include "super.h" #include "print-tree.h" /* * Unlock folio after btrfs_file_write() is done with it. */ static void btrfs_drop_folio(struct btrfs_fs_info *fs_info, struct folio *folio, u64 pos, u64 copied) { u64 block_start = round_down(pos, fs_info->sectorsize); u64 block_len = round_up(pos + copied, fs_info->sectorsize) - block_start; ASSERT(block_len <= U32_MAX); /* * Folio checked is some magic around finding folios that have been * modified without going through btrfs_dirty_folio(). Clear it here. * There should be no need to mark the pages accessed as * prepare_one_folio() should have marked them accessed in * prepare_one_folio() via find_or_create_page() */ btrfs_folio_clamp_clear_checked(fs_info, folio, block_start, block_len); folio_unlock(folio); folio_put(folio); } /* * After copy_folio_from_iter_atomic(), update the following things for delalloc: * - Mark newly dirtied folio as DELALLOC in the io tree. * Used to advise which range is to be written back. * - Mark modified folio as Uptodate/Dirty and not needing COW fixup * - Update inode size for past EOF write */ int btrfs_dirty_folio(struct btrfs_inode *inode, struct folio *folio, loff_t pos, size_t write_bytes, struct extent_state **cached, bool noreserve) { struct btrfs_fs_info *fs_info = inode->root->fs_info; int ret = 0; u64 num_bytes; u64 start_pos; u64 end_of_last_block; u64 end_pos = pos + write_bytes; loff_t isize = i_size_read(&inode->vfs_inode); unsigned int extra_bits = 0; if (write_bytes == 0) return 0; if (noreserve) extra_bits |= EXTENT_NORESERVE; start_pos = round_down(pos, fs_info->sectorsize); num_bytes = round_up(write_bytes + pos - start_pos, fs_info->sectorsize); ASSERT(num_bytes <= U32_MAX); ASSERT(folio_pos(folio) <= pos && folio_end(folio) >= pos + write_bytes); end_of_last_block = start_pos + num_bytes - 1; /* * The pages may have already been dirty, clear out old accounting so * we can set things up properly */ btrfs_clear_extent_bit(&inode->io_tree, start_pos, end_of_last_block, EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, cached); ret = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block, extra_bits, cached); if (ret) return ret; btrfs_folio_clamp_set_uptodate(fs_info, folio, start_pos, num_bytes); btrfs_folio_clamp_clear_checked(fs_info, folio, start_pos, num_bytes); btrfs_folio_clamp_set_dirty(fs_info, folio, start_pos, num_bytes); /* * we've only changed i_size in ram, and we haven't updated * the disk i_size. There is no need to log the inode * at this time. */ if (end_pos > isize) i_size_write(&inode->vfs_inode, end_pos); return 0; } /* * this is very complex, but the basic idea is to drop all extents * in the range start - end. hint_block is filled in with a block number * that would be a good hint to the block allocator for this file. * * If an extent intersects the range but is not entirely inside the range * it is either truncated or split. Anything entirely inside the range * is deleted from the tree. * * Note: the VFS' inode number of bytes is not updated, it's up to the caller * to deal with that. We set the field 'bytes_found' of the arguments structure * with the number of allocated bytes found in the target range, so that the * caller can update the inode's number of bytes in an atomic way when * replacing extents in a range to avoid races with stat(2). */ int btrfs_drop_extents(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_inode *inode, struct btrfs_drop_extents_args *args) { struct btrfs_fs_info *fs_info = root->fs_info; struct extent_buffer *leaf; struct btrfs_file_extent_item *fi; struct btrfs_key key; struct btrfs_key new_key; u64 ino = btrfs_ino(inode); u64 search_start = args->start; u64 disk_bytenr = 0; u64 num_bytes = 0; u64 extent_offset = 0; u64 extent_end = 0; u64 last_end = args->start; int del_nr = 0; int del_slot = 0; int extent_type; int recow; int ret; int modify_tree = -1; int update_refs; int found = 0; struct btrfs_path *path = args->path; args->bytes_found = 0; args->extent_inserted = false; /* Must always have a path if ->replace_extent is true */ ASSERT(!(args->replace_extent && !args->path)); if (!path) { path = btrfs_alloc_path(); if (!path) { ret = -ENOMEM; goto out; } } if (args->drop_cache) btrfs_drop_extent_map_range(inode, args->start, args->end - 1, false); if (data_race(args->start >= inode->disk_i_size) && !args->replace_extent) modify_tree = 0; update_refs = (btrfs_root_id(root) != BTRFS_TREE_LOG_OBJECTID); while (1) { recow = 0; ret = btrfs_lookup_file_extent(trans, root, path, ino, search_start, modify_tree); if (ret < 0) break; if (ret > 0 && path->slots[0] > 0 && search_start == args->start) { leaf = path->nodes[0]; btrfs_item_key_to_cpu(leaf, &key, path->slots[0] - 1); if (key.objectid == ino && key.type == BTRFS_EXTENT_DATA_KEY) path->slots[0]--; } ret = 0; next_slot: leaf = path->nodes[0]; if (path->slots[0] >= btrfs_header_nritems(leaf)) { if (WARN_ON(del_nr > 0)) { btrfs_print_leaf(leaf); ret = -EINVAL; break; } ret = btrfs_next_leaf(root, path); if (ret < 0) break; if (ret > 0) { ret = 0; break; } leaf = path->nodes[0]; recow = 1; } btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); if (key.objectid > ino) break; if (WARN_ON_ONCE(key.objectid < ino) || key.type < BTRFS_EXTENT_DATA_KEY) { ASSERT(del_nr == 0); path->slots[0]++; goto next_slot; } if (key.type > BTRFS_EXTENT_DATA_KEY || key.offset >= args->end) break; fi = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); extent_type = btrfs_file_extent_type(leaf, fi); if (extent_type == BTRFS_FILE_EXTENT_REG || extent_type == BTRFS_FILE_EXTENT_PREALLOC) { disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi); extent_offset = btrfs_file_extent_offset(leaf, fi); extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi); } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) { extent_end = key.offset + btrfs_file_extent_ram_bytes(leaf, fi); } else { /* can't happen */ BUG(); } /* * Don't skip extent items representing 0 byte lengths. They * used to be created (bug) if while punching holes we hit * -ENOSPC condition. So if we find one here, just ensure we * delete it, otherwise we would insert a new file extent item * with the same key (offset) as that 0 bytes length file * extent item in the call to setup_items_for_insert() later * in this function. */ if (extent_end == key.offset && extent_end >= search_start) { last_end = extent_end; goto delete_extent_item; } if (extent_end <= search_start) { path->slots[0]++; goto next_slot; } found = 1; search_start = max(key.offset, args->start); if (recow || !modify_tree) { modify_tree = -1; btrfs_release_path(path); continue; } /* * | - range to drop - | * | -------- extent -------- | */ if (args->start > key.offset && args->end < extent_end) { if (WARN_ON(del_nr > 0)) { btrfs_print_leaf(leaf); ret = -EINVAL; break; } if (extent_type == BTRFS_FILE_EXTENT_INLINE) { ret = -EOPNOTSUPP; break; } memcpy(&new_key, &key, sizeof(new_key)); new_key.offset = args->start; ret = btrfs_duplicate_item(trans, root, path, &new_key); if (ret == -EAGAIN) { btrfs_release_path(path); continue; } if (ret < 0) break; leaf = path->nodes[0]; fi = btrfs_item_ptr(leaf, path->slots[0] - 1, struct btrfs_file_extent_item); btrfs_set_file_extent_num_bytes(leaf, fi, args->start - key.offset); fi = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); extent_offset += args->start - key.offset; btrfs_set_file_extent_offset(leaf, fi, extent_offset); btrfs_set_file_extent_num_bytes(leaf, fi, extent_end - args->start); if (update_refs && disk_bytenr > 0) { struct btrfs_ref ref = { .action = BTRFS_ADD_DELAYED_REF, .bytenr = disk_bytenr, .num_bytes = num_bytes, .parent = 0, .owning_root = btrfs_root_id(root), .ref_root = btrfs_root_id(root), }; btrfs_init_data_ref(&ref, new_key.objectid, args->start - extent_offset, 0, false); ret = btrfs_inc_extent_ref(trans, &ref); if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); break; } } key.offset = args->start; } /* * From here on out we will have actually dropped something, so * last_end can be updated. */ last_end = extent_end; /* * | ---- range to drop ----- | * | -------- extent -------- | */ if (args->start <= key.offset && args->end < extent_end) { if (extent_type == BTRFS_FILE_EXTENT_INLINE) { ret = -EOPNOTSUPP; break; } memcpy(&new_key, &key, sizeof(new_key)); new_key.offset = args->end; btrfs_set_item_key_safe(trans, path, &new_key); extent_offset += args->end - key.offset; btrfs_set_file_extent_offset(leaf, fi, extent_offset); btrfs_set_file_extent_num_bytes(leaf, fi, extent_end - args->end); if (update_refs && disk_bytenr > 0) args->bytes_found += args->end - key.offset; break; } search_start = extent_end; /* * | ---- range to drop ----- | * | -------- extent -------- | */ if (args->start > key.offset && args->end >= extent_end) { if (WARN_ON(del_nr > 0)) { btrfs_print_leaf(leaf); ret = -EINVAL; break; } if (extent_type == BTRFS_FILE_EXTENT_INLINE) { ret = -EOPNOTSUPP; break; } btrfs_set_file_extent_num_bytes(leaf, fi, args->start - key.offset); if (update_refs && disk_bytenr > 0) args->bytes_found += extent_end - args->start; if (args->end == extent_end) break; path->slots[0]++; goto next_slot; } /* * | ---- range to drop ----- | * | ------ extent ------ | */ if (args->start <= key.offset && args->end >= extent_end) { delete_extent_item: if (del_nr == 0) { del_slot = path->slots[0]; del_nr = 1; } else { if (WARN_ON(del_slot + del_nr != path->slots[0])) { btrfs_print_leaf(leaf); ret = -EINVAL; break; } del_nr++; } if (update_refs && extent_type == BTRFS_FILE_EXTENT_INLINE) { args->bytes_found += extent_end - key.offset; extent_end = ALIGN(extent_end, fs_info->sectorsize); } else if (update_refs && disk_bytenr > 0) { struct btrfs_ref ref = { .action = BTRFS_DROP_DELAYED_REF, .bytenr = disk_bytenr, .num_bytes = num_bytes, .parent = 0, .owning_root = btrfs_root_id(root), .ref_root = btrfs_root_id(root), }; btrfs_init_data_ref(&ref, key.objectid, key.offset - extent_offset, 0, false); ret = btrfs_free_extent(trans, &ref); if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); break; } args->bytes_found += extent_end - key.offset; } if (args->end == extent_end) break; if (path->slots[0] + 1 < btrfs_header_nritems(leaf)) { path->slots[0]++; goto next_slot; } ret = btrfs_del_items(trans, root, path, del_slot, del_nr); if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); break; } del_nr = 0; del_slot = 0; btrfs_release_path(path); continue; } BUG(); } if (!ret && del_nr > 0) { /* * Set path->slots[0] to first slot, so that after the delete * if items are move off from our leaf to its immediate left or * right neighbor leafs, we end up with a correct and adjusted * path->slots[0] for our insertion (if args->replace_extent). */ path->slots[0] = del_slot; ret = btrfs_del_items(trans, root, path, del_slot, del_nr); if (ret) btrfs_abort_transaction(trans, ret); } leaf = path->nodes[0]; /* * If btrfs_del_items() was called, it might have deleted a leaf, in * which case it unlocked our path, so check path->locks[0] matches a * write lock. */ if (!ret && args->replace_extent && path->locks[0] == BTRFS_WRITE_LOCK && btrfs_leaf_free_space(leaf) >= sizeof(struct btrfs_item) + args->extent_item_size) { key.objectid = ino; key.type = BTRFS_EXTENT_DATA_KEY; key.offset = args->start; if (!del_nr && path->slots[0] < btrfs_header_nritems(leaf)) { struct btrfs_key slot_key; btrfs_item_key_to_cpu(leaf, &slot_key, path->slots[0]); if (btrfs_comp_cpu_keys(&key, &slot_key) > 0) path->slots[0]++; } btrfs_setup_item_for_insert(trans, root, path, &key, args->extent_item_size); args->extent_inserted = true; } if (!args->path) btrfs_free_path(path); else if (!args->extent_inserted) btrfs_release_path(path); out: args->drop_end = found ? min(args->end, last_end) : args->end; return ret; } static bool extent_mergeable(struct extent_buffer *leaf, int slot, u64 objectid, u64 bytenr, u64 orig_offset, u64 *start, u64 *end) { struct btrfs_file_extent_item *fi; struct btrfs_key key; u64 extent_end; if (slot < 0 || slot >= btrfs_header_nritems(leaf)) return false; btrfs_item_key_to_cpu(leaf, &key, slot); if (key.objectid != objectid || key.type != BTRFS_EXTENT_DATA_KEY) return false; fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item); if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_REG || btrfs_file_extent_disk_bytenr(leaf, fi) != bytenr || btrfs_file_extent_offset(leaf, fi) != key.offset - orig_offset || btrfs_file_extent_compression(leaf, fi) || btrfs_file_extent_encryption(leaf, fi) || btrfs_file_extent_other_encoding(leaf, fi)) return false; extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi); if ((*start && *start != key.offset) || (*end && *end != extent_end)) return false; *start = key.offset; *end = extent_end; return true; } /* * Mark extent in the range start - end as written. * * This changes extent type from 'pre-allocated' to 'regular'. If only * part of extent is marked as written, the extent will be split into * two or three. */ int btrfs_mark_extent_written(struct btrfs_trans_handle *trans, struct btrfs_inode *inode, u64 start, u64 end) { struct btrfs_root *root = inode->root; struct extent_buffer *leaf; BTRFS_PATH_AUTO_FREE(path); struct btrfs_file_extent_item *fi; struct btrfs_ref ref = { 0 }; struct btrfs_key key; struct btrfs_key new_key; u64 bytenr; u64 num_bytes; u64 extent_end; u64 orig_offset; u64 other_start; u64 other_end; u64 split; int del_nr = 0; int del_slot = 0; int recow; int ret = 0; u64 ino = btrfs_ino(inode); path = btrfs_alloc_path(); if (!path) return -ENOMEM; again: recow = 0; split = start; key.objectid = ino; key.type = BTRFS_EXTENT_DATA_KEY; key.offset = split; ret = btrfs_search_slot(trans, root, &key, path, -1, 1); if (ret < 0) goto out; if (ret > 0 && path->slots[0] > 0) path->slots[0]--; leaf = path->nodes[0]; btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); if (unlikely(key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY)) { ret = -EINVAL; btrfs_abort_transaction(trans, ret); goto out; } fi = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); if (unlikely(btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_PREALLOC)) { ret = -EINVAL; btrfs_abort_transaction(trans, ret); goto out; } extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi); if (unlikely(key.offset > start || extent_end < end)) { ret = -EINVAL; btrfs_abort_transaction(trans, ret); goto out; } bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi); orig_offset = key.offset - btrfs_file_extent_offset(leaf, fi); memcpy(&new_key, &key, sizeof(new_key)); if (start == key.offset && end < extent_end) { other_start = 0; other_end = start; if (extent_mergeable(leaf, path->slots[0] - 1, ino, bytenr, orig_offset, &other_start, &other_end)) { new_key.offset = end; btrfs_set_item_key_safe(trans, path, &new_key); fi = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); btrfs_set_file_extent_generation(leaf, fi, trans->transid); btrfs_set_file_extent_num_bytes(leaf, fi, extent_end - end); btrfs_set_file_extent_offset(leaf, fi, end - orig_offset); fi = btrfs_item_ptr(leaf, path->slots[0] - 1, struct btrfs_file_extent_item); btrfs_set_file_extent_generation(leaf, fi, trans->transid); btrfs_set_file_extent_num_bytes(leaf, fi, end - other_start); goto out; } } if (start > key.offset && end == extent_end) { other_start = end; other_end = 0; if (extent_mergeable(leaf, path->slots[0] + 1, ino, bytenr, orig_offset, &other_start, &other_end)) { fi = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); btrfs_set_file_extent_num_bytes(leaf, fi, start - key.offset); btrfs_set_file_extent_generation(leaf, fi, trans->transid); path->slots[0]++; new_key.offset = start; btrfs_set_item_key_safe(trans, path, &new_key); fi = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); btrfs_set_file_extent_generation(leaf, fi, trans->transid); btrfs_set_file_extent_num_bytes(leaf, fi, other_end - start); btrfs_set_file_extent_offset(leaf, fi, start - orig_offset); goto out; } } while (start > key.offset || end < extent_end) { if (key.offset == start) split = end; new_key.offset = split; ret = btrfs_duplicate_item(trans, root, path, &new_key); if (ret == -EAGAIN) { btrfs_release_path(path); goto again; } if (unlikely(ret < 0)) { btrfs_abort_transaction(trans, ret); goto out; } leaf = path->nodes[0]; fi = btrfs_item_ptr(leaf, path->slots[0] - 1, struct btrfs_file_extent_item); btrfs_set_file_extent_generation(leaf, fi, trans->transid); btrfs_set_file_extent_num_bytes(leaf, fi, split - key.offset); fi = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); btrfs_set_file_extent_generation(leaf, fi, trans->transid); btrfs_set_file_extent_offset(leaf, fi, split - orig_offset); btrfs_set_file_extent_num_bytes(leaf, fi, extent_end - split); ref.action = BTRFS_ADD_DELAYED_REF; ref.bytenr = bytenr; ref.num_bytes = num_bytes; ref.parent = 0; ref.owning_root = btrfs_root_id(root); ref.ref_root = btrfs_root_id(root); btrfs_init_data_ref(&ref, ino, orig_offset, 0, false); ret = btrfs_inc_extent_ref(trans, &ref); if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out; } if (split == start) { key.offset = start; } else { if (unlikely(start != key.offset)) { ret = -EINVAL; btrfs_abort_transaction(trans, ret); goto out; } path->slots[0]--; extent_end = end; } recow = 1; } other_start = end; other_end = 0; ref.action = BTRFS_DROP_DELAYED_REF; ref.bytenr = bytenr; ref.num_bytes = num_bytes; ref.parent = 0; ref.owning_root = btrfs_root_id(root); ref.ref_root = btrfs_root_id(root); btrfs_init_data_ref(&ref, ino, orig_offset, 0, false); if (extent_mergeable(leaf, path->slots[0] + 1, ino, bytenr, orig_offset, &other_start, &other_end)) { if (recow) { btrfs_release_path(path); goto again; } extent_end = other_end; del_slot = path->slots[0] + 1; del_nr++; ret = btrfs_free_extent(trans, &ref); if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out; } } other_start = 0; other_end = start; if (extent_mergeable(leaf, path->slots[0] - 1, ino, bytenr, orig_offset, &other_start, &other_end)) { if (recow) { btrfs_release_path(path); goto again; } key.offset = other_start; del_slot = path->slots[0]; del_nr++; ret = btrfs_free_extent(trans, &ref); if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out; } } if (del_nr == 0) { fi = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); btrfs_set_file_extent_type(leaf, fi, BTRFS_FILE_EXTENT_REG); btrfs_set_file_extent_generation(leaf, fi, trans->transid); } else { fi = btrfs_item_ptr(leaf, del_slot - 1, struct btrfs_file_extent_item); btrfs_set_file_extent_type(leaf, fi, BTRFS_FILE_EXTENT_REG); btrfs_set_file_extent_generation(leaf, fi, trans->transid); btrfs_set_file_extent_num_bytes(leaf, fi, extent_end - key.offset); ret = btrfs_del_items(trans, root, path, del_slot, del_nr); if (unlikely(ret < 0)) { btrfs_abort_transaction(trans, ret); goto out; } } out: return ret; } /* * On error return an unlocked folio and the error value * On success return a locked folio and 0 */ static int prepare_uptodate_folio(struct inode *inode, struct folio *folio, u64 pos, u64 len) { u64 clamp_start = max_t(u64, pos, folio_pos(folio)); u64 clamp_end = min_t(u64, pos + len, folio_end(folio)); const u32 blocksize = inode_to_fs_info(inode)->sectorsize; int ret = 0; if (folio_test_uptodate(folio)) return 0; if (IS_ALIGNED(clamp_start, blocksize) && IS_ALIGNED(clamp_end, blocksize)) return 0; ret = btrfs_read_folio(NULL, folio); if (ret) return ret; folio_lock(folio); if (unlikely(!folio_test_uptodate(folio))) { folio_unlock(folio); return -EIO; } /* * Since btrfs_read_folio() will unlock the folio before it returns, * there is a window where btrfs_release_folio() can be called to * release the page. Here we check both inode mapping and page * private to make sure the page was not released. * * The private flag check is essential for subpage as we need to store * extra bitmap using folio private. */ if (folio->mapping != inode->i_mapping || !folio_test_private(folio)) { folio_unlock(folio); return -EAGAIN; } return 0; } static gfp_t get_prepare_gfp_flags(struct inode *inode, bool nowait) { gfp_t gfp; gfp = btrfs_alloc_write_mask(inode->i_mapping); if (nowait) { gfp &= ~__GFP_DIRECT_RECLAIM; gfp |= GFP_NOWAIT; } return gfp; } /* * Get folio into the page cache and lock it. */ static noinline int prepare_one_folio(struct inode *inode, struct folio **folio_ret, loff_t pos, size_t write_bytes, bool nowait) { const pgoff_t index = pos >> PAGE_SHIFT; gfp_t mask = get_prepare_gfp_flags(inode, nowait); fgf_t fgp_flags = (nowait ? FGP_WRITEBEGIN | FGP_NOWAIT : FGP_WRITEBEGIN) | fgf_set_order(write_bytes); struct folio *folio; int ret = 0; again: folio = __filemap_get_folio(inode->i_mapping, index, fgp_flags, mask); if (IS_ERR(folio)) return PTR_ERR(folio); ret = set_folio_extent_mapped(folio); if (ret < 0) { folio_unlock(folio); folio_put(folio); return ret; } ret = prepare_uptodate_folio(inode, folio, pos, write_bytes); if (ret) { /* The folio is already unlocked. */ folio_put(folio); if (!nowait && ret == -EAGAIN) { ret = 0; goto again; } return ret; } *folio_ret = folio; return 0; } /* * Locks the extent and properly waits for data=ordered extents to finish * before allowing the folios to be modified if need. * * Return: * 1 - the extent is locked * 0 - the extent is not locked, and everything is OK * -EAGAIN - need to prepare the folios again */ static noinline int lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct folio *folio, loff_t pos, size_t write_bytes, u64 *lockstart, u64 *lockend, bool nowait, struct extent_state **cached_state) { struct btrfs_fs_info *fs_info = inode->root->fs_info; u64 start_pos; u64 last_pos; int ret = 0; start_pos = round_down(pos, fs_info->sectorsize); last_pos = round_up(pos + write_bytes, fs_info->sectorsize) - 1; if (start_pos < inode->vfs_inode.i_size) { struct btrfs_ordered_extent *ordered; if (nowait) { if (!btrfs_try_lock_extent(&inode->io_tree, start_pos, last_pos, cached_state)) { folio_unlock(folio); folio_put(folio); return -EAGAIN; } } else { btrfs_lock_extent(&inode->io_tree, start_pos, last_pos, cached_state); } ordered = btrfs_lookup_ordered_range(inode, start_pos, last_pos - start_pos + 1); if (ordered && ordered->file_offset + ordered->num_bytes > start_pos && ordered->file_offset <= last_pos) { btrfs_unlock_extent(&inode->io_tree, start_pos, last_pos, cached_state); folio_unlock(folio); folio_put(folio); btrfs_start_ordered_extent(ordered); btrfs_put_ordered_extent(ordered); return -EAGAIN; } if (ordered) btrfs_put_ordered_extent(ordered); *lockstart = start_pos; *lockend = last_pos; ret = 1; } /* * We should be called after prepare_one_folio() which should have locked * all pages in the range. */ WARN_ON(!folio_test_locked(folio)); return ret; } /* * Check if we can do nocow write into the range [@pos, @pos + @write_bytes) * * @pos: File offset. * @write_bytes: The length to write, will be updated to the nocow writeable * range. * @nowait: Indicate if we can block or not (non-blocking IO context). * * This function will flush ordered extents in the range to ensure proper * nocow checks. * * Return: * > 0 If we can nocow, and updates @write_bytes. * 0 If we can't do a nocow write. * -EAGAIN If we can't do a nocow write because snapshotting of the inode's * root is in progress or because we are in a non-blocking IO * context and need to block (@nowait is true). * < 0 If an error happened. * * NOTE: Callers need to call btrfs_check_nocow_unlock() if we return > 0. */ int btrfs_check_nocow_lock(struct btrfs_inode *inode, loff_t pos, size_t *write_bytes, bool nowait) { struct btrfs_fs_info *fs_info = inode->root->fs_info; struct btrfs_root *root = inode->root; struct extent_state *cached_state = NULL; u64 lockstart, lockend; u64 cur_offset; int ret = 0; if (!(inode->flags & (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC))) return 0; if (!btrfs_drew_try_write_lock(&root->snapshot_lock)) return -EAGAIN; lockstart = round_down(pos, fs_info->sectorsize); lockend = round_up(pos + *write_bytes, fs_info->sectorsize) - 1; if (nowait) { if (!btrfs_try_lock_ordered_range(inode, lockstart, lockend, &cached_state)) { btrfs_drew_write_unlock(&root->snapshot_lock); return -EAGAIN; } } else { btrfs_lock_and_flush_ordered_range(inode, lockstart, lockend, &cached_state); } cur_offset = lockstart; while (cur_offset < lockend) { u64 num_bytes = lockend - cur_offset + 1; ret = can_nocow_extent(inode, cur_offset, &num_bytes, NULL, nowait); if (ret <= 0) { /* * If cur_offset == lockstart it means we haven't found * any extent against which we can NOCOW, so unlock the * snapshot lock. */ if (cur_offset == lockstart) btrfs_drew_write_unlock(&root->snapshot_lock); break; } cur_offset += num_bytes; } btrfs_unlock_extent(&inode->io_tree, lockstart, lockend, &cached_state); /* * cur_offset > lockstart means there's at least a partial range we can * NOCOW, and that range can cover one or more extents. */ if (cur_offset > lockstart) { *write_bytes = min_t(size_t, *write_bytes, cur_offset - pos); return 1; } return ret; } void btrfs_check_nocow_unlock(struct btrfs_inode *inode) { btrfs_drew_write_unlock(&inode->root->snapshot_lock); } int btrfs_write_check(struct kiocb *iocb, size_t count) { struct file *file = iocb->ki_filp; struct inode *inode = file_inode(file); struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); loff_t pos = iocb->ki_pos; int ret; loff_t oldsize; /* * Quickly bail out on NOWAIT writes if we don't have the nodatacow or * prealloc flags, as without those flags we always have to COW. We will * later check if we can really COW into the target range (using * can_nocow_extent() at btrfs_get_blocks_direct_write()). */ if ((iocb->ki_flags & IOCB_NOWAIT) && !(BTRFS_I(inode)->flags & (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC))) return -EAGAIN; ret = file_remove_privs(file); if (ret) return ret; /* * We reserve space for updating the inode when we reserve space for the * extent we are going to write, so we will enospc out there. We don't * need to start yet another transaction to update the inode as we will * update the inode when we finish writing whatever data we write. */ if (!IS_NOCMTIME(inode)) { inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); inode_inc_iversion(inode); } oldsize = i_size_read(inode); if (pos > oldsize) { /* Expand hole size to cover write data, preventing empty gap */ loff_t end_pos = round_up(pos + count, fs_info->sectorsize); ret = btrfs_cont_expand(BTRFS_I(inode), oldsize, end_pos); if (ret) return ret; } return 0; } static void release_space(struct btrfs_inode *inode, struct extent_changeset *data_reserved, u64 start, u64 len, bool only_release_metadata) { if (len == 0) return; if (only_release_metadata) { btrfs_check_nocow_unlock(inode); btrfs_delalloc_release_metadata(inode, len, true); } else { const struct btrfs_fs_info *fs_info = inode->root->fs_info; btrfs_delalloc_release_space(inode, data_reserved, round_down(start, fs_info->sectorsize), len, true); } } /* * Reserve data and metadata space for this buffered write range. * * Return >0 for the number of bytes reserved, which is always block aligned. * Return <0 for error. */ static ssize_t reserve_space(struct btrfs_inode *inode, struct extent_changeset **data_reserved, u64 start, size_t *len, bool nowait, bool *only_release_metadata) { const struct btrfs_fs_info *fs_info = inode->root->fs_info; const unsigned int block_offset = (start & (fs_info->sectorsize - 1)); size_t reserve_bytes; int ret; ret = btrfs_check_data_free_space(inode, data_reserved, start, *len, nowait); if (ret < 0) { int can_nocow; if (nowait && (ret == -ENOSPC || ret == -EAGAIN)) return -EAGAIN; /* * If we don't have to COW at the offset, reserve metadata only. * write_bytes may get smaller than requested here. */ can_nocow = btrfs_check_nocow_lock(inode, start, len, nowait); if (can_nocow < 0) ret = can_nocow; if (can_nocow > 0) ret = 0; if (ret) return ret; *only_release_metadata = true; } reserve_bytes = round_up(*len + block_offset, fs_info->sectorsize); WARN_ON(reserve_bytes == 0); ret = btrfs_delalloc_reserve_metadata(inode, reserve_bytes, reserve_bytes, nowait); if (ret) { if (!*only_release_metadata) btrfs_free_reserved_data_space(inode, *data_reserved, start, *len); else btrfs_check_nocow_unlock(inode); if (nowait && ret == -ENOSPC) ret = -EAGAIN; return ret; } return reserve_bytes; } /* Shrink the reserved data and metadata space from @reserved_len to @new_len. */ static void shrink_reserved_space(struct btrfs_inode *inode, struct extent_changeset *data_reserved, u64 reserved_start, u64 reserved_len, u64 new_len, bool only_release_metadata) { const u64 diff = reserved_len - new_len; ASSERT(new_len <= reserved_len); btrfs_delalloc_shrink_extents(inode, reserved_len, new_len); if (only_release_metadata) btrfs_delalloc_release_metadata(inode, diff, true); else btrfs_delalloc_release_space(inode, data_reserved, reserved_start + new_len, diff, true); } /* Calculate the maximum amount of bytes we can write into one folio. */ static size_t calc_write_bytes(const struct btrfs_inode *inode, const struct iov_iter *iter, u64 start) { const size_t max_folio_size = mapping_max_folio_size(inode->vfs_inode.i_mapping); return min(max_folio_size - (start & (max_folio_size - 1)), iov_iter_count(iter)); } /* * Do the heavy-lifting work to copy one range into one folio of the page cache. * * Return > 0 in case we copied all bytes or just some of them. * Return 0 if no bytes were copied, in which case the caller should retry. * Return <0 on error. */ static int copy_one_range(struct btrfs_inode *inode, struct iov_iter *iter, struct extent_changeset **data_reserved, u64 start, bool nowait) { struct btrfs_fs_info *fs_info = inode->root->fs_info; struct extent_state *cached_state = NULL; size_t write_bytes = calc_write_bytes(inode, iter, start); size_t copied; const u64 reserved_start = round_down(start, fs_info->sectorsize); u64 reserved_len; struct folio *folio = NULL; int extents_locked; u64 lockstart; u64 lockend; bool only_release_metadata = false; const unsigned int bdp_flags = (nowait ? BDP_ASYNC : 0); int ret; /* * Fault all pages before locking them in prepare_one_folio() to avoid * recursive lock. */ if (unlikely(fault_in_iov_iter_readable(iter, write_bytes))) return -EFAULT; extent_changeset_release(*data_reserved); ret = reserve_space(inode, data_reserved, start, &write_bytes, nowait, &only_release_metadata); if (ret < 0) return ret; reserved_len = ret; /* Write range must be inside the reserved range. */ ASSERT(reserved_start <= start); ASSERT(start + write_bytes <= reserved_start + reserved_len); again: ret = balance_dirty_pages_ratelimited_flags(inode->vfs_inode.i_mapping, bdp_flags); if (ret) { btrfs_delalloc_release_extents(inode, reserved_len); release_space(inode, *data_reserved, reserved_start, reserved_len, only_release_metadata); return ret; } ret = prepare_one_folio(&inode->vfs_inode, &folio, start, write_bytes, false); if (ret) { btrfs_delalloc_release_extents(inode, reserved_len); release_space(inode, *data_reserved, reserved_start, reserved_len, only_release_metadata); return ret; } /* * The reserved range goes beyond the current folio, shrink the reserved * space to the folio boundary. */ if (reserved_start + reserved_len > folio_end(folio)) { const u64 last_block = folio_end(folio); shrink_reserved_space(inode, *data_reserved, reserved_start, reserved_len, last_block - reserved_start, only_release_metadata); write_bytes = last_block - start; reserved_len = last_block - reserved_start; } extents_locked = lock_and_cleanup_extent_if_need(inode, folio, start, write_bytes, &lockstart, &lockend, nowait, &cached_state); if (extents_locked < 0) { if (!nowait && extents_locked == -EAGAIN) goto again; btrfs_delalloc_release_extents(inode, reserved_len); release_space(inode, *data_reserved, reserved_start, reserved_len, only_release_metadata); ret = extents_locked; return ret; } copied = copy_folio_from_iter_atomic(folio, offset_in_folio(folio, start), write_bytes, iter); flush_dcache_folio(folio); if (unlikely(copied < write_bytes)) { u64 last_block; /* * The original write range doesn't need an uptodate folio as * the range is block aligned. But now a short copy happened. * We cannot handle it without an uptodate folio. * * So just revert the range and we will retry. */ if (!folio_test_uptodate(folio)) { iov_iter_revert(iter, copied); copied = 0; } /* No copied bytes, unlock, release reserved space and exit. */ if (copied == 0) { if (extents_locked) btrfs_unlock_extent(&inode->io_tree, lockstart, lockend, &cached_state); else btrfs_free_extent_state(cached_state); btrfs_delalloc_release_extents(inode, reserved_len); release_space(inode, *data_reserved, reserved_start, reserved_len, only_release_metadata); btrfs_drop_folio(fs_info, folio, start, copied); return 0; } /* Release the reserved space beyond the last block. */ last_block = round_up(start + copied, fs_info->sectorsize); shrink_reserved_space(inode, *data_reserved, reserved_start, reserved_len, last_block - reserved_start, only_release_metadata); reserved_len = last_block - reserved_start; } ret = btrfs_dirty_folio(inode, folio, start, copied, &cached_state, only_release_metadata); /* * If we have not locked the extent range, because the range's start * offset is >= i_size, we might still have a non-NULL cached extent * state, acquired while marking the extent range as delalloc through * btrfs_dirty_page(). Therefore free any possible cached extent state * to avoid a memory leak. */ if (extents_locked) btrfs_unlock_extent(&inode->io_tree, lockstart, lockend, &cached_state); else btrfs_free_extent_state(cached_state); btrfs_delalloc_release_extents(inode, reserved_len); if (ret) { btrfs_drop_folio(fs_info, folio, start, copied); release_space(inode, *data_reserved, reserved_start, reserved_len, only_release_metadata); return ret; } if (only_release_metadata) btrfs_check_nocow_unlock(inode); btrfs_drop_folio(fs_info, folio, start, copied); return copied; } ssize_t btrfs_buffered_write(struct kiocb *iocb, struct iov_iter *iter) { struct file *file = iocb->ki_filp; loff_t pos; struct inode *inode = file_inode(file); struct extent_changeset *data_reserved = NULL; size_t num_written = 0; ssize_t ret; loff_t old_isize; unsigned int ilock_flags = 0; const bool nowait = (iocb->ki_flags & IOCB_NOWAIT); if (nowait) ilock_flags |= BTRFS_ILOCK_TRY; ret = btrfs_inode_lock(BTRFS_I(inode), ilock_flags); if (ret < 0) return ret; /* * We can only trust the isize with inode lock held, or it can race with * other buffered writes and cause incorrect call of * pagecache_isize_extended() to overwrite existing data. */ old_isize = i_size_read(inode); ret = generic_write_checks(iocb, iter); if (ret <= 0) goto out; ret = btrfs_write_check(iocb, ret); if (ret < 0) goto out; pos = iocb->ki_pos; while (iov_iter_count(iter) > 0) { ret = copy_one_range(BTRFS_I(inode), iter, &data_reserved, pos, nowait); if (ret < 0) break; pos += ret; num_written += ret; cond_resched(); } extent_changeset_free(data_reserved); if (num_written > 0) { pagecache_isize_extended(inode, old_isize, iocb->ki_pos); iocb->ki_pos += num_written; } out: btrfs_inode_unlock(BTRFS_I(inode), ilock_flags); return num_written ? num_written : ret; } static ssize_t btrfs_encoded_write(struct kiocb *iocb, struct iov_iter *from, const struct btrfs_ioctl_encoded_io_args *encoded) { struct file *file = iocb->ki_filp; struct inode *inode = file_inode(file); loff_t count; ssize_t ret; btrfs_inode_lock(BTRFS_I(inode), 0); count = encoded->len; ret = generic_write_checks_count(iocb, &count); if (ret == 0 && count != encoded->len) { /* * The write got truncated by generic_write_checks_count(). We * can't do a partial encoded write. */ ret = -EFBIG; } if (ret || encoded->len == 0) goto out; ret = btrfs_write_check(iocb, encoded->len); if (ret < 0) goto out; ret = btrfs_do_encoded_write(iocb, from, encoded); out: btrfs_inode_unlock(BTRFS_I(inode), 0); return ret; } ssize_t btrfs_do_write_iter(struct kiocb *iocb, struct iov_iter *from, const struct btrfs_ioctl_encoded_io_args *encoded) { struct file *file = iocb->ki_filp; struct btrfs_inode *inode = BTRFS_I(file_inode(file)); ssize_t num_written, num_sync; /* * If the fs flips readonly due to some impossible error, although we * have opened a file as writable, we have to stop this write operation * to ensure consistency. */ if (BTRFS_FS_ERROR(inode->root->fs_info)) return -EROFS; if (encoded && (iocb->ki_flags & IOCB_NOWAIT)) return -EOPNOTSUPP; if (encoded) { num_written = btrfs_encoded_write(iocb, from, encoded); num_sync = encoded->len; } else if (iocb->ki_flags & IOCB_DIRECT) { num_written = btrfs_direct_write(iocb, from); num_sync = num_written; } else { num_written = btrfs_buffered_write(iocb, from); num_sync = num_written; } btrfs_set_inode_last_sub_trans(inode); if (num_sync > 0) { num_sync = generic_write_sync(iocb, num_sync); if (num_sync < 0) num_written = num_sync; } return num_written; } static ssize_t btrfs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) { return btrfs_do_write_iter(iocb, from, NULL); } int btrfs_release_file(struct inode *inode, struct file *filp) { struct btrfs_file_private *private = filp->private_data; if (private) { kfree(private->filldir_buf); btrfs_free_extent_state(private->llseek_cached_state); kfree(private); filp->private_data = NULL; } /* * Set by setattr when we are about to truncate a file from a non-zero * size to a zero size. This tries to flush down new bytes that may * have been written if the application were using truncate to replace * a file in place. */ if (test_and_clear_bit(BTRFS_INODE_FLUSH_ON_CLOSE, &BTRFS_I(inode)->runtime_flags)) filemap_flush(inode->i_mapping); return 0; } static int start_ordered_ops(struct btrfs_inode *inode, loff_t start, loff_t end) { int ret; struct blk_plug plug; /* * This is only called in fsync, which would do synchronous writes, so * a plug can merge adjacent IOs as much as possible. Esp. in case of * multiple disks using raid profile, a large IO can be split to * several segments of stripe length (currently 64K). */ blk_start_plug(&plug); ret = btrfs_fdatawrite_range(inode, start, end); blk_finish_plug(&plug); return ret; } static inline bool skip_inode_logging(const struct btrfs_log_ctx *ctx) { struct btrfs_inode *inode = ctx->inode; struct btrfs_fs_info *fs_info = inode->root->fs_info; if (btrfs_inode_in_log(inode, btrfs_get_fs_generation(fs_info)) && list_empty(&ctx->ordered_extents)) return true; /* * If we are doing a fast fsync we can not bail out if the inode's * last_trans is <= then the last committed transaction, because we only * update the last_trans of the inode during ordered extent completion, * and for a fast fsync we don't wait for that, we only wait for the * writeback to complete. */ if (inode->last_trans <= btrfs_get_last_trans_committed(fs_info) && (test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags) || list_empty(&ctx->ordered_extents))) return true; return false; } /* * fsync call for both files and directories. This logs the inode into * the tree log instead of forcing full commits whenever possible. * * It needs to call filemap_fdatawait so that all ordered extent updates are * in the metadata btree are up to date for copying to the log. * * It drops the inode mutex before doing the tree log commit. This is an * important optimization for directories because holding the mutex prevents * new operations on the dir while we write to disk. */ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) { struct dentry *dentry = file_dentry(file); struct btrfs_inode *inode = BTRFS_I(d_inode(dentry)); struct btrfs_root *root = inode->root; struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_trans_handle *trans; struct btrfs_log_ctx ctx; int ret = 0, err; u64 len; bool full_sync; bool skip_ilock = false; if (current->journal_info == BTRFS_TRANS_DIO_WRITE_STUB) { skip_ilock = true; current->journal_info = NULL; btrfs_assert_inode_locked(inode); } trace_btrfs_sync_file(file, datasync); btrfs_init_log_ctx(&ctx, inode); /* * Always set the range to a full range, otherwise we can get into * several problems, from missing file extent items to represent holes * when not using the NO_HOLES feature, to log tree corruption due to * races between hole detection during logging and completion of ordered * extents outside the range, to missing checksums due to ordered extents * for which we flushed only a subset of their pages. */ start = 0; end = LLONG_MAX; len = (u64)LLONG_MAX + 1; /* * We write the dirty pages in the range and wait until they complete * out of the ->i_mutex. If so, we can flush the dirty pages by * multi-task, and make the performance up. See * btrfs_wait_ordered_range for an explanation of the ASYNC check. */ ret = start_ordered_ops(inode, start, end); if (ret) goto out; if (skip_ilock) down_write(&inode->i_mmap_lock); else btrfs_inode_lock(inode, BTRFS_ILOCK_MMAP); atomic_inc(&root->log_batch); /* * Before we acquired the inode's lock and the mmap lock, someone may * have dirtied more pages in the target range. We need to make sure * that writeback for any such pages does not start while we are logging * the inode, because if it does, any of the following might happen when * we are not doing a full inode sync: * * 1) We log an extent after its writeback finishes but before its * checksums are added to the csum tree, leading to -EIO errors * when attempting to read the extent after a log replay. * * 2) We can end up logging an extent before its writeback finishes. * Therefore after the log replay we will have a file extent item * pointing to an unwritten extent (and no data checksums as well). * * So trigger writeback for any eventual new dirty pages and then we * wait for all ordered extents to complete below. */ ret = start_ordered_ops(inode, start, end); if (ret) { if (skip_ilock) up_write(&inode->i_mmap_lock); else btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP); goto out; } /* * Always check for the full sync flag while holding the inode's lock, * to avoid races with other tasks. The flag must be either set all the * time during logging or always off all the time while logging. * We check the flag here after starting delalloc above, because when * running delalloc the full sync flag may be set if we need to drop * extra extent map ranges due to temporary memory allocation failures. */ full_sync = test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags); /* * We have to do this here to avoid the priority inversion of waiting on * IO of a lower priority task while holding a transaction open. * * For a full fsync we wait for the ordered extents to complete while * for a fast fsync we wait just for writeback to complete, and then * attach the ordered extents to the transaction so that a transaction * commit waits for their completion, to avoid data loss if we fsync, * the current transaction commits before the ordered extents complete * and a power failure happens right after that. * * For zoned filesystem, if a write IO uses a ZONE_APPEND command, the * logical address recorded in the ordered extent may change. We need * to wait for the IO to stabilize the logical address. */ if (full_sync || btrfs_is_zoned(fs_info)) { ret = btrfs_wait_ordered_range(inode, start, len); clear_bit(BTRFS_INODE_COW_WRITE_ERROR, &inode->runtime_flags); } else { /* * Get our ordered extents as soon as possible to avoid doing * checksum lookups in the csum tree, and use instead the * checksums attached to the ordered extents. */ btrfs_get_ordered_extents_for_logging(inode, &ctx.ordered_extents); ret = filemap_fdatawait_range(inode->vfs_inode.i_mapping, start, end); if (ret) goto out_release_extents; /* * Check and clear the BTRFS_INODE_COW_WRITE_ERROR now after * starting and waiting for writeback, because for buffered IO * it may have been set during the end IO callback * (end_bbio_data_write() -> btrfs_finish_ordered_extent()) in * case an error happened and we need to wait for ordered * extents to complete so that any extent maps that point to * unwritten locations are dropped and we don't log them. */ if (test_and_clear_bit(BTRFS_INODE_COW_WRITE_ERROR, &inode->runtime_flags)) ret = btrfs_wait_ordered_range(inode, start, len); } if (ret) goto out_release_extents; atomic_inc(&root->log_batch); if (skip_inode_logging(&ctx)) { /* * We've had everything committed since the last time we were * modified so clear this flag in case it was set for whatever * reason, it's no longer relevant. */ clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags); /* * An ordered extent might have started before and completed * already with io errors, in which case the inode was not * updated and we end up here. So check the inode's mapping * for any errors that might have happened since we last * checked called fsync. */ ret = filemap_check_wb_err(inode->vfs_inode.i_mapping, file->f_wb_err); goto out_release_extents; } btrfs_init_log_ctx_scratch_eb(&ctx); /* * We use start here because we will need to wait on the IO to complete * in btrfs_sync_log, which could require joining a transaction (for * example checking cross references in the nocow path). If we use join * here we could get into a situation where we're waiting on IO to * happen that is blocked on a transaction trying to commit. With start * we inc the extwriter counter, so we wait for all extwriters to exit * before we start blocking joiners. This comment is to keep somebody * from thinking they are super smart and changing this to * btrfs_join_transaction *cough*Josef*cough*. */ trans = btrfs_start_transaction(root, 0); if (IS_ERR(trans)) { ret = PTR_ERR(trans); goto out_release_extents; } trans->in_fsync = true; ret = btrfs_log_dentry_safe(trans, dentry, &ctx); /* * Scratch eb no longer needed, release before syncing log or commit * transaction, to avoid holding unnecessary memory during such long * operations. */ if (ctx.scratch_eb) { free_extent_buffer(ctx.scratch_eb); ctx.scratch_eb = NULL; } btrfs_release_log_ctx_extents(&ctx); if (ret < 0) { /* Fallthrough and commit/free transaction. */ ret = BTRFS_LOG_FORCE_COMMIT; } /* we've logged all the items and now have a consistent * version of the file in the log. It is possible that * someone will come in and modify the file, but that's * fine because the log is consistent on disk, and we * have references to all of the file's extents * * It is possible that someone will come in and log the * file again, but that will end up using the synchronization * inside btrfs_sync_log to keep things safe. */ if (skip_ilock) up_write(&inode->i_mmap_lock); else btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP); if (ret == BTRFS_NO_LOG_SYNC) { ret = btrfs_end_transaction(trans); goto out; } /* We successfully logged the inode, attempt to sync the log. */ if (!ret) { ret = btrfs_sync_log(trans, root, &ctx); if (!ret) { ret = btrfs_end_transaction(trans); goto out; } } /* * At this point we need to commit the transaction because we had * btrfs_need_log_full_commit() or some other error. * * If we didn't do a full sync we have to stop the trans handle, wait on * the ordered extents, start it again and commit the transaction. If * we attempt to wait on the ordered extents here we could deadlock with * something like fallocate() that is holding the extent lock trying to * start a transaction while some other thread is trying to commit the * transaction while we (fsync) are currently holding the transaction * open. */ if (!full_sync) { ret = btrfs_end_transaction(trans); if (ret) goto out; ret = btrfs_wait_ordered_range(inode, start, len); if (ret) goto out; /* * This is safe to use here because we're only interested in * making sure the transaction that had the ordered extents is * committed. We aren't waiting on anything past this point, * we're purely getting the transaction and committing it. */ trans = btrfs_attach_transaction_barrier(root); if (IS_ERR(trans)) { ret = PTR_ERR(trans); /* * We committed the transaction and there's no currently * running transaction, this means everything we care * about made it to disk and we are done. */ if (ret == -ENOENT) ret = 0; goto out; } } ret = btrfs_commit_transaction(trans); out: free_extent_buffer(ctx.scratch_eb); ASSERT(list_empty(&ctx.list)); ASSERT(list_empty(&ctx.conflict_inodes)); err = file_check_and_advance_wb_err(file); if (!ret) ret = err; return ret > 0 ? -EIO : ret; out_release_extents: btrfs_release_log_ctx_extents(&ctx); if (skip_ilock) up_write(&inode->i_mmap_lock); else btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP); goto out; } /* * btrfs_page_mkwrite() is not allowed to change the file size as it gets * called from a page fault handler when a page is first dirtied. Hence we must * be careful to check for EOF conditions here. We set the page up correctly * for a written page which means we get ENOSPC checking when writing into * holes and correct delalloc and unwritten extent mapping on filesystems that * support these features. * * We are not allowed to take the i_mutex here so we have to play games to * protect against truncate races as the page could now be beyond EOF. Because * truncate_setsize() writes the inode size before removing pages, once we have * the page lock we can determine safely if the page is beyond EOF. If it is not * beyond EOF, then the page is guaranteed safe against truncation until we * unlock the page. */ static vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf) { struct page *page = vmf->page; struct folio *folio = page_folio(page); struct btrfs_inode *inode = BTRFS_I(file_inode(vmf->vma->vm_file)); struct btrfs_fs_info *fs_info = inode->root->fs_info; struct extent_io_tree *io_tree = &inode->io_tree; struct btrfs_ordered_extent *ordered; struct extent_state *cached_state = NULL; struct extent_changeset *data_reserved = NULL; unsigned long zero_start; loff_t size; size_t fsize = folio_size(folio); int ret; bool only_release_metadata = false; u64 reserved_space; u64 page_start; u64 page_end; u64 end; reserved_space = fsize; sb_start_pagefault(inode->vfs_inode.i_sb); page_start = folio_pos(folio); page_end = page_start + folio_size(folio) - 1; end = page_end; /* * Reserving delalloc space after obtaining the page lock can lead to * deadlock. For example, if a dirty page is locked by this function * and the call to btrfs_delalloc_reserve_space() ends up triggering * dirty page write out, then the btrfs_writepages() function could * end up waiting indefinitely to get a lock on the page currently * being processed by btrfs_page_mkwrite() function. */ ret = btrfs_check_data_free_space(inode, &data_reserved, page_start, reserved_space, false); if (ret < 0) { size_t write_bytes = reserved_space; if (btrfs_check_nocow_lock(inode, page_start, &write_bytes, false) <= 0) goto out_noreserve; only_release_metadata = true; /* * Can't write the whole range, there may be shared extents or * holes in the range, bail out with @only_release_metadata set * to true so that we unlock the nocow lock before returning the * error. */ if (write_bytes < reserved_space) goto out_noreserve; } ret = btrfs_delalloc_reserve_metadata(inode, reserved_space, reserved_space, false); if (ret < 0) { if (!only_release_metadata) btrfs_free_reserved_data_space(inode, data_reserved, page_start, reserved_space); goto out_noreserve; } ret = file_update_time(vmf->vma->vm_file); if (ret < 0) goto out; again: down_read(&inode->i_mmap_lock); folio_lock(folio); size = i_size_read(&inode->vfs_inode); if ((folio->mapping != inode->vfs_inode.i_mapping) || (page_start >= size)) { /* Page got truncated out from underneath us. */ goto out_unlock; } folio_wait_writeback(folio); btrfs_lock_extent(io_tree, page_start, page_end, &cached_state); ret = set_folio_extent_mapped(folio); if (ret < 0) { btrfs_unlock_extent(io_tree, page_start, page_end, &cached_state); goto out_unlock; } /* * We can't set the delalloc bits if there are pending ordered * extents. Drop our locks and wait for them to finish. */ ordered = btrfs_lookup_ordered_range(inode, page_start, fsize); if (ordered) { btrfs_unlock_extent(io_tree, page_start, page_end, &cached_state); folio_unlock(folio); up_read(&inode->i_mmap_lock); btrfs_start_ordered_extent(ordered); btrfs_put_ordered_extent(ordered); goto again; } if (folio_contains(folio, (size - 1) >> PAGE_SHIFT)) { reserved_space = round_up(size - page_start, fs_info->sectorsize); if (reserved_space < fsize) { const u64 to_free = fsize - reserved_space; end = page_start + reserved_space - 1; if (only_release_metadata) btrfs_delalloc_release_metadata(inode, to_free, true); else btrfs_delalloc_release_space(inode, data_reserved, end + 1, to_free, true); } } /* * page_mkwrite gets called when the page is firstly dirtied after it's * faulted in, but write(2) could also dirty a page and set delalloc * bits, thus in this case for space account reason, we still need to * clear any delalloc bits within this page range since we have to * reserve data&meta space before lock_page() (see above comments). */ btrfs_clear_extent_bit(io_tree, page_start, end, EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, &cached_state); ret = btrfs_set_extent_delalloc(inode, page_start, end, 0, &cached_state); if (ret < 0) { btrfs_unlock_extent(io_tree, page_start, page_end, &cached_state); goto out_unlock; } /* Page is wholly or partially inside EOF. */ if (page_start + folio_size(folio) > size) zero_start = offset_in_folio(folio, size); else zero_start = fsize; if (zero_start != fsize) folio_zero_range(folio, zero_start, folio_size(folio) - zero_start); btrfs_folio_clear_checked(fs_info, folio, page_start, fsize); btrfs_folio_set_dirty(fs_info, folio, page_start, end + 1 - page_start); btrfs_folio_set_uptodate(fs_info, folio, page_start, end + 1 - page_start); btrfs_set_inode_last_sub_trans(inode); if (only_release_metadata) btrfs_set_extent_bit(io_tree, page_start, end, EXTENT_NORESERVE, &cached_state); btrfs_unlock_extent(io_tree, page_start, page_end, &cached_state); up_read(&inode->i_mmap_lock); btrfs_delalloc_release_extents(inode, fsize); if (only_release_metadata) btrfs_check_nocow_unlock(inode); sb_end_pagefault(inode->vfs_inode.i_sb); extent_changeset_free(data_reserved); return VM_FAULT_LOCKED; out_unlock: folio_unlock(folio); up_read(&inode->i_mmap_lock); out: btrfs_delalloc_release_extents(inode, fsize); if (only_release_metadata) btrfs_delalloc_release_metadata(inode, reserved_space, true); else btrfs_delalloc_release_space(inode, data_reserved, page_start, reserved_space, true); extent_changeset_free(data_reserved); out_noreserve: if (only_release_metadata) btrfs_check_nocow_unlock(inode); sb_end_pagefault(inode->vfs_inode.i_sb); if (ret < 0) return vmf_error(ret); /* Make the VM retry the fault. */ return VM_FAULT_NOPAGE; } static const struct vm_operations_struct btrfs_file_vm_ops = { .fault = filemap_fault, .map_pages = filemap_map_pages, .page_mkwrite = btrfs_page_mkwrite, }; static int btrfs_file_mmap_prepare(struct vm_area_desc *desc) { struct file *filp = desc->file; struct address_space *mapping = filp->f_mapping; if (!mapping->a_ops->read_folio) return -ENOEXEC; file_accessed(filp); desc->vm_ops = &btrfs_file_vm_ops; return 0; } static bool hole_mergeable(struct btrfs_inode *inode, struct extent_buffer *leaf, int slot, u64 start, u64 end) { struct btrfs_file_extent_item *fi; struct btrfs_key key; if (slot < 0 || slot >= btrfs_header_nritems(leaf)) return false; btrfs_item_key_to_cpu(leaf, &key, slot); if (key.objectid != btrfs_ino(inode) || key.type != BTRFS_EXTENT_DATA_KEY) return false; fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item); if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_REG) return false; if (btrfs_file_extent_disk_bytenr(leaf, fi)) return false; if (key.offset == end) return true; if (key.offset + btrfs_file_extent_num_bytes(leaf, fi) == start) return true; return false; } static int fill_holes(struct btrfs_trans_handle *trans, struct btrfs_inode *inode, struct btrfs_path *path, u64 offset, u64 end) { struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_root *root = inode->root; struct extent_buffer *leaf; struct btrfs_file_extent_item *fi; struct extent_map *hole_em; struct btrfs_key key; int ret; if (btrfs_fs_incompat(fs_info, NO_HOLES)) goto out; key.objectid = btrfs_ino(inode); key.type = BTRFS_EXTENT_DATA_KEY; key.offset = offset; ret = btrfs_search_slot(trans, root, &key, path, 0, 1); if (ret <= 0) { /* * We should have dropped this offset, so if we find it then * something has gone horribly wrong. */ if (ret == 0) ret = -EINVAL; return ret; } leaf = path->nodes[0]; if (hole_mergeable(inode, leaf, path->slots[0] - 1, offset, end)) { u64 num_bytes; path->slots[0]--; fi = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); num_bytes = btrfs_file_extent_num_bytes(leaf, fi) + end - offset; btrfs_set_file_extent_num_bytes(leaf, fi, num_bytes); btrfs_set_file_extent_ram_bytes(leaf, fi, num_bytes); btrfs_set_file_extent_offset(leaf, fi, 0); btrfs_set_file_extent_generation(leaf, fi, trans->transid); goto out; } if (hole_mergeable(inode, leaf, path->slots[0], offset, end)) { u64 num_bytes; key.offset = offset; btrfs_set_item_key_safe(trans, path, &key); fi = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); num_bytes = btrfs_file_extent_num_bytes(leaf, fi) + end - offset; btrfs_set_file_extent_num_bytes(leaf, fi, num_bytes); btrfs_set_file_extent_ram_bytes(leaf, fi, num_bytes); btrfs_set_file_extent_offset(leaf, fi, 0); btrfs_set_file_extent_generation(leaf, fi, trans->transid); goto out; } btrfs_release_path(path); ret = btrfs_insert_hole_extent(trans, root, btrfs_ino(inode), offset, end - offset); if (ret) return ret; out: btrfs_release_path(path); hole_em = btrfs_alloc_extent_map(); if (!hole_em) { btrfs_drop_extent_map_range(inode, offset, end - 1, false); btrfs_set_inode_full_sync(inode); } else { hole_em->start = offset; hole_em->len = end - offset; hole_em->ram_bytes = hole_em->len; hole_em->disk_bytenr = EXTENT_MAP_HOLE; hole_em->disk_num_bytes = 0; hole_em->generation = trans->transid; ret = btrfs_replace_extent_map_range(inode, hole_em, true); btrfs_free_extent_map(hole_em); if (ret) btrfs_set_inode_full_sync(inode); } return 0; } /* * Find a hole extent on given inode and change start/len to the end of hole * extent.(hole/vacuum extent whose em->start <= start && * em->start + em->len > start) * When a hole extent is found, return 1 and modify start/len. */ static int find_first_non_hole(struct btrfs_inode *inode, u64 *start, u64 *len) { struct btrfs_fs_info *fs_info = inode->root->fs_info; struct extent_map *em; int ret = 0; em = btrfs_get_extent(inode, NULL, round_down(*start, fs_info->sectorsize), round_up(*len, fs_info->sectorsize)); if (IS_ERR(em)) return PTR_ERR(em); /* Hole or vacuum extent(only exists in no-hole mode) */ if (em->disk_bytenr == EXTENT_MAP_HOLE) { ret = 1; *len = em->start + em->len > *start + *len ? 0 : *start + *len - em->start - em->len; *start = em->start + em->len; } btrfs_free_extent_map(em); return ret; } /* * Check if there is no folio in the range. * * We cannot utilize filemap_range_has_page() in a filemap with large folios * as we can hit the following false positive: * * start end * | | * |//|//|//|//| | | | | | | | |//|//| * \ / \ / * Folio A Folio B * * That large folio A and B cover the start and end indexes. * In that case filemap_range_has_page() will always return true, but the above * case is fine for btrfs_punch_hole_lock_range() usage. * * So here we only ensure that no other folios is in the range, excluding the * head/tail large folio. */ static bool check_range_has_page(struct inode *inode, u64 start, u64 end) { struct folio_batch fbatch; bool ret = false; /* * For subpage case, if the range is not at page boundary, we could * have pages at the leading/tailing part of the range. * This could lead to dead loop since filemap_range_has_page() * will always return true. * So here we need to do extra page alignment for * filemap_range_has_page(). * * And do not decrease page_lockend right now, as it can be 0. */ const u64 page_lockstart = round_up(start, PAGE_SIZE); const u64 page_lockend = round_down(end + 1, PAGE_SIZE); const pgoff_t start_index = page_lockstart >> PAGE_SHIFT; const pgoff_t end_index = (page_lockend - 1) >> PAGE_SHIFT; pgoff_t tmp = start_index; int found_folios; /* The same page or adjacent pages. */ if (page_lockend <= page_lockstart) return false; folio_batch_init(&fbatch); found_folios = filemap_get_folios(inode->i_mapping, &tmp, end_index, &fbatch); for (int i = 0; i < found_folios; i++) { struct folio *folio = fbatch.folios[i]; /* A large folio begins before the start. Not a target. */ if (folio->index < start_index) continue; /* A large folio extends beyond the end. Not a target. */ if (folio_next_index(folio) > end_index) continue; /* A folio doesn't cover the head/tail index. Found a target. */ ret = true; break; } folio_batch_release(&fbatch); return ret; } static void btrfs_punch_hole_lock_range(struct inode *inode, const u64 lockstart, const u64 lockend, struct extent_state **cached_state) { while (1) { truncate_pagecache_range(inode, lockstart, lockend); btrfs_lock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend, cached_state); /* * We can't have ordered extents in the range, nor dirty/writeback * pages, because we have locked the inode's VFS lock in exclusive * mode, we have locked the inode's i_mmap_lock in exclusive mode, * we have flushed all delalloc in the range and we have waited * for any ordered extents in the range to complete. * We can race with anyone reading pages from this range, so after * locking the range check if we have pages in the range, and if * we do, unlock the range and retry. */ if (!check_range_has_page(inode, lockstart, lockend)) break; btrfs_unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend, cached_state); } btrfs_assert_inode_range_clean(BTRFS_I(inode), lockstart, lockend); } static int btrfs_insert_replace_extent(struct btrfs_trans_handle *trans, struct btrfs_inode *inode, struct btrfs_path *path, struct btrfs_replace_extent_info *extent_info, const u64 replace_len, const u64 bytes_to_drop) { struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_root *root = inode->root; struct btrfs_file_extent_item *extent; struct extent_buffer *leaf; struct btrfs_key key; int slot; int ret; if (replace_len == 0) return 0; if (extent_info->disk_offset == 0 && btrfs_fs_incompat(fs_info, NO_HOLES)) { btrfs_update_inode_bytes(inode, 0, bytes_to_drop); return 0; } key.objectid = btrfs_ino(inode); key.type = BTRFS_EXTENT_DATA_KEY; key.offset = extent_info->file_offset; ret = btrfs_insert_empty_item(trans, root, path, &key, sizeof(struct btrfs_file_extent_item)); if (ret) return ret; leaf = path->nodes[0]; slot = path->slots[0]; write_extent_buffer(leaf, extent_info->extent_buf, btrfs_item_ptr_offset(leaf, slot), sizeof(struct btrfs_file_extent_item)); extent = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item); ASSERT(btrfs_file_extent_type(leaf, extent) != BTRFS_FILE_EXTENT_INLINE); btrfs_set_file_extent_offset(leaf, extent, extent_info->data_offset); btrfs_set_file_extent_num_bytes(leaf, extent, replace_len); if (extent_info->is_new_extent) btrfs_set_file_extent_generation(leaf, extent, trans->transid); btrfs_release_path(path); ret = btrfs_inode_set_file_extent_range(inode, extent_info->file_offset, replace_len); if (ret) return ret; /* If it's a hole, nothing more needs to be done. */ if (extent_info->disk_offset == 0) { btrfs_update_inode_bytes(inode, 0, bytes_to_drop); return 0; } btrfs_update_inode_bytes(inode, replace_len, bytes_to_drop); if (extent_info->is_new_extent && extent_info->insertions == 0) { key.objectid = extent_info->disk_offset; key.type = BTRFS_EXTENT_ITEM_KEY; key.offset = extent_info->disk_len; ret = btrfs_alloc_reserved_file_extent(trans, root, btrfs_ino(inode), extent_info->file_offset, extent_info->qgroup_reserved, &key); } else { struct btrfs_ref ref = { .action = BTRFS_ADD_DELAYED_REF, .bytenr = extent_info->disk_offset, .num_bytes = extent_info->disk_len, .owning_root = btrfs_root_id(root), .ref_root = btrfs_root_id(root), }; u64 ref_offset; ref_offset = extent_info->file_offset - extent_info->data_offset; btrfs_init_data_ref(&ref, btrfs_ino(inode), ref_offset, 0, false); ret = btrfs_inc_extent_ref(trans, &ref); } extent_info->insertions++; return ret; } /* * The respective range must have been previously locked, as well as the inode. * The end offset is inclusive (last byte of the range). * @extent_info is NULL for fallocate's hole punching and non-NULL when replacing * the file range with an extent. * When not punching a hole, we don't want to end up in a state where we dropped * extents without inserting a new one, so we must abort the transaction to avoid * a corruption. */ int btrfs_replace_file_extents(struct btrfs_inode *inode, struct btrfs_path *path, const u64 start, const u64 end, struct btrfs_replace_extent_info *extent_info, struct btrfs_trans_handle **trans_out) { struct btrfs_drop_extents_args drop_args = { 0 }; struct btrfs_root *root = inode->root; struct btrfs_fs_info *fs_info = root->fs_info; u64 min_size = btrfs_calc_insert_metadata_size(fs_info, 1); u64 ino_size = round_up(inode->vfs_inode.i_size, fs_info->sectorsize); struct btrfs_trans_handle *trans = NULL; struct btrfs_block_rsv rsv; unsigned int rsv_count; u64 cur_offset; u64 len = end - start; int ret = 0; if (end <= start) return -EINVAL; btrfs_init_metadata_block_rsv(fs_info, &rsv, BTRFS_BLOCK_RSV_TEMP); rsv.size = btrfs_calc_insert_metadata_size(fs_info, 1); rsv.failfast = true; /* * 1 - update the inode * 1 - removing the extents in the range * 1 - adding the hole extent if no_holes isn't set or if we are * replacing the range with a new extent */ if (!btrfs_fs_incompat(fs_info, NO_HOLES) || extent_info) rsv_count = 3; else rsv_count = 2; trans = btrfs_start_transaction(root, rsv_count); if (IS_ERR(trans)) { ret = PTR_ERR(trans); trans = NULL; goto out_release; } ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv, &rsv, min_size, false); if (WARN_ON(ret)) goto out_trans; trans->block_rsv = &rsv; cur_offset = start; drop_args.path = path; drop_args.end = end + 1; drop_args.drop_cache = true; while (cur_offset < end) { drop_args.start = cur_offset; ret = btrfs_drop_extents(trans, root, inode, &drop_args); /* If we are punching a hole decrement the inode's byte count */ if (!extent_info) btrfs_update_inode_bytes(inode, 0, drop_args.bytes_found); if (ret != -ENOSPC) { /* * The only time we don't want to abort is if we are * attempting to clone a partial inline extent, in which * case we'll get EOPNOTSUPP. However if we aren't * clone we need to abort no matter what, because if we * got EOPNOTSUPP via prealloc then we messed up and * need to abort. */ if (unlikely(ret && (ret != -EOPNOTSUPP || (extent_info && extent_info->is_new_extent)))) btrfs_abort_transaction(trans, ret); break; } trans->block_rsv = &fs_info->trans_block_rsv; if (!extent_info && cur_offset < drop_args.drop_end && cur_offset < ino_size) { ret = fill_holes(trans, inode, path, cur_offset, drop_args.drop_end); if (unlikely(ret)) { /* * If we failed then we didn't insert our hole * entries for the area we dropped, so now the * fs is corrupted, so we must abort the * transaction. */ btrfs_abort_transaction(trans, ret); break; } } else if (!extent_info && cur_offset < drop_args.drop_end) { /* * We are past the i_size here, but since we didn't * insert holes we need to clear the mapped area so we * know to not set disk_i_size in this area until a new * file extent is inserted here. */ ret = btrfs_inode_clear_file_extent_range(inode, cur_offset, drop_args.drop_end - cur_offset); if (unlikely(ret)) { /* * We couldn't clear our area, so we could * presumably adjust up and corrupt the fs, so * we need to abort. */ btrfs_abort_transaction(trans, ret); break; } } if (extent_info && drop_args.drop_end > extent_info->file_offset) { u64 replace_len = drop_args.drop_end - extent_info->file_offset; ret = btrfs_insert_replace_extent(trans, inode, path, extent_info, replace_len, drop_args.bytes_found); if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); break; } extent_info->data_len -= replace_len; extent_info->data_offset += replace_len; extent_info->file_offset += replace_len; } /* * We are releasing our handle on the transaction, balance the * dirty pages of the btree inode and flush delayed items, and * then get a new transaction handle, which may now point to a * new transaction in case someone else may have committed the * transaction we used to replace/drop file extent items. So * bump the inode's iversion and update mtime and ctime except * if we are called from a dedupe context. This is because a * power failure/crash may happen after the transaction is * committed and before we finish replacing/dropping all the * file extent items we need. */ inode_inc_iversion(&inode->vfs_inode); if (!extent_info || extent_info->update_times) inode_set_mtime_to_ts(&inode->vfs_inode, inode_set_ctime_current(&inode->vfs_inode)); ret = btrfs_update_inode(trans, inode); if (ret) break; btrfs_end_transaction(trans); btrfs_btree_balance_dirty(fs_info); trans = btrfs_start_transaction(root, rsv_count); if (IS_ERR(trans)) { ret = PTR_ERR(trans); trans = NULL; break; } ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv, &rsv, min_size, false); if (WARN_ON(ret)) break; trans->block_rsv = &rsv; cur_offset = drop_args.drop_end; len = end - cur_offset; if (!extent_info && len) { ret = find_first_non_hole(inode, &cur_offset, &len); if (unlikely(ret < 0)) break; if (ret && !len) { ret = 0; break; } } } /* * If we were cloning, force the next fsync to be a full one since we * we replaced (or just dropped in the case of cloning holes when * NO_HOLES is enabled) file extent items and did not setup new extent * maps for the replacement extents (or holes). */ if (extent_info && !extent_info->is_new_extent) btrfs_set_inode_full_sync(inode); if (ret) goto out_trans; trans->block_rsv = &fs_info->trans_block_rsv; /* * If we are using the NO_HOLES feature we might have had already an * hole that overlaps a part of the region [lockstart, lockend] and * ends at (or beyond) lockend. Since we have no file extent items to * represent holes, drop_end can be less than lockend and so we must * make sure we have an extent map representing the existing hole (the * call to __btrfs_drop_extents() might have dropped the existing extent * map representing the existing hole), otherwise the fast fsync path * will not record the existence of the hole region * [existing_hole_start, lockend]. */ if (drop_args.drop_end <= end) drop_args.drop_end = end + 1; /* * Don't insert file hole extent item if it's for a range beyond eof * (because it's useless) or if it represents a 0 bytes range (when * cur_offset == drop_end). */ if (!extent_info && cur_offset < ino_size && cur_offset < drop_args.drop_end) { ret = fill_holes(trans, inode, path, cur_offset, drop_args.drop_end); if (unlikely(ret)) { /* Same comment as above. */ btrfs_abort_transaction(trans, ret); goto out_trans; } } else if (!extent_info && cur_offset < drop_args.drop_end) { /* See the comment in the loop above for the reasoning here. */ ret = btrfs_inode_clear_file_extent_range(inode, cur_offset, drop_args.drop_end - cur_offset); if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out_trans; } } if (extent_info) { ret = btrfs_insert_replace_extent(trans, inode, path, extent_info, extent_info->data_len, drop_args.bytes_found); if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out_trans; } } out_trans: if (!trans) goto out_release; trans->block_rsv = &fs_info->trans_block_rsv; if (ret) btrfs_end_transaction(trans); else *trans_out = trans; out_release: btrfs_block_rsv_release(fs_info, &rsv, (u64)-1, NULL); return ret; } static int btrfs_punch_hole(struct file *file, loff_t offset, loff_t len) { struct inode *inode = file_inode(file); struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); struct btrfs_root *root = BTRFS_I(inode)->root; struct extent_state *cached_state = NULL; struct btrfs_path *path; struct btrfs_trans_handle *trans = NULL; u64 lockstart; u64 lockend; u64 tail_start; u64 tail_len; const u64 orig_start = offset; const u64 orig_end = offset + len - 1; int ret = 0; bool same_block; u64 ino_size; bool truncated_block = false; bool updated_inode = false; btrfs_inode_lock(BTRFS_I(inode), BTRFS_ILOCK_MMAP); ret = btrfs_wait_ordered_range(BTRFS_I(inode), offset, len); if (ret) goto out_only_mutex; ino_size = round_up(inode->i_size, fs_info->sectorsize); ret = find_first_non_hole(BTRFS_I(inode), &offset, &len); if (ret < 0) goto out_only_mutex; if (ret && !len) { /* Already in a large hole */ ret = 0; goto out_only_mutex; } ret = file_modified(file); if (ret) goto out_only_mutex; lockstart = round_up(offset, fs_info->sectorsize); lockend = round_down(offset + len, fs_info->sectorsize) - 1; same_block = (BTRFS_BYTES_TO_BLKS(fs_info, offset)) == (BTRFS_BYTES_TO_BLKS(fs_info, offset + len - 1)); /* * Only do this if we are in the same block and we aren't doing the * entire block. */ if (same_block && len < fs_info->sectorsize) { if (offset < ino_size) { truncated_block = true; ret = btrfs_truncate_block(BTRFS_I(inode), offset + len - 1, orig_start, orig_end); } else { ret = 0; } goto out_only_mutex; } /* zero back part of the first block */ if (offset < ino_size) { truncated_block = true; ret = btrfs_truncate_block(BTRFS_I(inode), offset, orig_start, orig_end); if (ret) { btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_MMAP); return ret; } } /* Check the aligned pages after the first unaligned page, * if offset != orig_start, which means the first unaligned page * including several following pages are already in holes, * the extra check can be skipped */ if (offset == orig_start) { /* after truncate page, check hole again */ len = offset + len - lockstart; offset = lockstart; ret = find_first_non_hole(BTRFS_I(inode), &offset, &len); if (ret < 0) goto out_only_mutex; if (ret && !len) { ret = 0; goto out_only_mutex; } lockstart = offset; } /* Check the tail unaligned part is in a hole */ tail_start = lockend + 1; tail_len = offset + len - tail_start; if (tail_len) { ret = find_first_non_hole(BTRFS_I(inode), &tail_start, &tail_len); if (unlikely(ret < 0)) goto out_only_mutex; if (!ret) { /* zero the front end of the last page */ if (tail_start + tail_len < ino_size) { truncated_block = true; ret = btrfs_truncate_block(BTRFS_I(inode), tail_start + tail_len - 1, orig_start, orig_end); if (ret) goto out_only_mutex; } } } if (lockend < lockstart) { ret = 0; goto out_only_mutex; } btrfs_punch_hole_lock_range(inode, lockstart, lockend, &cached_state); path = btrfs_alloc_path(); if (!path) { ret = -ENOMEM; goto out; } ret = btrfs_replace_file_extents(BTRFS_I(inode), path, lockstart, lockend, NULL, &trans); btrfs_free_path(path); if (ret) goto out; ASSERT(trans != NULL); inode_inc_iversion(inode); inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); ret = btrfs_update_inode(trans, BTRFS_I(inode)); updated_inode = true; btrfs_end_transaction(trans); btrfs_btree_balance_dirty(fs_info); out: btrfs_unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend, &cached_state); out_only_mutex: if (!updated_inode && truncated_block && !ret) { /* * If we only end up zeroing part of a page, we still need to * update the inode item, so that all the time fields are * updated as well as the necessary btrfs inode in memory fields * for detecting, at fsync time, if the inode isn't yet in the * log tree or it's there but not up to date. */ struct timespec64 now = inode_set_ctime_current(inode); inode_inc_iversion(inode); inode_set_mtime_to_ts(inode, now); trans = btrfs_start_transaction(root, 1); if (IS_ERR(trans)) { ret = PTR_ERR(trans); } else { int ret2; ret = btrfs_update_inode(trans, BTRFS_I(inode)); ret2 = btrfs_end_transaction(trans); if (!ret) ret = ret2; } } btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_MMAP); return ret; } /* Helper structure to record which range is already reserved */ struct falloc_range { struct list_head list; u64 start; u64 len; }; /* * Helper function to add falloc range * * Caller should have locked the larger range of extent containing * [start, len) */ static int add_falloc_range(struct list_head *head, u64 start, u64 len) { struct falloc_range *range = NULL; if (!list_empty(head)) { /* * As fallocate iterates by bytenr order, we only need to check * the last range. */ range = list_last_entry(head, struct falloc_range, list); if (range->start + range->len == start) { range->len += len; return 0; } } range = kmalloc(sizeof(*range), GFP_KERNEL); if (!range) return -ENOMEM; range->start = start; range->len = len; list_add_tail(&range->list, head); return 0; } static int btrfs_fallocate_update_isize(struct inode *inode, const u64 end, const int mode) { struct btrfs_trans_handle *trans; struct btrfs_root *root = BTRFS_I(inode)->root; int ret; int ret2; if (mode & FALLOC_FL_KEEP_SIZE || end <= i_size_read(inode)) return 0; trans = btrfs_start_transaction(root, 1); if (IS_ERR(trans)) return PTR_ERR(trans); inode_set_ctime_current(inode); i_size_write(inode, end); btrfs_inode_safe_disk_i_size_write(BTRFS_I(inode), 0); ret = btrfs_update_inode(trans, BTRFS_I(inode)); ret2 = btrfs_end_transaction(trans); return ret ? ret : ret2; } enum { RANGE_BOUNDARY_WRITTEN_EXTENT, RANGE_BOUNDARY_PREALLOC_EXTENT, RANGE_BOUNDARY_HOLE, }; static int btrfs_zero_range_check_range_boundary(struct btrfs_inode *inode, u64 offset) { const u64 sectorsize = inode->root->fs_info->sectorsize; struct extent_map *em; int ret; offset = round_down(offset, sectorsize); em = btrfs_get_extent(inode, NULL, offset, sectorsize); if (IS_ERR(em)) return PTR_ERR(em); if (em->disk_bytenr == EXTENT_MAP_HOLE) ret = RANGE_BOUNDARY_HOLE; else if (em->flags & EXTENT_FLAG_PREALLOC) ret = RANGE_BOUNDARY_PREALLOC_EXTENT; else ret = RANGE_BOUNDARY_WRITTEN_EXTENT; btrfs_free_extent_map(em); return ret; } static int btrfs_zero_range(struct inode *inode, loff_t offset, loff_t len, const int mode) { struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info; struct extent_map *em; struct extent_changeset *data_reserved = NULL; int ret; u64 alloc_hint = 0; const u64 sectorsize = fs_info->sectorsize; const u64 orig_start = offset; const u64 orig_end = offset + len - 1; u64 alloc_start = round_down(offset, sectorsize); u64 alloc_end = round_up(offset + len, sectorsize); u64 bytes_to_reserve = 0; bool space_reserved = false; em = btrfs_get_extent(BTRFS_I(inode), NULL, alloc_start, alloc_end - alloc_start); if (IS_ERR(em)) { ret = PTR_ERR(em); goto out; } /* * Avoid hole punching and extent allocation for some cases. More cases * could be considered, but these are unlikely common and we keep things * as simple as possible for now. Also, intentionally, if the target * range contains one or more prealloc extents together with regular * extents and holes, we drop all the existing extents and allocate a * new prealloc extent, so that we get a larger contiguous disk extent. */ if (em->start <= alloc_start && (em->flags & EXTENT_FLAG_PREALLOC)) { const u64 em_end = em->start + em->len; if (em_end >= offset + len) { /* * The whole range is already a prealloc extent, * do nothing except updating the inode's i_size if * needed. */ btrfs_free_extent_map(em); ret = btrfs_fallocate_update_isize(inode, offset + len, mode); goto out; } /* * Part of the range is already a prealloc extent, so operate * only on the remaining part of the range. */ alloc_start = em_end; ASSERT(IS_ALIGNED(alloc_start, sectorsize)); len = offset + len - alloc_start; offset = alloc_start; alloc_hint = btrfs_extent_map_block_start(em) + em->len; } btrfs_free_extent_map(em); if (BTRFS_BYTES_TO_BLKS(fs_info, offset) == BTRFS_BYTES_TO_BLKS(fs_info, offset + len - 1)) { em = btrfs_get_extent(BTRFS_I(inode), NULL, alloc_start, sectorsize); if (IS_ERR(em)) { ret = PTR_ERR(em); goto out; } if (em->flags & EXTENT_FLAG_PREALLOC) { btrfs_free_extent_map(em); ret = btrfs_fallocate_update_isize(inode, offset + len, mode); goto out; } if (len < sectorsize && em->disk_bytenr != EXTENT_MAP_HOLE) { btrfs_free_extent_map(em); ret = btrfs_truncate_block(BTRFS_I(inode), offset + len - 1, orig_start, orig_end); if (!ret) ret = btrfs_fallocate_update_isize(inode, offset + len, mode); return ret; } btrfs_free_extent_map(em); alloc_start = round_down(offset, sectorsize); alloc_end = alloc_start + sectorsize; goto reserve_space; } alloc_start = round_up(offset, sectorsize); alloc_end = round_down(offset + len, sectorsize); /* * For unaligned ranges, check the pages at the boundaries, they might * map to an extent, in which case we need to partially zero them, or * they might map to a hole, in which case we need our allocation range * to cover them. */ if (!IS_ALIGNED(offset, sectorsize)) { ret = btrfs_zero_range_check_range_boundary(BTRFS_I(inode), offset); if (ret < 0) goto out; if (ret == RANGE_BOUNDARY_HOLE) { alloc_start = round_down(offset, sectorsize); ret = 0; } else if (ret == RANGE_BOUNDARY_WRITTEN_EXTENT) { ret = btrfs_truncate_block(BTRFS_I(inode), offset, orig_start, orig_end); if (ret) goto out; } else { ret = 0; } } if (!IS_ALIGNED(offset + len, sectorsize)) { ret = btrfs_zero_range_check_range_boundary(BTRFS_I(inode), offset + len); if (ret < 0) goto out; if (ret == RANGE_BOUNDARY_HOLE) { alloc_end = round_up(offset + len, sectorsize); ret = 0; } else if (ret == RANGE_BOUNDARY_WRITTEN_EXTENT) { ret = btrfs_truncate_block(BTRFS_I(inode), offset + len - 1, orig_start, orig_end); if (ret) goto out; } else { ret = 0; } } reserve_space: if (alloc_start < alloc_end) { struct extent_state *cached_state = NULL; const u64 lockstart = alloc_start; const u64 lockend = alloc_end - 1; bytes_to_reserve = alloc_end - alloc_start; ret = btrfs_alloc_data_chunk_ondemand(BTRFS_I(inode), bytes_to_reserve); if (ret < 0) goto out; space_reserved = true; btrfs_punch_hole_lock_range(inode, lockstart, lockend, &cached_state); ret = btrfs_qgroup_reserve_data(BTRFS_I(inode), &data_reserved, alloc_start, bytes_to_reserve); if (ret) { btrfs_unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend, &cached_state); goto out; } ret = btrfs_prealloc_file_range(inode, mode, alloc_start, alloc_end - alloc_start, fs_info->sectorsize, offset + len, &alloc_hint); btrfs_unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend, &cached_state); /* btrfs_prealloc_file_range releases reserved space on error */ if (ret) { space_reserved = false; goto out; } } ret = btrfs_fallocate_update_isize(inode, offset + len, mode); out: if (ret && space_reserved) btrfs_free_reserved_data_space(BTRFS_I(inode), data_reserved, alloc_start, bytes_to_reserve); extent_changeset_free(data_reserved); return ret; } static long btrfs_fallocate(struct file *file, int mode, loff_t offset, loff_t len) { struct inode *inode = file_inode(file); struct extent_state *cached_state = NULL; struct extent_changeset *data_reserved = NULL; struct falloc_range *range; struct falloc_range *tmp; LIST_HEAD(reserve_list); u64 cur_offset; u64 last_byte; u64 alloc_start; u64 alloc_end; u64 alloc_hint = 0; u64 locked_end; u64 actual_end = 0; u64 data_space_needed = 0; u64 data_space_reserved = 0; u64 qgroup_reserved = 0; struct extent_map *em; int blocksize = BTRFS_I(inode)->root->fs_info->sectorsize; int ret; /* Do not allow fallocate in ZONED mode */ if (btrfs_is_zoned(inode_to_fs_info(inode))) return -EOPNOTSUPP; alloc_start = round_down(offset, blocksize); alloc_end = round_up(offset + len, blocksize); cur_offset = alloc_start; /* Make sure we aren't being give some crap mode */ if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE | FALLOC_FL_ZERO_RANGE)) return -EOPNOTSUPP; if (mode & FALLOC_FL_PUNCH_HOLE) return btrfs_punch_hole(file, offset, len); btrfs_inode_lock(BTRFS_I(inode), BTRFS_ILOCK_MMAP); if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > inode->i_size) { ret = inode_newsize_ok(inode, offset + len); if (ret) goto out; } ret = file_modified(file); if (ret) goto out; /* * TODO: Move these two operations after we have checked * accurate reserved space, or fallocate can still fail but * with page truncated or size expanded. * * But that's a minor problem and won't do much harm BTW. */ if (alloc_start > inode->i_size) { ret = btrfs_cont_expand(BTRFS_I(inode), i_size_read(inode), alloc_start); if (ret) goto out; } else if (offset + len > inode->i_size) { /* * If we are fallocating from the end of the file onward we * need to zero out the end of the block if i_size lands in the * middle of a block. */ ret = btrfs_truncate_block(BTRFS_I(inode), inode->i_size, inode->i_size, (u64)-1); if (ret) goto out; } /* * We have locked the inode at the VFS level (in exclusive mode) and we * have locked the i_mmap_lock lock (in exclusive mode). Now before * locking the file range, flush all dealloc in the range and wait for * all ordered extents in the range to complete. After this we can lock * the file range and, due to the previous locking we did, we know there * can't be more delalloc or ordered extents in the range. */ ret = btrfs_wait_ordered_range(BTRFS_I(inode), alloc_start, alloc_end - alloc_start); if (ret) goto out; if (mode & FALLOC_FL_ZERO_RANGE) { ret = btrfs_zero_range(inode, offset, len, mode); btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_MMAP); return ret; } locked_end = alloc_end - 1; btrfs_lock_extent(&BTRFS_I(inode)->io_tree, alloc_start, locked_end, &cached_state); btrfs_assert_inode_range_clean(BTRFS_I(inode), alloc_start, locked_end); /* First, check if we exceed the qgroup limit */ while (cur_offset < alloc_end) { em = btrfs_get_extent(BTRFS_I(inode), NULL, cur_offset, alloc_end - cur_offset); if (IS_ERR(em)) { ret = PTR_ERR(em); break; } last_byte = min(btrfs_extent_map_end(em), alloc_end); actual_end = min_t(u64, btrfs_extent_map_end(em), offset + len); last_byte = ALIGN(last_byte, blocksize); if (em->disk_bytenr == EXTENT_MAP_HOLE || (cur_offset >= inode->i_size && !(em->flags & EXTENT_FLAG_PREALLOC))) { const u64 range_len = last_byte - cur_offset; ret = add_falloc_range(&reserve_list, cur_offset, range_len); if (ret < 0) { btrfs_free_extent_map(em); break; } ret = btrfs_qgroup_reserve_data(BTRFS_I(inode), &data_reserved, cur_offset, range_len); if (ret < 0) { btrfs_free_extent_map(em); break; } qgroup_reserved += range_len; data_space_needed += range_len; } btrfs_free_extent_map(em); cur_offset = last_byte; } if (!ret && data_space_needed > 0) { /* * We are safe to reserve space here as we can't have delalloc * in the range, see above. */ ret = btrfs_alloc_data_chunk_ondemand(BTRFS_I(inode), data_space_needed); if (!ret) data_space_reserved = data_space_needed; } /* * If ret is still 0, means we're OK to fallocate. * Or just cleanup the list and exit. */ list_for_each_entry_safe(range, tmp, &reserve_list, list) { if (!ret) { ret = btrfs_prealloc_file_range(inode, mode, range->start, range->len, blocksize, offset + len, &alloc_hint); /* * btrfs_prealloc_file_range() releases space even * if it returns an error. */ data_space_reserved -= range->len; qgroup_reserved -= range->len; } else if (data_space_reserved > 0) { btrfs_free_reserved_data_space(BTRFS_I(inode), data_reserved, range->start, range->len); data_space_reserved -= range->len; qgroup_reserved -= range->len; } else if (qgroup_reserved > 0) { btrfs_qgroup_free_data(BTRFS_I(inode), data_reserved, range->start, range->len, NULL); qgroup_reserved -= range->len; } list_del(&range->list); kfree(range); } if (ret < 0) goto out_unlock; /* * We didn't need to allocate any more space, but we still extended the * size of the file so we need to update i_size and the inode item. */ ret = btrfs_fallocate_update_isize(inode, actual_end, mode); out_unlock: btrfs_unlock_extent(&BTRFS_I(inode)->io_tree, alloc_start, locked_end, &cached_state); out: btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_MMAP); extent_changeset_free(data_reserved); return ret; } /* * Helper for btrfs_find_delalloc_in_range(). Find a subrange in a given range * that has unflushed and/or flushing delalloc. There might be other adjacent * subranges after the one it found, so btrfs_find_delalloc_in_range() keeps * looping while it gets adjacent subranges, and merging them together. */ static bool find_delalloc_subrange(struct btrfs_inode *inode, u64 start, u64 end, struct extent_state **cached_state, bool *search_io_tree, u64 *delalloc_start_ret, u64 *delalloc_end_ret) { u64 len = end + 1 - start; u64 delalloc_len = 0; struct btrfs_ordered_extent *oe; u64 oe_start; u64 oe_end; /* * Search the io tree first for EXTENT_DELALLOC. If we find any, it * means we have delalloc (dirty pages) for which writeback has not * started yet. */ if (*search_io_tree) { spin_lock(&inode->lock); if (inode->delalloc_bytes > 0) { spin_unlock(&inode->lock); *delalloc_start_ret = start; delalloc_len = btrfs_count_range_bits(&inode->io_tree, delalloc_start_ret, end, len, EXTENT_DELALLOC, 1, cached_state); } else { spin_unlock(&inode->lock); } } if (delalloc_len > 0) { /* * If delalloc was found then *delalloc_start_ret has a sector size * aligned value (rounded down). */ *delalloc_end_ret = *delalloc_start_ret + delalloc_len - 1; if (*delalloc_start_ret == start) { /* Delalloc for the whole range, nothing more to do. */ if (*delalloc_end_ret == end) return true; /* Else trim our search range for ordered extents. */ start = *delalloc_end_ret + 1; len = end + 1 - start; } } else { /* No delalloc, future calls don't need to search again. */ *search_io_tree = false; } /* * Now also check if there's any ordered extent in the range. * We do this because: * * 1) When delalloc is flushed, the file range is locked, we clear the * EXTENT_DELALLOC bit from the io tree and create an extent map and * an ordered extent for the write. So we might just have been called * after delalloc is flushed and before the ordered extent completes * and inserts the new file extent item in the subvolume's btree; * * 2) We may have an ordered extent created by flushing delalloc for a * subrange that starts before the subrange we found marked with * EXTENT_DELALLOC in the io tree. * * We could also use the extent map tree to find such delalloc that is * being flushed, but using the ordered extents tree is more efficient * because it's usually much smaller as ordered extents are removed from * the tree once they complete. With the extent maps, we may have them * in the extent map tree for a very long time, and they were either * created by previous writes or loaded by read operations. */ oe = btrfs_lookup_first_ordered_range(inode, start, len); if (!oe) return (delalloc_len > 0); /* The ordered extent may span beyond our search range. */ oe_start = max(oe->file_offset, start); oe_end = min(oe->file_offset + oe->num_bytes - 1, end); btrfs_put_ordered_extent(oe); /* Don't have unflushed delalloc, return the ordered extent range. */ if (delalloc_len == 0) { *delalloc_start_ret = oe_start; *delalloc_end_ret = oe_end; return true; } /* * We have both unflushed delalloc (io_tree) and an ordered extent. * If the ranges are adjacent returned a combined range, otherwise * return the leftmost range. */ if (oe_start < *delalloc_start_ret) { if (oe_end < *delalloc_start_ret) *delalloc_end_ret = oe_end; *delalloc_start_ret = oe_start; } else if (*delalloc_end_ret + 1 == oe_start) { *delalloc_end_ret = oe_end; } return true; } /* * Check if there's delalloc in a given range. * * @inode: The inode. * @start: The start offset of the range. It does not need to be * sector size aligned. * @end: The end offset (inclusive value) of the search range. * It does not need to be sector size aligned. * @cached_state: Extent state record used for speeding up delalloc * searches in the inode's io_tree. Can be NULL. * @delalloc_start_ret: Output argument, set to the start offset of the * subrange found with delalloc (may not be sector size * aligned). * @delalloc_end_ret: Output argument, set to he end offset (inclusive value) * of the subrange found with delalloc. * * Returns true if a subrange with delalloc is found within the given range, and * if so it sets @delalloc_start_ret and @delalloc_end_ret with the start and * end offsets of the subrange. */ bool btrfs_find_delalloc_in_range(struct btrfs_inode *inode, u64 start, u64 end, struct extent_state **cached_state, u64 *delalloc_start_ret, u64 *delalloc_end_ret) { u64 cur_offset = round_down(start, inode->root->fs_info->sectorsize); u64 prev_delalloc_end = 0; bool search_io_tree = true; bool ret = false; while (cur_offset <= end) { u64 delalloc_start; u64 delalloc_end; bool delalloc; delalloc = find_delalloc_subrange(inode, cur_offset, end, cached_state, &search_io_tree, &delalloc_start, &delalloc_end); if (!delalloc) break; if (prev_delalloc_end == 0) { /* First subrange found. */ *delalloc_start_ret = max(delalloc_start, start); *delalloc_end_ret = delalloc_end; ret = true; } else if (delalloc_start == prev_delalloc_end + 1) { /* Subrange adjacent to the previous one, merge them. */ *delalloc_end_ret = delalloc_end; } else { /* Subrange not adjacent to the previous one, exit. */ break; } prev_delalloc_end = delalloc_end; cur_offset = delalloc_end + 1; cond_resched(); } return ret; } /* * Check if there's a hole or delalloc range in a range representing a hole (or * prealloc extent) found in the inode's subvolume btree. * * @inode: The inode. * @whence: Seek mode (SEEK_DATA or SEEK_HOLE). * @start: Start offset of the hole region. It does not need to be sector * size aligned. * @end: End offset (inclusive value) of the hole region. It does not * need to be sector size aligned. * @start_ret: Return parameter, used to set the start of the subrange in the * hole that matches the search criteria (seek mode), if such * subrange is found (return value of the function is true). * The value returned here may not be sector size aligned. * * Returns true if a subrange matching the given seek mode is found, and if one * is found, it updates @start_ret with the start of the subrange. */ static bool find_desired_extent_in_hole(struct btrfs_inode *inode, int whence, struct extent_state **cached_state, u64 start, u64 end, u64 *start_ret) { u64 delalloc_start; u64 delalloc_end; bool delalloc; delalloc = btrfs_find_delalloc_in_range(inode, start, end, cached_state, &delalloc_start, &delalloc_end); if (delalloc && whence == SEEK_DATA) { *start_ret = delalloc_start; return true; } if (delalloc && whence == SEEK_HOLE) { /* * We found delalloc but it starts after out start offset. So we * have a hole between our start offset and the delalloc start. */ if (start < delalloc_start) { *start_ret = start; return true; } /* * Delalloc range starts at our start offset. * If the delalloc range's length is smaller than our range, * then it means we have a hole that starts where the delalloc * subrange ends. */ if (delalloc_end < end) { *start_ret = delalloc_end + 1; return true; } /* There's delalloc for the whole range. */ return false; } if (!delalloc && whence == SEEK_HOLE) { *start_ret = start; return true; } /* * No delalloc in the range and we are seeking for data. The caller has * to iterate to the next extent item in the subvolume btree. */ return false; } static loff_t find_desired_extent(struct file *file, loff_t offset, int whence) { struct btrfs_inode *inode = BTRFS_I(file->f_mapping->host); struct btrfs_file_private *private; struct btrfs_fs_info *fs_info = inode->root->fs_info; struct extent_state *cached_state = NULL; struct extent_state **delalloc_cached_state; const loff_t i_size = i_size_read(&inode->vfs_inode); const u64 ino = btrfs_ino(inode); struct btrfs_root *root = inode->root; struct btrfs_path *path; struct btrfs_key key; u64 last_extent_end; u64 lockstart; u64 lockend; u64 start; int ret; bool found = false; if (i_size == 0 || offset >= i_size) return -ENXIO; /* * Quick path. If the inode has no prealloc extents and its number of * bytes used matches its i_size, then it can not have holes. */ if (whence == SEEK_HOLE && !(inode->flags & BTRFS_INODE_PREALLOC) && inode_get_bytes(&inode->vfs_inode) == i_size) return i_size; spin_lock(&inode->lock); private = file->private_data; spin_unlock(&inode->lock); if (private && private->owner_task != current) { /* * Not allocated by us, don't use it as its cached state is used * by the task that allocated it and we don't want neither to * mess with it nor get incorrect results because it reflects an * invalid state for the current task. */ private = NULL; } else if (!private) { private = kzalloc(sizeof(*private), GFP_KERNEL); /* * No worries if memory allocation failed. * The private structure is used only for speeding up multiple * lseek SEEK_HOLE/DATA calls to a file when there's delalloc, * so everything will still be correct. */ if (private) { bool free = false; private->owner_task = current; spin_lock(&inode->lock); if (file->private_data) free = true; else file->private_data = private; spin_unlock(&inode->lock); if (free) { kfree(private); private = NULL; } } } if (private) delalloc_cached_state = &private->llseek_cached_state; else delalloc_cached_state = NULL; /* * offset can be negative, in this case we start finding DATA/HOLE from * the very start of the file. */ start = max_t(loff_t, 0, offset); lockstart = round_down(start, fs_info->sectorsize); lockend = round_up(i_size, fs_info->sectorsize); if (lockend <= lockstart) lockend = lockstart + fs_info->sectorsize; lockend--; path = btrfs_alloc_path(); if (!path) return -ENOMEM; path->reada = READA_FORWARD; key.objectid = ino; key.type = BTRFS_EXTENT_DATA_KEY; key.offset = start; last_extent_end = lockstart; btrfs_lock_extent(&inode->io_tree, lockstart, lockend, &cached_state); ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); if (ret < 0) { goto out; } else if (ret > 0 && path->slots[0] > 0) { btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0] - 1); if (key.objectid == ino && key.type == BTRFS_EXTENT_DATA_KEY) path->slots[0]--; } while (start < i_size) { struct extent_buffer *leaf = path->nodes[0]; struct btrfs_file_extent_item *extent; u64 extent_end; u8 type; if (path->slots[0] >= btrfs_header_nritems(leaf)) { ret = btrfs_next_leaf(root, path); if (ret < 0) goto out; else if (ret > 0) break; leaf = path->nodes[0]; } btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY) break; extent_end = btrfs_file_extent_end(path); /* * In the first iteration we may have a slot that points to an * extent that ends before our start offset, so skip it. */ if (extent_end <= start) { path->slots[0]++; continue; } /* We have an implicit hole, NO_HOLES feature is likely set. */ if (last_extent_end < key.offset) { u64 search_start = last_extent_end; u64 found_start; /* * First iteration, @start matches @offset and it's * within the hole. */ if (start == offset) search_start = offset; found = find_desired_extent_in_hole(inode, whence, delalloc_cached_state, search_start, key.offset - 1, &found_start); if (found) { start = found_start; break; } /* * Didn't find data or a hole (due to delalloc) in the * implicit hole range, so need to analyze the extent. */ } extent = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); type = btrfs_file_extent_type(leaf, extent); /* * Can't access the extent's disk_bytenr field if this is an * inline extent, since at that offset, it's where the extent * data starts. */ if (type == BTRFS_FILE_EXTENT_PREALLOC || (type == BTRFS_FILE_EXTENT_REG && btrfs_file_extent_disk_bytenr(leaf, extent) == 0)) { /* * Explicit hole or prealloc extent, search for delalloc. * A prealloc extent is treated like a hole. */ u64 search_start = key.offset; u64 found_start; /* * First iteration, @start matches @offset and it's * within the hole. */ if (start == offset) search_start = offset; found = find_desired_extent_in_hole(inode, whence, delalloc_cached_state, search_start, extent_end - 1, &found_start); if (found) { start = found_start; break; } /* * Didn't find data or a hole (due to delalloc) in the * implicit hole range, so need to analyze the next * extent item. */ } else { /* * Found a regular or inline extent. * If we are seeking for data, adjust the start offset * and stop, we're done. */ if (whence == SEEK_DATA) { start = max_t(u64, key.offset, offset); found = true; break; } /* * Else, we are seeking for a hole, check the next file * extent item. */ } start = extent_end; last_extent_end = extent_end; path->slots[0]++; if (fatal_signal_pending(current)) { ret = -EINTR; goto out; } cond_resched(); } /* We have an implicit hole from the last extent found up to i_size. */ if (!found && start < i_size) { found = find_desired_extent_in_hole(inode, whence, delalloc_cached_state, start, i_size - 1, &start); if (!found) start = i_size; } out: btrfs_unlock_extent(&inode->io_tree, lockstart, lockend, &cached_state); btrfs_free_path(path); if (ret < 0) return ret; if (whence == SEEK_DATA && start >= i_size) return -ENXIO; return min_t(loff_t, start, i_size); } static loff_t btrfs_file_llseek(struct file *file, loff_t offset, int whence) { struct inode *inode = file->f_mapping->host; switch (whence) { default: return generic_file_llseek(file, offset, whence); case SEEK_DATA: case SEEK_HOLE: btrfs_inode_lock(BTRFS_I(inode), BTRFS_ILOCK_SHARED); offset = find_desired_extent(file, offset, whence); btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_SHARED); break; } if (offset < 0) return offset; return vfs_setpos(file, offset, inode->i_sb->s_maxbytes); } static int btrfs_file_open(struct inode *inode, struct file *filp) { int ret; filp->f_mode |= FMODE_NOWAIT | FMODE_CAN_ODIRECT; ret = fsverity_file_open(inode, filp); if (ret) return ret; return generic_file_open(inode, filp); } static ssize_t btrfs_file_read_iter(struct kiocb *iocb, struct iov_iter *to) { ssize_t ret = 0; if (iocb->ki_flags & IOCB_DIRECT) { ret = btrfs_direct_read(iocb, to); if (ret < 0 || !iov_iter_count(to) || iocb->ki_pos >= i_size_read(file_inode(iocb->ki_filp))) return ret; } return filemap_read(iocb, to, ret); } const struct file_operations btrfs_file_operations = { .llseek = btrfs_file_llseek, .read_iter = btrfs_file_read_iter, .splice_read = filemap_splice_read, .write_iter = btrfs_file_write_iter, .splice_write = iter_file_splice_write, .mmap_prepare = btrfs_file_mmap_prepare, .open = btrfs_file_open, .release = btrfs_release_file, .get_unmapped_area = thp_get_unmapped_area, .fsync = btrfs_sync_file, .fallocate = btrfs_fallocate, .unlocked_ioctl = btrfs_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = btrfs_compat_ioctl, #endif .remap_file_range = btrfs_remap_file_range, .uring_cmd = btrfs_uring_cmd, .fop_flags = FOP_BUFFER_RASYNC | FOP_BUFFER_WASYNC, }; int btrfs_fdatawrite_range(struct btrfs_inode *inode, loff_t start, loff_t end) { struct address_space *mapping = inode->vfs_inode.i_mapping; int ret; /* * So with compression we will find and lock a dirty page and clear the * first one as dirty, setup an async extent, and immediately return * with the entire range locked but with nobody actually marked with * writeback. So we can't just filemap_write_and_wait_range() and * expect it to work since it will just kick off a thread to do the * actual work. So we need to call filemap_fdatawrite_range _again_ * since it will wait on the page lock, which won't be unlocked until * after the pages have been marked as writeback and so we're good to go * from there. We have to do this otherwise we'll miss the ordered * extents and that results in badness. Please Josef, do not think you * know better and pull this out at some point in the future, it is * right and you are wrong. */ ret = filemap_fdatawrite_range(mapping, start, end); if (!ret && test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, &inode->runtime_flags)) ret = filemap_fdatawrite_range(mapping, start, end); return ret; }
32 127 10 37 14 69 53 25 24 151 105 44 59 40 22 1424 1477 13 143 138 74 125 4 74 13 52 24 2 15 37 130 123 44 259 260 260 260 259 260 125 136 130 49 141 142 142 142 142 138 2 10 72 71 1 4 70 3 72 72 13 70 1 3 4 2 164 4 6 1 325 248 78 51 277 338 336 72 71 193 196 256 193 3 38 161 258 129 333 284 98 246 138 288 325 326 34 52 4 334 14 1 2 2 21 4 8 1 1 14 10 10 10 10 388 512 8 8 39 39 39 39 39 6 34 34 6 17 116 116 126 126 6 6 6 6 6 3 3 100 101 100 229 228 228 229 229 229 229 181 181 181 4 180 8 8 76 77 8 8 8 8 77 76 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 // SPDX-License-Identifier: GPL-2.0-or-later /* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket * interface as the means of communication with the user level. * * Support for INET connection oriented protocols. * * Authors: See the TCP sources */ #include <linux/module.h> #include <linux/jhash.h> #include <net/inet_connection_sock.h> #include <net/inet_hashtables.h> #include <net/inet_timewait_sock.h> #include <net/ip.h> #include <net/route.h> #include <net/tcp_states.h> #include <net/xfrm.h> #include <net/tcp.h> #include <net/sock_reuseport.h> #include <net/addrconf.h> #if IS_ENABLED(CONFIG_IPV6) /* match_sk*_wildcard == true: IPV6_ADDR_ANY equals to any IPv6 addresses * if IPv6 only, and any IPv4 addresses * if not IPv6 only * match_sk*_wildcard == false: addresses must be exactly the same, i.e. * IPV6_ADDR_ANY only equals to IPV6_ADDR_ANY, * and 0.0.0.0 equals to 0.0.0.0 only */ static bool ipv6_rcv_saddr_equal(const struct in6_addr *sk1_rcv_saddr6, const struct in6_addr *sk2_rcv_saddr6, __be32 sk1_rcv_saddr, __be32 sk2_rcv_saddr, bool sk1_ipv6only, bool sk2_ipv6only, bool match_sk1_wildcard, bool match_sk2_wildcard) { int addr_type = ipv6_addr_type(sk1_rcv_saddr6); int addr_type2 = sk2_rcv_saddr6 ? ipv6_addr_type(sk2_rcv_saddr6) : IPV6_ADDR_MAPPED; /* if both are mapped, treat as IPv4 */ if (addr_type == IPV6_ADDR_MAPPED && addr_type2 == IPV6_ADDR_MAPPED) { if (!sk2_ipv6only) { if (sk1_rcv_saddr == sk2_rcv_saddr) return true; return (match_sk1_wildcard && !sk1_rcv_saddr) || (match_sk2_wildcard && !sk2_rcv_saddr); } return false; } if (addr_type == IPV6_ADDR_ANY && addr_type2 == IPV6_ADDR_ANY) return true; if (addr_type2 == IPV6_ADDR_ANY && match_sk2_wildcard && !(sk2_ipv6only && addr_type == IPV6_ADDR_MAPPED)) return true; if (addr_type == IPV6_ADDR_ANY && match_sk1_wildcard && !(sk1_ipv6only && addr_type2 == IPV6_ADDR_MAPPED)) return true; if (sk2_rcv_saddr6 && ipv6_addr_equal(sk1_rcv_saddr6, sk2_rcv_saddr6)) return true; return false; } #endif /* match_sk*_wildcard == true: 0.0.0.0 equals to any IPv4 addresses * match_sk*_wildcard == false: addresses must be exactly the same, i.e. * 0.0.0.0 only equals to 0.0.0.0 */ static bool ipv4_rcv_saddr_equal(__be32 sk1_rcv_saddr, __be32 sk2_rcv_saddr, bool sk2_ipv6only, bool match_sk1_wildcard, bool match_sk2_wildcard) { if (!sk2_ipv6only) { if (sk1_rcv_saddr == sk2_rcv_saddr) return true; return (match_sk1_wildcard && !sk1_rcv_saddr) || (match_sk2_wildcard && !sk2_rcv_saddr); } return false; } bool inet_rcv_saddr_equal(const struct sock *sk, const struct sock *sk2, bool match_wildcard) { #if IS_ENABLED(CONFIG_IPV6) if (sk->sk_family == AF_INET6) return ipv6_rcv_saddr_equal(&sk->sk_v6_rcv_saddr, inet6_rcv_saddr(sk2), sk->sk_rcv_saddr, sk2->sk_rcv_saddr, ipv6_only_sock(sk), ipv6_only_sock(sk2), match_wildcard, match_wildcard); #endif return ipv4_rcv_saddr_equal(sk->sk_rcv_saddr, sk2->sk_rcv_saddr, ipv6_only_sock(sk2), match_wildcard, match_wildcard); } EXPORT_SYMBOL(inet_rcv_saddr_equal); bool inet_rcv_saddr_any(const struct sock *sk) { #if IS_ENABLED(CONFIG_IPV6) if (sk->sk_family == AF_INET6) return ipv6_addr_any(&sk->sk_v6_rcv_saddr); #endif return !sk->sk_rcv_saddr; } /** * inet_sk_get_local_port_range - fetch ephemeral ports range * @sk: socket * @low: pointer to low port * @high: pointer to high port * * Fetch netns port range (/proc/sys/net/ipv4/ip_local_port_range) * Range can be overridden if socket got IP_LOCAL_PORT_RANGE option. * Returns true if IP_LOCAL_PORT_RANGE was set on this socket. */ bool inet_sk_get_local_port_range(const struct sock *sk, int *low, int *high) { int lo, hi, sk_lo, sk_hi; bool local_range = false; u32 sk_range; inet_get_local_port_range(sock_net(sk), &lo, &hi); sk_range = READ_ONCE(inet_sk(sk)->local_port_range); if (unlikely(sk_range)) { sk_lo = sk_range & 0xffff; sk_hi = sk_range >> 16; if (lo <= sk_lo && sk_lo <= hi) lo = sk_lo; if (lo <= sk_hi && sk_hi <= hi) hi = sk_hi; local_range = true; } *low = lo; *high = hi; return local_range; } EXPORT_SYMBOL(inet_sk_get_local_port_range); static bool inet_use_bhash2_on_bind(const struct sock *sk) { #if IS_ENABLED(CONFIG_IPV6) if (sk->sk_family == AF_INET6) { if (ipv6_addr_any(&sk->sk_v6_rcv_saddr)) return false; if (!ipv6_addr_v4mapped(&sk->sk_v6_rcv_saddr)) return true; } #endif return sk->sk_rcv_saddr != htonl(INADDR_ANY); } static bool inet_bind_conflict(const struct sock *sk, struct sock *sk2, kuid_t uid, bool relax, bool reuseport_cb_ok, bool reuseport_ok) { int bound_dev_if2; if (sk == sk2) return false; bound_dev_if2 = READ_ONCE(sk2->sk_bound_dev_if); if (!sk->sk_bound_dev_if || !bound_dev_if2 || sk->sk_bound_dev_if == bound_dev_if2) { if (sk->sk_reuse && sk2->sk_reuse && sk2->sk_state != TCP_LISTEN) { if (!relax || (!reuseport_ok && sk->sk_reuseport && sk2->sk_reuseport && reuseport_cb_ok && (sk2->sk_state == TCP_TIME_WAIT || uid_eq(uid, sk_uid(sk2))))) return true; } else if (!reuseport_ok || !sk->sk_reuseport || !sk2->sk_reuseport || !reuseport_cb_ok || (sk2->sk_state != TCP_TIME_WAIT && !uid_eq(uid, sk_uid(sk2)))) { return true; } } return false; } static bool __inet_bhash2_conflict(const struct sock *sk, struct sock *sk2, kuid_t uid, bool relax, bool reuseport_cb_ok, bool reuseport_ok) { if (ipv6_only_sock(sk2)) { if (sk->sk_family == AF_INET) return false; #if IS_ENABLED(CONFIG_IPV6) if (ipv6_addr_v4mapped(&sk->sk_v6_rcv_saddr)) return false; #endif } return inet_bind_conflict(sk, sk2, uid, relax, reuseport_cb_ok, reuseport_ok); } static bool inet_bhash2_conflict(const struct sock *sk, const struct inet_bind2_bucket *tb2, kuid_t uid, bool relax, bool reuseport_cb_ok, bool reuseport_ok) { struct sock *sk2; sk_for_each_bound(sk2, &tb2->owners) { if (__inet_bhash2_conflict(sk, sk2, uid, relax, reuseport_cb_ok, reuseport_ok)) return true; } return false; } #define sk_for_each_bound_bhash(__sk, __tb2, __tb) \ hlist_for_each_entry(__tb2, &(__tb)->bhash2, bhash_node) \ sk_for_each_bound((__sk), &(__tb2)->owners) /* This should be called only when the tb and tb2 hashbuckets' locks are held */ static int inet_csk_bind_conflict(const struct sock *sk, const struct inet_bind_bucket *tb, const struct inet_bind2_bucket *tb2, /* may be null */ bool relax, bool reuseport_ok) { struct sock_reuseport *reuseport_cb; kuid_t uid = sk_uid(sk); bool reuseport_cb_ok; struct sock *sk2; rcu_read_lock(); reuseport_cb = rcu_dereference(sk->sk_reuseport_cb); /* paired with WRITE_ONCE() in __reuseport_(add|detach)_closed_sock */ reuseport_cb_ok = !reuseport_cb || READ_ONCE(reuseport_cb->num_closed_socks); rcu_read_unlock(); /* Conflicts with an existing IPV6_ADDR_ANY (if ipv6) or INADDR_ANY (if * ipv4) should have been checked already. We need to do these two * checks separately because their spinlocks have to be acquired/released * independently of each other, to prevent possible deadlocks */ if (inet_use_bhash2_on_bind(sk)) return tb2 && inet_bhash2_conflict(sk, tb2, uid, relax, reuseport_cb_ok, reuseport_ok); /* Unlike other sk lookup places we do not check * for sk_net here, since _all_ the socks listed * in tb->owners and tb2->owners list belong * to the same net - the one this bucket belongs to. */ sk_for_each_bound_bhash(sk2, tb2, tb) { if (!inet_bind_conflict(sk, sk2, uid, relax, reuseport_cb_ok, reuseport_ok)) continue; if (inet_rcv_saddr_equal(sk, sk2, true)) return true; } return false; } /* Determine if there is a bind conflict with an existing IPV6_ADDR_ANY (if ipv6) or * INADDR_ANY (if ipv4) socket. * * Caller must hold bhash hashbucket lock with local bh disabled, to protect * against concurrent binds on the port for addr any */ static bool inet_bhash2_addr_any_conflict(const struct sock *sk, int port, int l3mdev, bool relax, bool reuseport_ok) { const struct net *net = sock_net(sk); struct sock_reuseport *reuseport_cb; struct inet_bind_hashbucket *head2; struct inet_bind2_bucket *tb2; kuid_t uid = sk_uid(sk); bool conflict = false; bool reuseport_cb_ok; rcu_read_lock(); reuseport_cb = rcu_dereference(sk->sk_reuseport_cb); /* paired with WRITE_ONCE() in __reuseport_(add|detach)_closed_sock */ reuseport_cb_ok = !reuseport_cb || READ_ONCE(reuseport_cb->num_closed_socks); rcu_read_unlock(); head2 = inet_bhash2_addr_any_hashbucket(sk, net, port); spin_lock(&head2->lock); inet_bind_bucket_for_each(tb2, &head2->chain) { if (!inet_bind2_bucket_match_addr_any(tb2, net, port, l3mdev, sk)) continue; if (!inet_bhash2_conflict(sk, tb2, uid, relax, reuseport_cb_ok, reuseport_ok)) continue; conflict = true; break; } spin_unlock(&head2->lock); return conflict; } /* * Find an open port number for the socket. Returns with the * inet_bind_hashbucket locks held if successful. */ static struct inet_bind_hashbucket * inet_csk_find_open_port(const struct sock *sk, struct inet_bind_bucket **tb_ret, struct inet_bind2_bucket **tb2_ret, struct inet_bind_hashbucket **head2_ret, int *port_ret) { struct inet_hashinfo *hinfo = tcp_get_hashinfo(sk); int i, low, high, attempt_half, port, l3mdev; struct inet_bind_hashbucket *head, *head2; struct net *net = sock_net(sk); struct inet_bind2_bucket *tb2; struct inet_bind_bucket *tb; u32 remaining, offset; bool relax = false; l3mdev = inet_sk_bound_l3mdev(sk); ports_exhausted: attempt_half = (sk->sk_reuse == SK_CAN_REUSE) ? 1 : 0; other_half_scan: inet_sk_get_local_port_range(sk, &low, &high); high++; /* [32768, 60999] -> [32768, 61000[ */ if (high - low < 4) attempt_half = 0; if (attempt_half) { int half = low + (((high - low) >> 2) << 1); if (attempt_half == 1) high = half; else low = half; } remaining = high - low; if (likely(remaining > 1)) remaining &= ~1U; offset = get_random_u32_below(remaining); /* __inet_hash_connect() favors ports having @low parity * We do the opposite to not pollute connect() users. */ offset |= 1U; other_parity_scan: port = low + offset; for (i = 0; i < remaining; i += 2, port += 2) { if (unlikely(port >= high)) port -= remaining; if (inet_is_local_reserved_port(net, port)) continue; head = &hinfo->bhash[inet_bhashfn(net, port, hinfo->bhash_size)]; spin_lock_bh(&head->lock); if (inet_use_bhash2_on_bind(sk)) { if (inet_bhash2_addr_any_conflict(sk, port, l3mdev, relax, false)) goto next_port; } head2 = inet_bhashfn_portaddr(hinfo, sk, net, port); spin_lock(&head2->lock); tb2 = inet_bind2_bucket_find(head2, net, port, l3mdev, sk); inet_bind_bucket_for_each(tb, &head->chain) if (inet_bind_bucket_match(tb, net, port, l3mdev)) { if (!inet_csk_bind_conflict(sk, tb, tb2, relax, false)) goto success; spin_unlock(&head2->lock); goto next_port; } tb = NULL; goto success; next_port: spin_unlock_bh(&head->lock); cond_resched(); } offset--; if (!(offset & 1)) goto other_parity_scan; if (attempt_half == 1) { /* OK we now try the upper half of the range */ attempt_half = 2; goto other_half_scan; } if (READ_ONCE(net->ipv4.sysctl_ip_autobind_reuse) && !relax) { /* We still have a chance to connect to different destinations */ relax = true; goto ports_exhausted; } return NULL; success: *port_ret = port; *tb_ret = tb; *tb2_ret = tb2; *head2_ret = head2; return head; } static inline int sk_reuseport_match(struct inet_bind_bucket *tb, const struct sock *sk) { if (tb->fastreuseport <= 0) return 0; if (!sk->sk_reuseport) return 0; if (rcu_access_pointer(sk->sk_reuseport_cb)) return 0; if (!uid_eq(tb->fastuid, sk_uid(sk))) return 0; /* We only need to check the rcv_saddr if this tb was once marked * without fastreuseport and then was reset, as we can only know that * the fast_*rcv_saddr doesn't have any conflicts with the socks on the * owners list. */ if (tb->fastreuseport == FASTREUSEPORT_ANY) return 1; #if IS_ENABLED(CONFIG_IPV6) if (tb->fast_sk_family == AF_INET6) return ipv6_rcv_saddr_equal(&tb->fast_v6_rcv_saddr, inet6_rcv_saddr(sk), tb->fast_rcv_saddr, sk->sk_rcv_saddr, tb->fast_ipv6_only, ipv6_only_sock(sk), true, false); #endif return ipv4_rcv_saddr_equal(tb->fast_rcv_saddr, sk->sk_rcv_saddr, ipv6_only_sock(sk), true, false); } void inet_csk_update_fastreuse(const struct sock *sk, struct inet_bind_bucket *tb, struct inet_bind2_bucket *tb2) { bool reuse = sk->sk_reuse && sk->sk_state != TCP_LISTEN; if (hlist_empty(&tb->bhash2)) { tb->fastreuse = reuse; if (sk->sk_reuseport) { tb->fastreuseport = FASTREUSEPORT_ANY; tb->fastuid = sk_uid(sk); tb->fast_rcv_saddr = sk->sk_rcv_saddr; tb->fast_ipv6_only = ipv6_only_sock(sk); tb->fast_sk_family = sk->sk_family; #if IS_ENABLED(CONFIG_IPV6) tb->fast_v6_rcv_saddr = sk->sk_v6_rcv_saddr; #endif } else { tb->fastreuseport = 0; } } else { if (!reuse) tb->fastreuse = 0; if (sk->sk_reuseport) { /* We didn't match or we don't have fastreuseport set on * the tb, but we have sk_reuseport set on this socket * and we know that there are no bind conflicts with * this socket in this tb, so reset our tb's reuseport * settings so that any subsequent sockets that match * our current socket will be put on the fast path. * * If we reset we need to set FASTREUSEPORT_STRICT so we * do extra checking for all subsequent sk_reuseport * socks. */ if (!sk_reuseport_match(tb, sk)) { tb->fastreuseport = FASTREUSEPORT_STRICT; tb->fastuid = sk_uid(sk); tb->fast_rcv_saddr = sk->sk_rcv_saddr; tb->fast_ipv6_only = ipv6_only_sock(sk); tb->fast_sk_family = sk->sk_family; #if IS_ENABLED(CONFIG_IPV6) tb->fast_v6_rcv_saddr = sk->sk_v6_rcv_saddr; #endif } } else { tb->fastreuseport = 0; } } tb2->fastreuse = tb->fastreuse; tb2->fastreuseport = tb->fastreuseport; } /* Obtain a reference to a local port for the given sock, * if snum is zero it means select any available local port. * We try to allocate an odd port (and leave even ports for connect()) */ int inet_csk_get_port(struct sock *sk, unsigned short snum) { bool reuse = sk->sk_reuse && sk->sk_state != TCP_LISTEN; bool found_port = false, check_bind_conflict = true; bool bhash_created = false, bhash2_created = false; struct inet_hashinfo *hinfo = tcp_get_hashinfo(sk); int ret = -EADDRINUSE, port = snum, l3mdev; struct inet_bind_hashbucket *head, *head2; struct inet_bind2_bucket *tb2 = NULL; struct inet_bind_bucket *tb = NULL; bool head2_lock_acquired = false; struct net *net = sock_net(sk); l3mdev = inet_sk_bound_l3mdev(sk); if (!port) { head = inet_csk_find_open_port(sk, &tb, &tb2, &head2, &port); if (!head) return ret; head2_lock_acquired = true; if (tb && tb2) goto success; found_port = true; } else { head = &hinfo->bhash[inet_bhashfn(net, port, hinfo->bhash_size)]; spin_lock_bh(&head->lock); inet_bind_bucket_for_each(tb, &head->chain) if (inet_bind_bucket_match(tb, net, port, l3mdev)) break; } if (!tb) { tb = inet_bind_bucket_create(hinfo->bind_bucket_cachep, net, head, port, l3mdev); if (!tb) goto fail_unlock; bhash_created = true; } if (!found_port) { if (!hlist_empty(&tb->bhash2)) { if (sk->sk_reuse == SK_FORCE_REUSE || (tb->fastreuse > 0 && reuse) || sk_reuseport_match(tb, sk)) check_bind_conflict = false; } if (check_bind_conflict && inet_use_bhash2_on_bind(sk)) { if (inet_bhash2_addr_any_conflict(sk, port, l3mdev, true, true)) goto fail_unlock; } head2 = inet_bhashfn_portaddr(hinfo, sk, net, port); spin_lock(&head2->lock); head2_lock_acquired = true; tb2 = inet_bind2_bucket_find(head2, net, port, l3mdev, sk); } if (!tb2) { tb2 = inet_bind2_bucket_create(hinfo->bind2_bucket_cachep, net, head2, tb, sk); if (!tb2) goto fail_unlock; bhash2_created = true; } if (!found_port && check_bind_conflict) { if (inet_csk_bind_conflict(sk, tb, tb2, true, true)) goto fail_unlock; } success: inet_csk_update_fastreuse(sk, tb, tb2); if (!inet_csk(sk)->icsk_bind_hash) inet_bind_hash(sk, tb, tb2, port); WARN_ON(inet_csk(sk)->icsk_bind_hash != tb); WARN_ON(inet_csk(sk)->icsk_bind2_hash != tb2); ret = 0; fail_unlock: if (ret) { if (bhash2_created) inet_bind2_bucket_destroy(hinfo->bind2_bucket_cachep, tb2); if (bhash_created) inet_bind_bucket_destroy(tb); } if (head2_lock_acquired) spin_unlock(&head2->lock); spin_unlock_bh(&head->lock); return ret; } EXPORT_SYMBOL_GPL(inet_csk_get_port); /* * Wait for an incoming connection, avoid race conditions. This must be called * with the socket locked. */ static int inet_csk_wait_for_connect(struct sock *sk, long timeo) { struct inet_connection_sock *icsk = inet_csk(sk); DEFINE_WAIT(wait); int err; /* * True wake-one mechanism for incoming connections: only * one process gets woken up, not the 'whole herd'. * Since we do not 'race & poll' for established sockets * anymore, the common case will execute the loop only once. * * Subtle issue: "add_wait_queue_exclusive()" will be added * after any current non-exclusive waiters, and we know that * it will always _stay_ after any new non-exclusive waiters * because all non-exclusive waiters are added at the * beginning of the wait-queue. As such, it's ok to "drop" * our exclusiveness temporarily when we get woken up without * having to remove and re-insert us on the wait queue. */ for (;;) { prepare_to_wait_exclusive(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); release_sock(sk); if (reqsk_queue_empty(&icsk->icsk_accept_queue)) timeo = schedule_timeout(timeo); sched_annotate_sleep(); lock_sock(sk); err = 0; if (!reqsk_queue_empty(&icsk->icsk_accept_queue)) break; err = -EINVAL; if (sk->sk_state != TCP_LISTEN) break; err = sock_intr_errno(timeo); if (signal_pending(current)) break; err = -EAGAIN; if (!timeo) break; } finish_wait(sk_sleep(sk), &wait); return err; } /* * This will accept the next outstanding connection. */ struct sock *inet_csk_accept(struct sock *sk, struct proto_accept_arg *arg) { struct inet_connection_sock *icsk = inet_csk(sk); struct request_sock_queue *queue = &icsk->icsk_accept_queue; struct request_sock *req; struct sock *newsk; int error; lock_sock(sk); /* We need to make sure that this socket is listening, * and that it has something pending. */ error = -EINVAL; if (sk->sk_state != TCP_LISTEN) goto out_err; /* Find already established connection */ if (reqsk_queue_empty(queue)) { long timeo = sock_rcvtimeo(sk, arg->flags & O_NONBLOCK); /* If this is a non blocking socket don't sleep */ error = -EAGAIN; if (!timeo) goto out_err; error = inet_csk_wait_for_connect(sk, timeo); if (error) goto out_err; } req = reqsk_queue_remove(queue, sk); arg->is_empty = reqsk_queue_empty(queue); newsk = req->sk; if (sk->sk_protocol == IPPROTO_TCP && tcp_rsk(req)->tfo_listener) { spin_lock_bh(&queue->fastopenq.lock); if (tcp_rsk(req)->tfo_listener) { /* We are still waiting for the final ACK from 3WHS * so can't free req now. Instead, we set req->sk to * NULL to signify that the child socket is taken * so reqsk_fastopen_remove() will free the req * when 3WHS finishes (or is aborted). */ req->sk = NULL; req = NULL; } spin_unlock_bh(&queue->fastopenq.lock); } release_sock(sk); if (mem_cgroup_sockets_enabled) { gfp_t gfp = GFP_KERNEL | __GFP_NOFAIL; int amt = 0; /* atomically get the memory usage, set and charge the * newsk->sk_memcg. */ lock_sock(newsk); mem_cgroup_sk_alloc(newsk); if (mem_cgroup_from_sk(newsk)) { /* The socket has not been accepted yet, no need * to look at newsk->sk_wmem_queued. */ amt = sk_mem_pages(newsk->sk_forward_alloc + atomic_read(&newsk->sk_rmem_alloc)); } if (amt) mem_cgroup_sk_charge(newsk, amt, gfp); kmem_cache_charge(newsk, gfp); release_sock(newsk); } if (req) reqsk_put(req); inet_init_csk_locks(newsk); return newsk; out_err: release_sock(sk); arg->err = error; return NULL; } EXPORT_SYMBOL(inet_csk_accept); /* * Using different timers for retransmit, delayed acks and probes * We may wish use just one timer maintaining a list of expire jiffies * to optimize. */ void inet_csk_init_xmit_timers(struct sock *sk, void (*retransmit_handler)(struct timer_list *t), void (*delack_handler)(struct timer_list *t), void (*keepalive_handler)(struct timer_list *t)) { struct inet_connection_sock *icsk = inet_csk(sk); timer_setup(&icsk->icsk_retransmit_timer, retransmit_handler, 0); timer_setup(&icsk->icsk_delack_timer, delack_handler, 0); timer_setup(&sk->sk_timer, keepalive_handler, 0); icsk->icsk_pending = icsk->icsk_ack.pending = 0; } void inet_csk_clear_xmit_timers(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); smp_store_release(&icsk->icsk_pending, 0); smp_store_release(&icsk->icsk_ack.pending, 0); sk_stop_timer(sk, &icsk->icsk_retransmit_timer); sk_stop_timer(sk, &icsk->icsk_delack_timer); sk_stop_timer(sk, &sk->sk_timer); } void inet_csk_clear_xmit_timers_sync(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); /* ongoing timer handlers need to acquire socket lock. */ sock_not_owned_by_me(sk); smp_store_release(&icsk->icsk_pending, 0); smp_store_release(&icsk->icsk_ack.pending, 0); sk_stop_timer_sync(sk, &icsk->icsk_retransmit_timer); sk_stop_timer_sync(sk, &icsk->icsk_delack_timer); sk_stop_timer_sync(sk, &sk->sk_timer); } struct dst_entry *inet_csk_route_req(const struct sock *sk, struct flowi4 *fl4, const struct request_sock *req) { const struct inet_request_sock *ireq = inet_rsk(req); struct net *net = read_pnet(&ireq->ireq_net); struct ip_options_rcu *opt; struct rtable *rt; rcu_read_lock(); opt = rcu_dereference(ireq->ireq_opt); flowi4_init_output(fl4, ireq->ir_iif, ireq->ir_mark, ip_sock_rt_tos(sk), ip_sock_rt_scope(sk), sk->sk_protocol, inet_sk_flowi_flags(sk), (opt && opt->opt.srr) ? opt->opt.faddr : ireq->ir_rmt_addr, ireq->ir_loc_addr, ireq->ir_rmt_port, htons(ireq->ir_num), sk_uid(sk)); security_req_classify_flow(req, flowi4_to_flowi_common(fl4)); rt = ip_route_output_flow(net, fl4, sk); if (IS_ERR(rt)) goto no_route; if (opt && opt->opt.is_strictroute && rt->rt_uses_gateway) goto route_err; rcu_read_unlock(); return &rt->dst; route_err: ip_rt_put(rt); no_route: rcu_read_unlock(); __IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES); return NULL; } struct dst_entry *inet_csk_route_child_sock(const struct sock *sk, struct sock *newsk, const struct request_sock *req) { const struct inet_request_sock *ireq = inet_rsk(req); struct net *net = read_pnet(&ireq->ireq_net); struct inet_sock *newinet = inet_sk(newsk); struct ip_options_rcu *opt; struct flowi4 *fl4; struct rtable *rt; opt = rcu_dereference(ireq->ireq_opt); fl4 = &newinet->cork.fl.u.ip4; flowi4_init_output(fl4, ireq->ir_iif, ireq->ir_mark, ip_sock_rt_tos(sk), ip_sock_rt_scope(sk), sk->sk_protocol, inet_sk_flowi_flags(sk), (opt && opt->opt.srr) ? opt->opt.faddr : ireq->ir_rmt_addr, ireq->ir_loc_addr, ireq->ir_rmt_port, htons(ireq->ir_num), sk_uid(sk)); security_req_classify_flow(req, flowi4_to_flowi_common(fl4)); rt = ip_route_output_flow(net, fl4, sk); if (IS_ERR(rt)) goto no_route; if (opt && opt->opt.is_strictroute && rt->rt_uses_gateway) goto route_err; return &rt->dst; route_err: ip_rt_put(rt); no_route: __IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES); return NULL; } EXPORT_SYMBOL_GPL(inet_csk_route_child_sock); /* Decide when to expire the request and when to resend SYN-ACK */ static void syn_ack_recalc(struct request_sock *req, const int max_syn_ack_retries, const u8 rskq_defer_accept, int *expire, int *resend) { if (!rskq_defer_accept) { *expire = req->num_timeout >= max_syn_ack_retries; *resend = 1; return; } *expire = req->num_timeout >= max_syn_ack_retries && (!inet_rsk(req)->acked || req->num_timeout >= rskq_defer_accept); /* Do not resend while waiting for data after ACK, * start to resend on end of deferring period to give * last chance for data or ACK to create established socket. */ *resend = !inet_rsk(req)->acked || req->num_timeout >= rskq_defer_accept - 1; } static struct request_sock * reqsk_alloc_noprof(const struct request_sock_ops *ops, struct sock *sk_listener, bool attach_listener) { struct request_sock *req; req = kmem_cache_alloc_noprof(ops->slab, GFP_ATOMIC | __GFP_NOWARN); if (!req) return NULL; req->rsk_listener = NULL; if (attach_listener) { if (unlikely(!refcount_inc_not_zero(&sk_listener->sk_refcnt))) { kmem_cache_free(ops->slab, req); return NULL; } req->rsk_listener = sk_listener; } req->rsk_ops = ops; req_to_sk(req)->sk_prot = sk_listener->sk_prot; sk_node_init(&req_to_sk(req)->sk_node); sk_tx_queue_clear(req_to_sk(req)); req->saved_syn = NULL; req->syncookie = 0; req->timeout = 0; req->num_timeout = 0; req->num_retrans = 0; req->sk = NULL; refcount_set(&req->rsk_refcnt, 0); return req; } #define reqsk_alloc(...) alloc_hooks(reqsk_alloc_noprof(__VA_ARGS__)) struct request_sock *inet_reqsk_alloc(const struct request_sock_ops *ops, struct sock *sk_listener, bool attach_listener) { struct request_sock *req = reqsk_alloc(ops, sk_listener, attach_listener); if (req) { struct inet_request_sock *ireq = inet_rsk(req); ireq->ireq_opt = NULL; #if IS_ENABLED(CONFIG_IPV6) ireq->pktopts = NULL; #endif atomic64_set(&ireq->ir_cookie, 0); ireq->ireq_state = TCP_NEW_SYN_RECV; write_pnet(&ireq->ireq_net, sock_net(sk_listener)); ireq->ireq_family = sk_listener->sk_family; req->timeout = TCP_TIMEOUT_INIT; } return req; } EXPORT_SYMBOL(inet_reqsk_alloc); static struct request_sock *inet_reqsk_clone(struct request_sock *req, struct sock *sk) { struct sock *req_sk, *nreq_sk; struct request_sock *nreq; nreq = kmem_cache_alloc(req->rsk_ops->slab, GFP_ATOMIC | __GFP_NOWARN); if (!nreq) { __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMIGRATEREQFAILURE); /* paired with refcount_inc_not_zero() in reuseport_migrate_sock() */ sock_put(sk); return NULL; } req_sk = req_to_sk(req); nreq_sk = req_to_sk(nreq); memcpy(nreq_sk, req_sk, offsetof(struct sock, sk_dontcopy_begin)); unsafe_memcpy(&nreq_sk->sk_dontcopy_end, &req_sk->sk_dontcopy_end, req->rsk_ops->obj_size - offsetof(struct sock, sk_dontcopy_end), /* alloc is larger than struct, see above */); sk_node_init(&nreq_sk->sk_node); nreq_sk->sk_tx_queue_mapping = req_sk->sk_tx_queue_mapping; #ifdef CONFIG_SOCK_RX_QUEUE_MAPPING nreq_sk->sk_rx_queue_mapping = req_sk->sk_rx_queue_mapping; #endif nreq_sk->sk_incoming_cpu = req_sk->sk_incoming_cpu; nreq->rsk_listener = sk; /* We need not acquire fastopenq->lock * because the child socket is locked in inet_csk_listen_stop(). */ if (sk->sk_protocol == IPPROTO_TCP && tcp_rsk(nreq)->tfo_listener) rcu_assign_pointer(tcp_sk(nreq->sk)->fastopen_rsk, nreq); return nreq; } static void reqsk_queue_migrated(struct request_sock_queue *queue, const struct request_sock *req) { if (req->num_timeout == 0) atomic_inc(&queue->young); atomic_inc(&queue->qlen); } static void reqsk_migrate_reset(struct request_sock *req) { req->saved_syn = NULL; #if IS_ENABLED(CONFIG_IPV6) inet_rsk(req)->ipv6_opt = NULL; inet_rsk(req)->pktopts = NULL; #else inet_rsk(req)->ireq_opt = NULL; #endif } /* return true if req was found in the ehash table */ static bool reqsk_queue_unlink(struct request_sock *req) { struct sock *sk = req_to_sk(req); bool found = false; if (sk_hashed(sk)) { struct inet_hashinfo *hashinfo = tcp_get_hashinfo(sk); spinlock_t *lock; lock = inet_ehash_lockp(hashinfo, req->rsk_hash); spin_lock(lock); found = __sk_nulls_del_node_init_rcu(sk); spin_unlock(lock); } return found; } static bool __inet_csk_reqsk_queue_drop(struct sock *sk, struct request_sock *req, bool from_timer) { bool unlinked = reqsk_queue_unlink(req); if (!from_timer && timer_delete_sync(&req->rsk_timer)) reqsk_put(req); if (unlinked) { reqsk_queue_removed(&inet_csk(sk)->icsk_accept_queue, req); reqsk_put(req); } return unlinked; } bool inet_csk_reqsk_queue_drop(struct sock *sk, struct request_sock *req) { return __inet_csk_reqsk_queue_drop(sk, req, false); } void inet_csk_reqsk_queue_drop_and_put(struct sock *sk, struct request_sock *req) { inet_csk_reqsk_queue_drop(sk, req); reqsk_put(req); } EXPORT_IPV6_MOD(inet_csk_reqsk_queue_drop_and_put); static void reqsk_timer_handler(struct timer_list *t) { struct request_sock *req = timer_container_of(req, t, rsk_timer); struct request_sock *nreq = NULL, *oreq = req; struct sock *sk_listener = req->rsk_listener; struct inet_connection_sock *icsk; struct request_sock_queue *queue; struct net *net; int max_syn_ack_retries, qlen, expire = 0, resend = 0; if (inet_sk_state_load(sk_listener) != TCP_LISTEN) { struct sock *nsk; nsk = reuseport_migrate_sock(sk_listener, req_to_sk(req), NULL); if (!nsk) goto drop; nreq = inet_reqsk_clone(req, nsk); if (!nreq) goto drop; /* The new timer for the cloned req can decrease the 2 * by calling inet_csk_reqsk_queue_drop_and_put(), so * hold another count to prevent use-after-free and * call reqsk_put() just before return. */ refcount_set(&nreq->rsk_refcnt, 2 + 1); timer_setup(&nreq->rsk_timer, reqsk_timer_handler, TIMER_PINNED); reqsk_queue_migrated(&inet_csk(nsk)->icsk_accept_queue, req); req = nreq; sk_listener = nsk; } icsk = inet_csk(sk_listener); net = sock_net(sk_listener); max_syn_ack_retries = READ_ONCE(icsk->icsk_syn_retries) ? : READ_ONCE(net->ipv4.sysctl_tcp_synack_retries); /* Normally all the openreqs are young and become mature * (i.e. converted to established socket) for first timeout. * If synack was not acknowledged for 1 second, it means * one of the following things: synack was lost, ack was lost, * rtt is high or nobody planned to ack (i.e. synflood). * When server is a bit loaded, queue is populated with old * open requests, reducing effective size of queue. * When server is well loaded, queue size reduces to zero * after several minutes of work. It is not synflood, * it is normal operation. The solution is pruning * too old entries overriding normal timeout, when * situation becomes dangerous. * * Essentially, we reserve half of room for young * embrions; and abort old ones without pity, if old * ones are about to clog our table. */ queue = &icsk->icsk_accept_queue; qlen = reqsk_queue_len(queue); if ((qlen << 1) > max(8U, READ_ONCE(sk_listener->sk_max_ack_backlog))) { int young = reqsk_queue_len_young(queue) << 1; while (max_syn_ack_retries > 2) { if (qlen < young) break; max_syn_ack_retries--; young <<= 1; } } syn_ack_recalc(req, max_syn_ack_retries, READ_ONCE(queue->rskq_defer_accept), &expire, &resend); req->rsk_ops->syn_ack_timeout(req); if (!expire && (!resend || !tcp_rtx_synack(sk_listener, req) || inet_rsk(req)->acked)) { if (req->num_timeout++ == 0) atomic_dec(&queue->young); mod_timer(&req->rsk_timer, jiffies + reqsk_timeout(req, TCP_RTO_MAX)); if (!nreq) return; if (!inet_ehash_insert(req_to_sk(nreq), req_to_sk(oreq), NULL)) { /* delete timer */ __inet_csk_reqsk_queue_drop(sk_listener, nreq, true); goto no_ownership; } __NET_INC_STATS(net, LINUX_MIB_TCPMIGRATEREQSUCCESS); reqsk_migrate_reset(oreq); reqsk_queue_removed(&inet_csk(oreq->rsk_listener)->icsk_accept_queue, oreq); reqsk_put(oreq); reqsk_put(nreq); return; } /* Even if we can clone the req, we may need not retransmit any more * SYN+ACKs (nreq->num_timeout > max_syn_ack_retries, etc), or another * CPU may win the "own_req" race so that inet_ehash_insert() fails. */ if (nreq) { __NET_INC_STATS(net, LINUX_MIB_TCPMIGRATEREQFAILURE); no_ownership: reqsk_migrate_reset(nreq); reqsk_queue_removed(queue, nreq); __reqsk_free(nreq); } drop: __inet_csk_reqsk_queue_drop(sk_listener, oreq, true); reqsk_put(oreq); } static bool reqsk_queue_hash_req(struct request_sock *req, unsigned long timeout) { bool found_dup_sk = false; if (!inet_ehash_insert(req_to_sk(req), NULL, &found_dup_sk)) return false; /* The timer needs to be setup after a successful insertion. */ timer_setup(&req->rsk_timer, reqsk_timer_handler, TIMER_PINNED); mod_timer(&req->rsk_timer, jiffies + timeout); /* before letting lookups find us, make sure all req fields * are committed to memory and refcnt initialized. */ smp_wmb(); refcount_set(&req->rsk_refcnt, 2 + 1); return true; } bool inet_csk_reqsk_queue_hash_add(struct sock *sk, struct request_sock *req, unsigned long timeout) { if (!reqsk_queue_hash_req(req, timeout)) return false; inet_csk_reqsk_queue_added(sk); return true; } static void inet_clone_ulp(const struct request_sock *req, struct sock *newsk, const gfp_t priority) { struct inet_connection_sock *icsk = inet_csk(newsk); if (!icsk->icsk_ulp_ops) return; icsk->icsk_ulp_ops->clone(req, newsk, priority); } /** * inet_csk_clone_lock - clone an inet socket, and lock its clone * @sk: the socket to clone * @req: request_sock * @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc) * * Caller must unlock socket even in error path (bh_unlock_sock(newsk)) */ struct sock *inet_csk_clone_lock(const struct sock *sk, const struct request_sock *req, const gfp_t priority) { struct sock *newsk = sk_clone_lock(sk, priority); struct inet_connection_sock *newicsk; struct inet_request_sock *ireq; struct inet_sock *newinet; if (!newsk) return NULL; newicsk = inet_csk(newsk); newinet = inet_sk(newsk); ireq = inet_rsk(req); newicsk->icsk_bind_hash = NULL; newicsk->icsk_bind2_hash = NULL; newinet->inet_dport = ireq->ir_rmt_port; newinet->inet_num = ireq->ir_num; newinet->inet_sport = htons(ireq->ir_num); newsk->sk_bound_dev_if = ireq->ir_iif; newsk->sk_daddr = ireq->ir_rmt_addr; newsk->sk_rcv_saddr = ireq->ir_loc_addr; newinet->inet_saddr = ireq->ir_loc_addr; #if IS_ENABLED(CONFIG_IPV6) newsk->sk_v6_daddr = ireq->ir_v6_rmt_addr; newsk->sk_v6_rcv_saddr = ireq->ir_v6_loc_addr; #endif /* listeners have SOCK_RCU_FREE, not the children */ sock_reset_flag(newsk, SOCK_RCU_FREE); inet_sk(newsk)->mc_list = NULL; newsk->sk_mark = inet_rsk(req)->ir_mark; atomic64_set(&newsk->sk_cookie, atomic64_read(&inet_rsk(req)->ir_cookie)); newicsk->icsk_retransmits = 0; newicsk->icsk_backoff = 0; newicsk->icsk_probes_out = 0; newicsk->icsk_probes_tstamp = 0; /* Deinitialize accept_queue to trap illegal accesses. */ memset(&newicsk->icsk_accept_queue, 0, sizeof(newicsk->icsk_accept_queue)); inet_sk_set_state(newsk, TCP_SYN_RECV); inet_clone_ulp(req, newsk, priority); security_inet_csk_clone(newsk, req); return newsk; } /* * At this point, there should be no process reference to this * socket, and thus no user references at all. Therefore we * can assume the socket waitqueue is inactive and nobody will * try to jump onto it. */ void inet_csk_destroy_sock(struct sock *sk) { WARN_ON(sk->sk_state != TCP_CLOSE); WARN_ON(!sock_flag(sk, SOCK_DEAD)); /* It cannot be in hash table! */ WARN_ON(!sk_unhashed(sk)); /* If it has not 0 inet_sk(sk)->inet_num, it must be bound */ WARN_ON(inet_sk(sk)->inet_num && !inet_csk(sk)->icsk_bind_hash); sk->sk_prot->destroy(sk); sk_stream_kill_queues(sk); xfrm_sk_free_policy(sk); tcp_orphan_count_dec(); sock_put(sk); } EXPORT_SYMBOL(inet_csk_destroy_sock); void inet_csk_prepare_for_destroy_sock(struct sock *sk) { /* The below has to be done to allow calling inet_csk_destroy_sock */ sock_set_flag(sk, SOCK_DEAD); tcp_orphan_count_inc(); } /* This function allows to force a closure of a socket after the call to * tcp_create_openreq_child(). */ void inet_csk_prepare_forced_close(struct sock *sk) __releases(&sk->sk_lock.slock) { /* sk_clone_lock locked the socket and set refcnt to 2 */ bh_unlock_sock(sk); sock_put(sk); inet_csk_prepare_for_destroy_sock(sk); inet_sk(sk)->inet_num = 0; } EXPORT_SYMBOL(inet_csk_prepare_forced_close); static int inet_ulp_can_listen(const struct sock *sk) { const struct inet_connection_sock *icsk = inet_csk(sk); if (icsk->icsk_ulp_ops && !icsk->icsk_ulp_ops->clone) return -EINVAL; return 0; } int inet_csk_listen_start(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); struct inet_sock *inet = inet_sk(sk); int err; err = inet_ulp_can_listen(sk); if (unlikely(err)) return err; reqsk_queue_alloc(&icsk->icsk_accept_queue); sk->sk_ack_backlog = 0; inet_csk_delack_init(sk); /* There is race window here: we announce ourselves listening, * but this transition is still not validated by get_port(). * It is OK, because this socket enters to hash table only * after validation is complete. */ inet_sk_state_store(sk, TCP_LISTEN); err = sk->sk_prot->get_port(sk, inet->inet_num); if (!err) { inet->inet_sport = htons(inet->inet_num); sk_dst_reset(sk); err = sk->sk_prot->hash(sk); if (likely(!err)) return 0; } inet_sk_set_state(sk, TCP_CLOSE); return err; } static void inet_child_forget(struct sock *sk, struct request_sock *req, struct sock *child) { sk->sk_prot->disconnect(child, O_NONBLOCK); sock_orphan(child); tcp_orphan_count_inc(); if (sk->sk_protocol == IPPROTO_TCP && tcp_rsk(req)->tfo_listener) { BUG_ON(rcu_access_pointer(tcp_sk(child)->fastopen_rsk) != req); BUG_ON(sk != req->rsk_listener); /* Paranoid, to prevent race condition if * an inbound pkt destined for child is * blocked by sock lock in tcp_v4_rcv(). * Also to satisfy an assertion in * tcp_v4_destroy_sock(). */ RCU_INIT_POINTER(tcp_sk(child)->fastopen_rsk, NULL); } inet_csk_destroy_sock(child); } struct sock *inet_csk_reqsk_queue_add(struct sock *sk, struct request_sock *req, struct sock *child) { struct request_sock_queue *queue = &inet_csk(sk)->icsk_accept_queue; spin_lock(&queue->rskq_lock); if (unlikely(sk->sk_state != TCP_LISTEN)) { inet_child_forget(sk, req, child); child = NULL; } else { req->sk = child; req->dl_next = NULL; if (queue->rskq_accept_head == NULL) WRITE_ONCE(queue->rskq_accept_head, req); else queue->rskq_accept_tail->dl_next = req; queue->rskq_accept_tail = req; sk_acceptq_added(sk); } spin_unlock(&queue->rskq_lock); return child; } EXPORT_SYMBOL(inet_csk_reqsk_queue_add); struct sock *inet_csk_complete_hashdance(struct sock *sk, struct sock *child, struct request_sock *req, bool own_req) { if (own_req) { inet_csk_reqsk_queue_drop(req->rsk_listener, req); reqsk_queue_removed(&inet_csk(req->rsk_listener)->icsk_accept_queue, req); if (sk != req->rsk_listener) { /* another listening sk has been selected, * migrate the req to it. */ struct request_sock *nreq; /* hold a refcnt for the nreq->rsk_listener * which is assigned in inet_reqsk_clone() */ sock_hold(sk); nreq = inet_reqsk_clone(req, sk); if (!nreq) { inet_child_forget(sk, req, child); goto child_put; } refcount_set(&nreq->rsk_refcnt, 1); if (inet_csk_reqsk_queue_add(sk, nreq, child)) { __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMIGRATEREQSUCCESS); reqsk_migrate_reset(req); reqsk_put(req); return child; } __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMIGRATEREQFAILURE); reqsk_migrate_reset(nreq); __reqsk_free(nreq); } else if (inet_csk_reqsk_queue_add(sk, req, child)) { return child; } } /* Too bad, another child took ownership of the request, undo. */ child_put: bh_unlock_sock(child); sock_put(child); return NULL; } /* * This routine closes sockets which have been at least partially * opened, but not yet accepted. */ void inet_csk_listen_stop(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); struct request_sock_queue *queue = &icsk->icsk_accept_queue; struct request_sock *next, *req; /* Following specs, it would be better either to send FIN * (and enter FIN-WAIT-1, it is normal close) * or to send active reset (abort). * Certainly, it is pretty dangerous while synflood, but it is * bad justification for our negligence 8) * To be honest, we are not able to make either * of the variants now. --ANK */ while ((req = reqsk_queue_remove(queue, sk)) != NULL) { struct sock *child = req->sk, *nsk; struct request_sock *nreq; local_bh_disable(); bh_lock_sock(child); WARN_ON(sock_owned_by_user(child)); sock_hold(child); nsk = reuseport_migrate_sock(sk, child, NULL); if (nsk) { nreq = inet_reqsk_clone(req, nsk); if (nreq) { refcount_set(&nreq->rsk_refcnt, 1); if (inet_csk_reqsk_queue_add(nsk, nreq, child)) { __NET_INC_STATS(sock_net(nsk), LINUX_MIB_TCPMIGRATEREQSUCCESS); reqsk_migrate_reset(req); } else { __NET_INC_STATS(sock_net(nsk), LINUX_MIB_TCPMIGRATEREQFAILURE); reqsk_migrate_reset(nreq); __reqsk_free(nreq); } /* inet_csk_reqsk_queue_add() has already * called inet_child_forget() on failure case. */ goto skip_child_forget; } } inet_child_forget(sk, req, child); skip_child_forget: reqsk_put(req); bh_unlock_sock(child); local_bh_enable(); sock_put(child); cond_resched(); } if (queue->fastopenq.rskq_rst_head) { /* Free all the reqs queued in rskq_rst_head. */ spin_lock_bh(&queue->fastopenq.lock); req = queue->fastopenq.rskq_rst_head; queue->fastopenq.rskq_rst_head = NULL; spin_unlock_bh(&queue->fastopenq.lock); while (req != NULL) { next = req->dl_next; reqsk_put(req); req = next; } } WARN_ON_ONCE(sk->sk_ack_backlog); } EXPORT_SYMBOL_GPL(inet_csk_listen_stop); static struct dst_entry *inet_csk_rebuild_route(struct sock *sk, struct flowi *fl) { const struct inet_sock *inet = inet_sk(sk); struct flowi4 *fl4; struct rtable *rt; rcu_read_lock(); fl4 = &fl->u.ip4; inet_sk_init_flowi4(inet, fl4); rt = ip_route_output_flow(sock_net(sk), fl4, sk); if (IS_ERR(rt)) rt = NULL; if (rt) sk_setup_caps(sk, &rt->dst); rcu_read_unlock(); return &rt->dst; } struct dst_entry *inet_csk_update_pmtu(struct sock *sk, u32 mtu) { struct dst_entry *dst = __sk_dst_check(sk, 0); struct inet_sock *inet = inet_sk(sk); if (!dst) { dst = inet_csk_rebuild_route(sk, &inet->cork.fl); if (!dst) goto out; } dst->ops->update_pmtu(dst, sk, NULL, mtu, true); dst = __sk_dst_check(sk, 0); if (!dst) dst = inet_csk_rebuild_route(sk, &inet->cork.fl); out: return dst; }
1 2 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 // SPDX-License-Identifier: GPL-2.0+ /* * pcmda12.c * Driver for Winsystems PC-104 based PCM-D/A-12 8-channel AO board. * * COMEDI - Linux Control and Measurement Device Interface * Copyright (C) 2006 Calin A. Culianu <calin@ajvar.org> */ /* * Driver: pcmda12 * Description: A driver for the Winsystems PCM-D/A-12 * Devices: [Winsystems] PCM-D/A-12 (pcmda12) * Author: Calin Culianu <calin@ajvar.org> * Updated: Fri, 13 Jan 2006 12:01:01 -0500 * Status: works * * A driver for the relatively straightforward-to-program PCM-D/A-12. * This board doesn't support commands, and the only way to set its * analog output range is to jumper the board. As such, * comedi_data_write() ignores the range value specified. * * The board uses 16 consecutive I/O addresses starting at the I/O port * base address. Each address corresponds to the LSB then MSB of a * particular channel from 0-7. * * Note that the board is not ISA-PNP capable and thus needs the I/O * port comedi_config parameter. * * Note that passing a nonzero value as the second config option will * enable "simultaneous xfer" mode for this board, in which AO writes * will not take effect until a subsequent read of any AO channel. This * is so that one can speed up programming by preloading all AO registers * with values before simultaneously setting them to take effect with one * read command. * * Configuration Options: * [0] - I/O port base address * [1] - Do Simultaneous Xfer (see description) */ #include <linux/module.h> #include <linux/comedi/comedidev.h> /* AI range is not configurable, it's set by jumpers on the board */ static const struct comedi_lrange pcmda12_ranges = { 3, { UNI_RANGE(5), UNI_RANGE(10), BIP_RANGE(5) } }; struct pcmda12_private { int simultaneous_xfer_mode; }; static int pcmda12_ao_insn_write(struct comedi_device *dev, struct comedi_subdevice *s, struct comedi_insn *insn, unsigned int *data) { struct pcmda12_private *devpriv = dev->private; unsigned int chan = CR_CHAN(insn->chanspec); unsigned int val = s->readback[chan]; unsigned long ioreg = dev->iobase + (chan * 2); int i; for (i = 0; i < insn->n; ++i) { val = data[i]; outb(val & 0xff, ioreg); outb((val >> 8) & 0xff, ioreg + 1); /* * Initiate transfer if not in simultaneaous xfer * mode by reading one of the AO registers. */ if (!devpriv->simultaneous_xfer_mode) inb(ioreg); } s->readback[chan] = val; return insn->n; } static int pcmda12_ao_insn_read(struct comedi_device *dev, struct comedi_subdevice *s, struct comedi_insn *insn, unsigned int *data) { struct pcmda12_private *devpriv = dev->private; /* * Initiate simultaneaous xfer mode by reading one of the * AO registers. All analog outputs will then be updated. */ if (devpriv->simultaneous_xfer_mode) inb(dev->iobase); return comedi_readback_insn_read(dev, s, insn, data); } static void pcmda12_ao_reset(struct comedi_device *dev, struct comedi_subdevice *s) { int i; for (i = 0; i < s->n_chan; ++i) { outb(0, dev->iobase + (i * 2)); outb(0, dev->iobase + (i * 2) + 1); } /* Initiate transfer by reading one of the AO registers. */ inb(dev->iobase); } static int pcmda12_attach(struct comedi_device *dev, struct comedi_devconfig *it) { struct pcmda12_private *devpriv; struct comedi_subdevice *s; int ret; ret = comedi_request_region(dev, it->options[0], 0x10); if (ret) return ret; devpriv = comedi_alloc_devpriv(dev, sizeof(*devpriv)); if (!devpriv) return -ENOMEM; devpriv->simultaneous_xfer_mode = it->options[1]; ret = comedi_alloc_subdevices(dev, 1); if (ret) return ret; s = &dev->subdevices[0]; s->type = COMEDI_SUBD_AO; s->subdev_flags = SDF_READABLE | SDF_WRITABLE; s->n_chan = 8; s->maxdata = 0x0fff; s->range_table = &pcmda12_ranges; s->insn_write = pcmda12_ao_insn_write; s->insn_read = pcmda12_ao_insn_read; ret = comedi_alloc_subdev_readback(s); if (ret) return ret; pcmda12_ao_reset(dev, s); return 0; } static struct comedi_driver pcmda12_driver = { .driver_name = "pcmda12", .module = THIS_MODULE, .attach = pcmda12_attach, .detach = comedi_legacy_detach, }; module_comedi_driver(pcmda12_driver); MODULE_AUTHOR("Comedi https://www.comedi.org"); MODULE_DESCRIPTION("Comedi low-level driver"); MODULE_LICENSE("GPL");
174 163 2 163 163 156 108 155 107 126 155 63 6 2 1 54 54 54 54 54 57 3 54 54 57 57 105 105 53 53 41 41 20 163 142 142 2 139 1 24 47 9 89 38 16 97 53 139 61 61 2 34 24 38 4 56 61 30 30 29 64 64 2 28 35 7 40 7 35 40 64 60 60 1 60 97 5 39 45 30 74 39 1 1 2 1 34 19 15 4 1 25 30 30 27 25 30 4 4 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 // SPDX-License-Identifier: GPL-2.0-or-later /* * OSS compatible sequencer driver * * MIDI device handlers * * Copyright (C) 1998,99 Takashi Iwai <tiwai@suse.de> */ #include <sound/asoundef.h> #include "seq_oss_midi.h" #include "seq_oss_readq.h" #include "seq_oss_timer.h" #include "seq_oss_event.h" #include <sound/seq_midi_event.h> #include "../seq_lock.h" #include <linux/init.h> #include <linux/slab.h> #include <linux/nospec.h> /* * constants */ #define SNDRV_SEQ_OSS_MAX_MIDI_NAME 30 /* * definition of midi device record */ struct seq_oss_midi { int seq_device; /* device number */ int client; /* sequencer client number */ int port; /* sequencer port number */ unsigned int flags; /* port capability */ int opened; /* flag for opening */ unsigned char name[SNDRV_SEQ_OSS_MAX_MIDI_NAME]; struct snd_midi_event *coder; /* MIDI event coder */ struct seq_oss_devinfo *devinfo; /* assigned OSSseq device */ snd_use_lock_t use_lock; struct mutex open_mutex; }; DEFINE_FREE(seq_oss_midi, struct seq_oss_midi *, if (!IS_ERR_OR_NULL(_T)) snd_use_lock_free(&(_T)->use_lock)) /* * midi device table */ static int max_midi_devs; static struct seq_oss_midi *midi_devs[SNDRV_SEQ_OSS_MAX_MIDI_DEVS]; static DEFINE_SPINLOCK(register_lock); /* * prototypes */ static struct seq_oss_midi *get_mdev(int dev); static struct seq_oss_midi *get_mididev(struct seq_oss_devinfo *dp, int dev); static int send_synth_event(struct seq_oss_devinfo *dp, struct snd_seq_event *ev, int dev); static int send_midi_event(struct seq_oss_devinfo *dp, struct snd_seq_event *ev, struct seq_oss_midi *mdev); /* * look up the existing ports * this looks a very exhausting job. */ int snd_seq_oss_midi_lookup_ports(int client) { struct snd_seq_client_info *clinfo __free(kfree) = NULL; struct snd_seq_port_info *pinfo __free(kfree) = NULL; clinfo = kzalloc(sizeof(*clinfo), GFP_KERNEL); pinfo = kzalloc(sizeof(*pinfo), GFP_KERNEL); if (!clinfo || !pinfo) return -ENOMEM; clinfo->client = -1; while (snd_seq_kernel_client_ctl(client, SNDRV_SEQ_IOCTL_QUERY_NEXT_CLIENT, clinfo) == 0) { if (clinfo->client == client) continue; /* ignore myself */ pinfo->addr.client = clinfo->client; pinfo->addr.port = -1; while (snd_seq_kernel_client_ctl(client, SNDRV_SEQ_IOCTL_QUERY_NEXT_PORT, pinfo) == 0) snd_seq_oss_midi_check_new_port(pinfo); } return 0; } /* */ static struct seq_oss_midi * get_mdev(int dev) { struct seq_oss_midi *mdev; guard(spinlock_irqsave)(&register_lock); mdev = midi_devs[dev]; if (mdev) snd_use_lock_use(&mdev->use_lock); return mdev; } /* * look for the identical slot */ static struct seq_oss_midi * find_slot(int client, int port) { int i; struct seq_oss_midi *mdev; guard(spinlock_irqsave)(&register_lock); for (i = 0; i < max_midi_devs; i++) { mdev = midi_devs[i]; if (mdev && mdev->client == client && mdev->port == port) { /* found! */ snd_use_lock_use(&mdev->use_lock); return mdev; } } return NULL; } #define PERM_WRITE (SNDRV_SEQ_PORT_CAP_WRITE|SNDRV_SEQ_PORT_CAP_SUBS_WRITE) #define PERM_READ (SNDRV_SEQ_PORT_CAP_READ|SNDRV_SEQ_PORT_CAP_SUBS_READ) /* * register a new port if it doesn't exist yet */ int snd_seq_oss_midi_check_new_port(struct snd_seq_port_info *pinfo) { int i; struct seq_oss_midi *mdev; /* the port must include generic midi */ if (! (pinfo->type & SNDRV_SEQ_PORT_TYPE_MIDI_GENERIC)) return 0; /* either read or write subscribable */ if ((pinfo->capability & PERM_WRITE) != PERM_WRITE && (pinfo->capability & PERM_READ) != PERM_READ) return 0; /* * look for the identical slot */ mdev = find_slot(pinfo->addr.client, pinfo->addr.port); if (mdev) { /* already exists */ snd_use_lock_free(&mdev->use_lock); return 0; } /* * allocate midi info record */ mdev = kzalloc(sizeof(*mdev), GFP_KERNEL); if (!mdev) return -ENOMEM; /* copy the port information */ mdev->client = pinfo->addr.client; mdev->port = pinfo->addr.port; mdev->flags = pinfo->capability; mdev->opened = 0; snd_use_lock_init(&mdev->use_lock); mutex_init(&mdev->open_mutex); /* copy and truncate the name of synth device */ strscpy(mdev->name, pinfo->name, sizeof(mdev->name)); /* create MIDI coder */ if (snd_midi_event_new(MAX_MIDI_EVENT_BUF, &mdev->coder) < 0) { pr_err("ALSA: seq_oss: can't malloc midi coder\n"); kfree(mdev); return -ENOMEM; } /* OSS sequencer adds running status to all sequences */ snd_midi_event_no_status(mdev->coder, 1); /* * look for en empty slot */ guard(spinlock_irqsave)(&register_lock); for (i = 0; i < max_midi_devs; i++) { if (midi_devs[i] == NULL) break; } if (i >= max_midi_devs) { if (max_midi_devs >= SNDRV_SEQ_OSS_MAX_MIDI_DEVS) { snd_midi_event_free(mdev->coder); kfree(mdev); return -ENOMEM; } max_midi_devs++; } mdev->seq_device = i; midi_devs[mdev->seq_device] = mdev; return 0; } /* * release the midi device if it was registered */ int snd_seq_oss_midi_check_exit_port(int client, int port) { struct seq_oss_midi *mdev; int index; mdev = find_slot(client, port); if (mdev) { scoped_guard(spinlock_irqsave, &register_lock) { midi_devs[mdev->seq_device] = NULL; } snd_use_lock_free(&mdev->use_lock); snd_use_lock_sync(&mdev->use_lock); snd_midi_event_free(mdev->coder); kfree(mdev); } guard(spinlock_irqsave)(&register_lock); for (index = max_midi_devs - 1; index >= 0; index--) { if (midi_devs[index]) break; } max_midi_devs = index + 1; return 0; } /* * release the midi device if it was registered */ void snd_seq_oss_midi_clear_all(void) { int i; struct seq_oss_midi *mdev; guard(spinlock_irqsave)(&register_lock); for (i = 0; i < max_midi_devs; i++) { mdev = midi_devs[i]; if (mdev) { snd_midi_event_free(mdev->coder); kfree(mdev); midi_devs[i] = NULL; } } max_midi_devs = 0; } /* * set up midi tables */ void snd_seq_oss_midi_setup(struct seq_oss_devinfo *dp) { guard(spinlock_irq)(&register_lock); dp->max_mididev = max_midi_devs; } /* * clean up midi tables */ void snd_seq_oss_midi_cleanup(struct seq_oss_devinfo *dp) { int i; for (i = 0; i < dp->max_mididev; i++) snd_seq_oss_midi_close(dp, i); dp->max_mididev = 0; } /* * open all midi devices. ignore errors. */ void snd_seq_oss_midi_open_all(struct seq_oss_devinfo *dp, int file_mode) { int i; for (i = 0; i < dp->max_mididev; i++) snd_seq_oss_midi_open(dp, i, file_mode); } /* * get the midi device information */ static struct seq_oss_midi * get_mididev(struct seq_oss_devinfo *dp, int dev) { if (dev < 0 || dev >= dp->max_mididev) return NULL; dev = array_index_nospec(dev, dp->max_mididev); return get_mdev(dev); } /* * open the midi device if not opened yet */ int snd_seq_oss_midi_open(struct seq_oss_devinfo *dp, int dev, int fmode) { int perm; struct seq_oss_midi *mdev __free(seq_oss_midi) = NULL; struct snd_seq_port_subscribe subs; mdev = get_mididev(dp, dev); if (!mdev) return -ENODEV; guard(mutex)(&mdev->open_mutex); /* already used? */ if (mdev->opened && mdev->devinfo != dp) return -EBUSY; perm = 0; if (is_write_mode(fmode)) perm |= PERM_WRITE; if (is_read_mode(fmode)) perm |= PERM_READ; perm &= mdev->flags; if (perm == 0) return -ENXIO; /* already opened? */ if ((mdev->opened & perm) == perm) return 0; perm &= ~mdev->opened; memset(&subs, 0, sizeof(subs)); if (perm & PERM_WRITE) { subs.sender = dp->addr; subs.dest.client = mdev->client; subs.dest.port = mdev->port; if (snd_seq_kernel_client_ctl(dp->cseq, SNDRV_SEQ_IOCTL_SUBSCRIBE_PORT, &subs) >= 0) mdev->opened |= PERM_WRITE; } if (perm & PERM_READ) { subs.sender.client = mdev->client; subs.sender.port = mdev->port; subs.dest = dp->addr; subs.flags = SNDRV_SEQ_PORT_SUBS_TIMESTAMP; subs.queue = dp->queue; /* queue for timestamps */ if (snd_seq_kernel_client_ctl(dp->cseq, SNDRV_SEQ_IOCTL_SUBSCRIBE_PORT, &subs) >= 0) mdev->opened |= PERM_READ; } if (!mdev->opened) return -ENXIO; mdev->devinfo = dp; return 0; } /* * close the midi device if already opened */ int snd_seq_oss_midi_close(struct seq_oss_devinfo *dp, int dev) { struct seq_oss_midi *mdev __free(seq_oss_midi) = NULL; struct snd_seq_port_subscribe subs; mdev = get_mididev(dp, dev); if (!mdev) return -ENODEV; guard(mutex)(&mdev->open_mutex); if (!mdev->opened || mdev->devinfo != dp) return 0; memset(&subs, 0, sizeof(subs)); if (mdev->opened & PERM_WRITE) { subs.sender = dp->addr; subs.dest.client = mdev->client; subs.dest.port = mdev->port; snd_seq_kernel_client_ctl(dp->cseq, SNDRV_SEQ_IOCTL_UNSUBSCRIBE_PORT, &subs); } if (mdev->opened & PERM_READ) { subs.sender.client = mdev->client; subs.sender.port = mdev->port; subs.dest = dp->addr; snd_seq_kernel_client_ctl(dp->cseq, SNDRV_SEQ_IOCTL_UNSUBSCRIBE_PORT, &subs); } mdev->opened = 0; mdev->devinfo = NULL; return 0; } /* * change seq capability flags to file mode flags */ int snd_seq_oss_midi_filemode(struct seq_oss_devinfo *dp, int dev) { struct seq_oss_midi *mdev __free(seq_oss_midi) = NULL; int mode; mdev = get_mididev(dp, dev); if (!mdev) return 0; mode = 0; if (mdev->opened & PERM_WRITE) mode |= SNDRV_SEQ_OSS_FILE_WRITE; if (mdev->opened & PERM_READ) mode |= SNDRV_SEQ_OSS_FILE_READ; return mode; } /* * reset the midi device and close it: * so far, only close the device. */ void snd_seq_oss_midi_reset(struct seq_oss_devinfo *dp, int dev) { struct seq_oss_midi *mdev __free(seq_oss_midi) = NULL; mdev = get_mididev(dp, dev); if (!mdev) return; if (!mdev->opened) return; if (mdev->opened & PERM_WRITE) { struct snd_seq_event ev; int c; memset(&ev, 0, sizeof(ev)); ev.dest.client = mdev->client; ev.dest.port = mdev->port; ev.queue = dp->queue; ev.source.port = dp->port; if (dp->seq_mode == SNDRV_SEQ_OSS_MODE_SYNTH) { ev.type = SNDRV_SEQ_EVENT_SENSING; snd_seq_oss_dispatch(dp, &ev, 0, 0); } for (c = 0; c < 16; c++) { ev.type = SNDRV_SEQ_EVENT_CONTROLLER; ev.data.control.channel = c; ev.data.control.param = MIDI_CTL_ALL_NOTES_OFF; snd_seq_oss_dispatch(dp, &ev, 0, 0); if (dp->seq_mode == SNDRV_SEQ_OSS_MODE_MUSIC) { ev.data.control.param = MIDI_CTL_RESET_CONTROLLERS; snd_seq_oss_dispatch(dp, &ev, 0, 0); ev.type = SNDRV_SEQ_EVENT_PITCHBEND; ev.data.control.value = 0; snd_seq_oss_dispatch(dp, &ev, 0, 0); } } } // snd_seq_oss_midi_close(dp, dev); } /* * get client/port of the specified MIDI device */ void snd_seq_oss_midi_get_addr(struct seq_oss_devinfo *dp, int dev, struct snd_seq_addr *addr) { struct seq_oss_midi *mdev __free(seq_oss_midi) = NULL; mdev = get_mididev(dp, dev); if (!mdev) return; addr->client = mdev->client; addr->port = mdev->port; } /* * input callback - this can be atomic */ int snd_seq_oss_midi_input(struct snd_seq_event *ev, int direct, void *private_data) { struct seq_oss_devinfo *dp = (struct seq_oss_devinfo *)private_data; struct seq_oss_midi *mdev __free(seq_oss_midi) = NULL; if (dp->readq == NULL) return 0; mdev = find_slot(ev->source.client, ev->source.port); if (!mdev) return 0; if (!(mdev->opened & PERM_READ)) return 0; if (dp->seq_mode == SNDRV_SEQ_OSS_MODE_MUSIC) return send_synth_event(dp, ev, mdev->seq_device); else return send_midi_event(dp, ev, mdev); } /* * convert ALSA sequencer event to OSS synth event */ static int send_synth_event(struct seq_oss_devinfo *dp, struct snd_seq_event *ev, int dev) { union evrec ossev; memset(&ossev, 0, sizeof(ossev)); switch (ev->type) { case SNDRV_SEQ_EVENT_NOTEON: ossev.v.cmd = MIDI_NOTEON; break; case SNDRV_SEQ_EVENT_NOTEOFF: ossev.v.cmd = MIDI_NOTEOFF; break; case SNDRV_SEQ_EVENT_KEYPRESS: ossev.v.cmd = MIDI_KEY_PRESSURE; break; case SNDRV_SEQ_EVENT_CONTROLLER: ossev.l.cmd = MIDI_CTL_CHANGE; break; case SNDRV_SEQ_EVENT_PGMCHANGE: ossev.l.cmd = MIDI_PGM_CHANGE; break; case SNDRV_SEQ_EVENT_CHANPRESS: ossev.l.cmd = MIDI_CHN_PRESSURE; break; case SNDRV_SEQ_EVENT_PITCHBEND: ossev.l.cmd = MIDI_PITCH_BEND; break; default: return 0; /* not supported */ } ossev.v.dev = dev; switch (ev->type) { case SNDRV_SEQ_EVENT_NOTEON: case SNDRV_SEQ_EVENT_NOTEOFF: case SNDRV_SEQ_EVENT_KEYPRESS: ossev.v.code = EV_CHN_VOICE; ossev.v.note = ev->data.note.note; ossev.v.parm = ev->data.note.velocity; ossev.v.chn = ev->data.note.channel; break; case SNDRV_SEQ_EVENT_CONTROLLER: case SNDRV_SEQ_EVENT_PGMCHANGE: case SNDRV_SEQ_EVENT_CHANPRESS: ossev.l.code = EV_CHN_COMMON; ossev.l.p1 = ev->data.control.param; ossev.l.val = ev->data.control.value; ossev.l.chn = ev->data.control.channel; break; case SNDRV_SEQ_EVENT_PITCHBEND: ossev.l.code = EV_CHN_COMMON; ossev.l.val = ev->data.control.value + 8192; ossev.l.chn = ev->data.control.channel; break; } snd_seq_oss_readq_put_timestamp(dp->readq, ev->time.tick, dp->seq_mode); snd_seq_oss_readq_put_event(dp->readq, &ossev); return 0; } /* * decode event and send MIDI bytes to read queue */ static int send_midi_event(struct seq_oss_devinfo *dp, struct snd_seq_event *ev, struct seq_oss_midi *mdev) { char msg[32]; int len; snd_seq_oss_readq_put_timestamp(dp->readq, ev->time.tick, dp->seq_mode); if (!dp->timer->running) len = snd_seq_oss_timer_start(dp->timer); if (ev->type == SNDRV_SEQ_EVENT_SYSEX) { snd_seq_oss_readq_sysex(dp->readq, mdev->seq_device, ev); snd_midi_event_reset_decode(mdev->coder); } else { len = snd_midi_event_decode(mdev->coder, msg, sizeof(msg), ev); if (len > 0) snd_seq_oss_readq_puts(dp->readq, mdev->seq_device, msg, len); } return 0; } /* * dump midi data * return 0 : enqueued * non-zero : invalid - ignored */ int snd_seq_oss_midi_putc(struct seq_oss_devinfo *dp, int dev, unsigned char c, struct snd_seq_event *ev) { struct seq_oss_midi *mdev __free(seq_oss_midi) = NULL; mdev = get_mididev(dp, dev); if (!mdev) return -ENODEV; if (snd_midi_event_encode_byte(mdev->coder, c, ev)) { snd_seq_oss_fill_addr(dp, ev, mdev->client, mdev->port); return 0; } return -EINVAL; } /* * create OSS compatible midi_info record */ int snd_seq_oss_midi_make_info(struct seq_oss_devinfo *dp, int dev, struct midi_info *inf) { struct seq_oss_midi *mdev __free(seq_oss_midi) = NULL; mdev = get_mididev(dp, dev); if (!mdev) return -ENXIO; inf->device = dev; inf->dev_type = 0; /* FIXME: ?? */ inf->capabilities = 0; /* FIXME: ?? */ strscpy(inf->name, mdev->name, sizeof(inf->name)); return 0; } #ifdef CONFIG_SND_PROC_FS /* * proc interface */ static char * capmode_str(int val) { val &= PERM_READ|PERM_WRITE; if (val == (PERM_READ|PERM_WRITE)) return "read/write"; else if (val == PERM_READ) return "read"; else if (val == PERM_WRITE) return "write"; else return "none"; } void snd_seq_oss_midi_info_read(struct snd_info_buffer *buf) { int i; snd_iprintf(buf, "\nNumber of MIDI devices: %d\n", max_midi_devs); for (i = 0; i < max_midi_devs; i++) { struct seq_oss_midi *mdev __free(seq_oss_midi) = NULL; snd_iprintf(buf, "\nmidi %d: ", i); mdev = get_mdev(i); if (mdev == NULL) { snd_iprintf(buf, "*empty*\n"); continue; } snd_iprintf(buf, "[%s] ALSA port %d:%d\n", mdev->name, mdev->client, mdev->port); snd_iprintf(buf, " capability %s / opened %s\n", capmode_str(mdev->flags), capmode_str(mdev->opened)); } } #endif /* CONFIG_SND_PROC_FS */
4 4 4 4 4 2 2 2 9 9 9 5 5 5 5 9 9 9 9 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 // SPDX-License-Identifier: GPL-2.0 /* * arch-independent dma-mapping routines * * Copyright (c) 2006 SUSE Linux Products GmbH * Copyright (c) 2006 Tejun Heo <teheo@suse.de> */ #include <linux/memblock.h> /* for max_pfn */ #include <linux/acpi.h> #include <linux/dma-map-ops.h> #include <linux/export.h> #include <linux/gfp.h> #include <linux/iommu-dma.h> #include <linux/kmsan.h> #include <linux/of_device.h> #include <linux/slab.h> #include <linux/vmalloc.h> #include "debug.h" #include "direct.h" #define CREATE_TRACE_POINTS #include <trace/events/dma.h> #if defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_DEVICE) || \ defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU) || \ defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU_ALL) bool dma_default_coherent = IS_ENABLED(CONFIG_ARCH_DMA_DEFAULT_COHERENT); #endif /* * Managed DMA API */ struct dma_devres { size_t size; void *vaddr; dma_addr_t dma_handle; unsigned long attrs; }; static void dmam_release(struct device *dev, void *res) { struct dma_devres *this = res; dma_free_attrs(dev, this->size, this->vaddr, this->dma_handle, this->attrs); } static int dmam_match(struct device *dev, void *res, void *match_data) { struct dma_devres *this = res, *match = match_data; if (this->vaddr == match->vaddr) { WARN_ON(this->size != match->size || this->dma_handle != match->dma_handle); return 1; } return 0; } /** * dmam_free_coherent - Managed dma_free_coherent() * @dev: Device to free coherent memory for * @size: Size of allocation * @vaddr: Virtual address of the memory to free * @dma_handle: DMA handle of the memory to free * * Managed dma_free_coherent(). */ void dmam_free_coherent(struct device *dev, size_t size, void *vaddr, dma_addr_t dma_handle) { struct dma_devres match_data = { size, vaddr, dma_handle }; WARN_ON(devres_destroy(dev, dmam_release, dmam_match, &match_data)); dma_free_coherent(dev, size, vaddr, dma_handle); } EXPORT_SYMBOL(dmam_free_coherent); /** * dmam_alloc_attrs - Managed dma_alloc_attrs() * @dev: Device to allocate non_coherent memory for * @size: Size of allocation * @dma_handle: Out argument for allocated DMA handle * @gfp: Allocation flags * @attrs: Flags in the DMA_ATTR_* namespace. * * Managed dma_alloc_attrs(). Memory allocated using this function will be * automatically released on driver detach. * * RETURNS: * Pointer to allocated memory on success, NULL on failure. */ void *dmam_alloc_attrs(struct device *dev, size_t size, dma_addr_t *dma_handle, gfp_t gfp, unsigned long attrs) { struct dma_devres *dr; void *vaddr; dr = devres_alloc(dmam_release, sizeof(*dr), gfp); if (!dr) return NULL; vaddr = dma_alloc_attrs(dev, size, dma_handle, gfp, attrs); if (!vaddr) { devres_free(dr); return NULL; } dr->vaddr = vaddr; dr->dma_handle = *dma_handle; dr->size = size; dr->attrs = attrs; devres_add(dev, dr); return vaddr; } EXPORT_SYMBOL(dmam_alloc_attrs); static bool dma_go_direct(struct device *dev, dma_addr_t mask, const struct dma_map_ops *ops) { if (use_dma_iommu(dev)) return false; if (likely(!ops)) return true; #ifdef CONFIG_DMA_OPS_BYPASS if (dev->dma_ops_bypass) return min_not_zero(mask, dev->bus_dma_limit) >= dma_direct_get_required_mask(dev); #endif return false; } /* * Check if the devices uses a direct mapping for streaming DMA operations. * This allows IOMMU drivers to set a bypass mode if the DMA mask is large * enough. */ static inline bool dma_alloc_direct(struct device *dev, const struct dma_map_ops *ops) { return dma_go_direct(dev, dev->coherent_dma_mask, ops); } static inline bool dma_map_direct(struct device *dev, const struct dma_map_ops *ops) { return dma_go_direct(dev, *dev->dma_mask, ops); } dma_addr_t dma_map_phys(struct device *dev, phys_addr_t phys, size_t size, enum dma_data_direction dir, unsigned long attrs) { const struct dma_map_ops *ops = get_dma_ops(dev); bool is_mmio = attrs & DMA_ATTR_MMIO; dma_addr_t addr; BUG_ON(!valid_dma_direction(dir)); if (WARN_ON_ONCE(!dev->dma_mask)) return DMA_MAPPING_ERROR; if (dma_map_direct(dev, ops) || (!is_mmio && arch_dma_map_phys_direct(dev, phys + size))) addr = dma_direct_map_phys(dev, phys, size, dir, attrs); else if (use_dma_iommu(dev)) addr = iommu_dma_map_phys(dev, phys, size, dir, attrs); else if (is_mmio) { if (!ops->map_resource) return DMA_MAPPING_ERROR; addr = ops->map_resource(dev, phys, size, dir, attrs); } else { struct page *page = phys_to_page(phys); size_t offset = offset_in_page(phys); /* * The dma_ops API contract for ops->map_page() requires * kmappable memory, while ops->map_resource() does not. */ addr = ops->map_page(dev, page, offset, size, dir, attrs); } if (!is_mmio) kmsan_handle_dma(phys, size, dir); trace_dma_map_phys(dev, phys, addr, size, dir, attrs); debug_dma_map_phys(dev, phys, size, dir, addr, attrs); return addr; } EXPORT_SYMBOL_GPL(dma_map_phys); dma_addr_t dma_map_page_attrs(struct device *dev, struct page *page, size_t offset, size_t size, enum dma_data_direction dir, unsigned long attrs) { phys_addr_t phys = page_to_phys(page) + offset; if (unlikely(attrs & DMA_ATTR_MMIO)) return DMA_MAPPING_ERROR; if (IS_ENABLED(CONFIG_DMA_API_DEBUG) && WARN_ON_ONCE(is_zone_device_page(page))) return DMA_MAPPING_ERROR; return dma_map_phys(dev, phys, size, dir, attrs); } EXPORT_SYMBOL(dma_map_page_attrs); void dma_unmap_phys(struct device *dev, dma_addr_t addr, size_t size, enum dma_data_direction dir, unsigned long attrs) { const struct dma_map_ops *ops = get_dma_ops(dev); bool is_mmio = attrs & DMA_ATTR_MMIO; BUG_ON(!valid_dma_direction(dir)); if (dma_map_direct(dev, ops) || (!is_mmio && arch_dma_unmap_phys_direct(dev, addr + size))) dma_direct_unmap_phys(dev, addr, size, dir, attrs); else if (use_dma_iommu(dev)) iommu_dma_unmap_phys(dev, addr, size, dir, attrs); else if (is_mmio) { if (ops->unmap_resource) ops->unmap_resource(dev, addr, size, dir, attrs); } else ops->unmap_page(dev, addr, size, dir, attrs); trace_dma_unmap_phys(dev, addr, size, dir, attrs); debug_dma_unmap_phys(dev, addr, size, dir); } EXPORT_SYMBOL_GPL(dma_unmap_phys); void dma_unmap_page_attrs(struct device *dev, dma_addr_t addr, size_t size, enum dma_data_direction dir, unsigned long attrs) { if (unlikely(attrs & DMA_ATTR_MMIO)) return; dma_unmap_phys(dev, addr, size, dir, attrs); } EXPORT_SYMBOL(dma_unmap_page_attrs); static int __dma_map_sg_attrs(struct device *dev, struct scatterlist *sg, int nents, enum dma_data_direction dir, unsigned long attrs) { const struct dma_map_ops *ops = get_dma_ops(dev); int ents; BUG_ON(!valid_dma_direction(dir)); if (WARN_ON_ONCE(!dev->dma_mask)) return 0; if (dma_map_direct(dev, ops) || arch_dma_map_sg_direct(dev, sg, nents)) ents = dma_direct_map_sg(dev, sg, nents, dir, attrs); else if (use_dma_iommu(dev)) ents = iommu_dma_map_sg(dev, sg, nents, dir, attrs); else ents = ops->map_sg(dev, sg, nents, dir, attrs); if (ents > 0) { kmsan_handle_dma_sg(sg, nents, dir); trace_dma_map_sg(dev, sg, nents, ents, dir, attrs); debug_dma_map_sg(dev, sg, nents, ents, dir, attrs); } else if (WARN_ON_ONCE(ents != -EINVAL && ents != -ENOMEM && ents != -EIO && ents != -EREMOTEIO)) { trace_dma_map_sg_err(dev, sg, nents, ents, dir, attrs); return -EIO; } return ents; } /** * dma_map_sg_attrs - Map the given buffer for DMA * @dev: The device for which to perform the DMA operation * @sg: The sg_table object describing the buffer * @nents: Number of entries to map * @dir: DMA direction * @attrs: Optional DMA attributes for the map operation * * Maps a buffer described by a scatterlist passed in the sg argument with * nents segments for the @dir DMA operation by the @dev device. * * Returns the number of mapped entries (which can be less than nents) * on success. Zero is returned for any error. * * dma_unmap_sg_attrs() should be used to unmap the buffer with the * original sg and original nents (not the value returned by this funciton). */ unsigned int dma_map_sg_attrs(struct device *dev, struct scatterlist *sg, int nents, enum dma_data_direction dir, unsigned long attrs) { int ret; ret = __dma_map_sg_attrs(dev, sg, nents, dir, attrs); if (ret < 0) return 0; return ret; } EXPORT_SYMBOL(dma_map_sg_attrs); /** * dma_map_sgtable - Map the given buffer for DMA * @dev: The device for which to perform the DMA operation * @sgt: The sg_table object describing the buffer * @dir: DMA direction * @attrs: Optional DMA attributes for the map operation * * Maps a buffer described by a scatterlist stored in the given sg_table * object for the @dir DMA operation by the @dev device. After success, the * ownership for the buffer is transferred to the DMA domain. One has to * call dma_sync_sgtable_for_cpu() or dma_unmap_sgtable() to move the * ownership of the buffer back to the CPU domain before touching the * buffer by the CPU. * * Returns 0 on success or a negative error code on error. The following * error codes are supported with the given meaning: * * -EINVAL An invalid argument, unaligned access or other error * in usage. Will not succeed if retried. * -ENOMEM Insufficient resources (like memory or IOVA space) to * complete the mapping. Should succeed if retried later. * -EIO Legacy error code with an unknown meaning. eg. this is * returned if a lower level call returned * DMA_MAPPING_ERROR. * -EREMOTEIO The DMA device cannot access P2PDMA memory specified * in the sg_table. This will not succeed if retried. */ int dma_map_sgtable(struct device *dev, struct sg_table *sgt, enum dma_data_direction dir, unsigned long attrs) { int nents; nents = __dma_map_sg_attrs(dev, sgt->sgl, sgt->orig_nents, dir, attrs); if (nents < 0) return nents; sgt->nents = nents; return 0; } EXPORT_SYMBOL_GPL(dma_map_sgtable); void dma_unmap_sg_attrs(struct device *dev, struct scatterlist *sg, int nents, enum dma_data_direction dir, unsigned long attrs) { const struct dma_map_ops *ops = get_dma_ops(dev); BUG_ON(!valid_dma_direction(dir)); trace_dma_unmap_sg(dev, sg, nents, dir, attrs); debug_dma_unmap_sg(dev, sg, nents, dir); if (dma_map_direct(dev, ops) || arch_dma_unmap_sg_direct(dev, sg, nents)) dma_direct_unmap_sg(dev, sg, nents, dir, attrs); else if (use_dma_iommu(dev)) iommu_dma_unmap_sg(dev, sg, nents, dir, attrs); else if (ops->unmap_sg) ops->unmap_sg(dev, sg, nents, dir, attrs); } EXPORT_SYMBOL(dma_unmap_sg_attrs); dma_addr_t dma_map_resource(struct device *dev, phys_addr_t phys_addr, size_t size, enum dma_data_direction dir, unsigned long attrs) { if (IS_ENABLED(CONFIG_DMA_API_DEBUG) && WARN_ON_ONCE(pfn_valid(PHYS_PFN(phys_addr)))) return DMA_MAPPING_ERROR; return dma_map_phys(dev, phys_addr, size, dir, attrs | DMA_ATTR_MMIO); } EXPORT_SYMBOL(dma_map_resource); void dma_unmap_resource(struct device *dev, dma_addr_t addr, size_t size, enum dma_data_direction dir, unsigned long attrs) { dma_unmap_phys(dev, addr, size, dir, attrs | DMA_ATTR_MMIO); } EXPORT_SYMBOL(dma_unmap_resource); #ifdef CONFIG_DMA_NEED_SYNC void __dma_sync_single_for_cpu(struct device *dev, dma_addr_t addr, size_t size, enum dma_data_direction dir) { const struct dma_map_ops *ops = get_dma_ops(dev); BUG_ON(!valid_dma_direction(dir)); if (dma_map_direct(dev, ops)) dma_direct_sync_single_for_cpu(dev, addr, size, dir); else if (use_dma_iommu(dev)) iommu_dma_sync_single_for_cpu(dev, addr, size, dir); else if (ops->sync_single_for_cpu) ops->sync_single_for_cpu(dev, addr, size, dir); trace_dma_sync_single_for_cpu(dev, addr, size, dir); debug_dma_sync_single_for_cpu(dev, addr, size, dir); } EXPORT_SYMBOL(__dma_sync_single_for_cpu); void __dma_sync_single_for_device(struct device *dev, dma_addr_t addr, size_t size, enum dma_data_direction dir) { const struct dma_map_ops *ops = get_dma_ops(dev); BUG_ON(!valid_dma_direction(dir)); if (dma_map_direct(dev, ops)) dma_direct_sync_single_for_device(dev, addr, size, dir); else if (use_dma_iommu(dev)) iommu_dma_sync_single_for_device(dev, addr, size, dir); else if (ops->sync_single_for_device) ops->sync_single_for_device(dev, addr, size, dir); trace_dma_sync_single_for_device(dev, addr, size, dir); debug_dma_sync_single_for_device(dev, addr, size, dir); } EXPORT_SYMBOL(__dma_sync_single_for_device); void __dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg, int nelems, enum dma_data_direction dir) { const struct dma_map_ops *ops = get_dma_ops(dev); BUG_ON(!valid_dma_direction(dir)); if (dma_map_direct(dev, ops)) dma_direct_sync_sg_for_cpu(dev, sg, nelems, dir); else if (use_dma_iommu(dev)) iommu_dma_sync_sg_for_cpu(dev, sg, nelems, dir); else if (ops->sync_sg_for_cpu) ops->sync_sg_for_cpu(dev, sg, nelems, dir); trace_dma_sync_sg_for_cpu(dev, sg, nelems, dir); debug_dma_sync_sg_for_cpu(dev, sg, nelems, dir); } EXPORT_SYMBOL(__dma_sync_sg_for_cpu); void __dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg, int nelems, enum dma_data_direction dir) { const struct dma_map_ops *ops = get_dma_ops(dev); BUG_ON(!valid_dma_direction(dir)); if (dma_map_direct(dev, ops)) dma_direct_sync_sg_for_device(dev, sg, nelems, dir); else if (use_dma_iommu(dev)) iommu_dma_sync_sg_for_device(dev, sg, nelems, dir); else if (ops->sync_sg_for_device) ops->sync_sg_for_device(dev, sg, nelems, dir); trace_dma_sync_sg_for_device(dev, sg, nelems, dir); debug_dma_sync_sg_for_device(dev, sg, nelems, dir); } EXPORT_SYMBOL(__dma_sync_sg_for_device); bool __dma_need_sync(struct device *dev, dma_addr_t dma_addr) { const struct dma_map_ops *ops = get_dma_ops(dev); if (dma_map_direct(dev, ops)) /* * dma_skip_sync could've been reset on first SWIOTLB buffer * mapping, but @dma_addr is not necessary an SWIOTLB buffer. * In this case, fall back to more granular check. */ return dma_direct_need_sync(dev, dma_addr); return true; } EXPORT_SYMBOL_GPL(__dma_need_sync); /** * dma_need_unmap - does this device need dma_unmap_* operations * @dev: device to check * * If this function returns %false, drivers can skip calling dma_unmap_* after * finishing an I/O. This function must be called after all mappings that might * need to be unmapped have been performed. */ bool dma_need_unmap(struct device *dev) { if (!dma_map_direct(dev, get_dma_ops(dev))) return true; if (!dev->dma_skip_sync) return true; return IS_ENABLED(CONFIG_DMA_API_DEBUG); } EXPORT_SYMBOL_GPL(dma_need_unmap); static void dma_setup_need_sync(struct device *dev) { const struct dma_map_ops *ops = get_dma_ops(dev); if (dma_map_direct(dev, ops) || use_dma_iommu(dev)) /* * dma_skip_sync will be reset to %false on first SWIOTLB buffer * mapping, if any. During the device initialization, it's * enough to check only for the DMA coherence. */ dev->dma_skip_sync = dev_is_dma_coherent(dev); else if (!ops->sync_single_for_device && !ops->sync_single_for_cpu && !ops->sync_sg_for_device && !ops->sync_sg_for_cpu) /* * Synchronization is not possible when none of DMA sync ops * is set. */ dev->dma_skip_sync = true; else dev->dma_skip_sync = false; } #else /* !CONFIG_DMA_NEED_SYNC */ static inline void dma_setup_need_sync(struct device *dev) { } #endif /* !CONFIG_DMA_NEED_SYNC */ /* * The whole dma_get_sgtable() idea is fundamentally unsafe - it seems * that the intention is to allow exporting memory allocated via the * coherent DMA APIs through the dma_buf API, which only accepts a * scattertable. This presents a couple of problems: * 1. Not all memory allocated via the coherent DMA APIs is backed by * a struct page * 2. Passing coherent DMA memory into the streaming APIs is not allowed * as we will try to flush the memory through a different alias to that * actually being used (and the flushes are redundant.) */ int dma_get_sgtable_attrs(struct device *dev, struct sg_table *sgt, void *cpu_addr, dma_addr_t dma_addr, size_t size, unsigned long attrs) { const struct dma_map_ops *ops = get_dma_ops(dev); if (dma_alloc_direct(dev, ops)) return dma_direct_get_sgtable(dev, sgt, cpu_addr, dma_addr, size, attrs); if (use_dma_iommu(dev)) return iommu_dma_get_sgtable(dev, sgt, cpu_addr, dma_addr, size, attrs); if (!ops->get_sgtable) return -ENXIO; return ops->get_sgtable(dev, sgt, cpu_addr, dma_addr, size, attrs); } EXPORT_SYMBOL(dma_get_sgtable_attrs); #ifdef CONFIG_MMU /* * Return the page attributes used for mapping dma_alloc_* memory, either in * kernel space if remapping is needed, or to userspace through dma_mmap_*. */ pgprot_t dma_pgprot(struct device *dev, pgprot_t prot, unsigned long attrs) { if (dev_is_dma_coherent(dev)) return prot; #ifdef CONFIG_ARCH_HAS_DMA_WRITE_COMBINE if (attrs & DMA_ATTR_WRITE_COMBINE) return pgprot_writecombine(prot); #endif return pgprot_dmacoherent(prot); } #endif /* CONFIG_MMU */ /** * dma_can_mmap - check if a given device supports dma_mmap_* * @dev: device to check * * Returns %true if @dev supports dma_mmap_coherent() and dma_mmap_attrs() to * map DMA allocations to userspace. */ bool dma_can_mmap(struct device *dev) { const struct dma_map_ops *ops = get_dma_ops(dev); if (dma_alloc_direct(dev, ops)) return dma_direct_can_mmap(dev); if (use_dma_iommu(dev)) return true; return ops->mmap != NULL; } EXPORT_SYMBOL_GPL(dma_can_mmap); /** * dma_mmap_attrs - map a coherent DMA allocation into user space * @dev: valid struct device pointer, or NULL for ISA and EISA-like devices * @vma: vm_area_struct describing requested user mapping * @cpu_addr: kernel CPU-view address returned from dma_alloc_attrs * @dma_addr: device-view address returned from dma_alloc_attrs * @size: size of memory originally requested in dma_alloc_attrs * @attrs: attributes of mapping properties requested in dma_alloc_attrs * * Map a coherent DMA buffer previously allocated by dma_alloc_attrs into user * space. The coherent DMA buffer must not be freed by the driver until the * user space mapping has been released. */ int dma_mmap_attrs(struct device *dev, struct vm_area_struct *vma, void *cpu_addr, dma_addr_t dma_addr, size_t size, unsigned long attrs) { const struct dma_map_ops *ops = get_dma_ops(dev); if (dma_alloc_direct(dev, ops)) return dma_direct_mmap(dev, vma, cpu_addr, dma_addr, size, attrs); if (use_dma_iommu(dev)) return iommu_dma_mmap(dev, vma, cpu_addr, dma_addr, size, attrs); if (!ops->mmap) return -ENXIO; return ops->mmap(dev, vma, cpu_addr, dma_addr, size, attrs); } EXPORT_SYMBOL(dma_mmap_attrs); u64 dma_get_required_mask(struct device *dev) { const struct dma_map_ops *ops = get_dma_ops(dev); if (dma_alloc_direct(dev, ops)) return dma_direct_get_required_mask(dev); if (use_dma_iommu(dev)) return DMA_BIT_MASK(32); if (ops->get_required_mask) return ops->get_required_mask(dev); /* * We require every DMA ops implementation to at least support a 32-bit * DMA mask (and use bounce buffering if that isn't supported in * hardware). As the direct mapping code has its own routine to * actually report an optimal mask we default to 32-bit here as that * is the right thing for most IOMMUs, and at least not actively * harmful in general. */ return DMA_BIT_MASK(32); } EXPORT_SYMBOL_GPL(dma_get_required_mask); void *dma_alloc_attrs(struct device *dev, size_t size, dma_addr_t *dma_handle, gfp_t flag, unsigned long attrs) { const struct dma_map_ops *ops = get_dma_ops(dev); void *cpu_addr; WARN_ON_ONCE(!dev->coherent_dma_mask); /* * DMA allocations can never be turned back into a page pointer, so * requesting compound pages doesn't make sense (and can't even be * supported at all by various backends). */ if (WARN_ON_ONCE(flag & __GFP_COMP)) return NULL; if (dma_alloc_from_dev_coherent(dev, size, dma_handle, &cpu_addr)) { trace_dma_alloc(dev, cpu_addr, *dma_handle, size, DMA_BIDIRECTIONAL, flag, attrs); return cpu_addr; } /* let the implementation decide on the zone to allocate from: */ flag &= ~(__GFP_DMA | __GFP_DMA32 | __GFP_HIGHMEM); if (dma_alloc_direct(dev, ops)) { cpu_addr = dma_direct_alloc(dev, size, dma_handle, flag, attrs); } else if (use_dma_iommu(dev)) { cpu_addr = iommu_dma_alloc(dev, size, dma_handle, flag, attrs); } else if (ops->alloc) { cpu_addr = ops->alloc(dev, size, dma_handle, flag, attrs); } else { trace_dma_alloc(dev, NULL, 0, size, DMA_BIDIRECTIONAL, flag, attrs); return NULL; } trace_dma_alloc(dev, cpu_addr, *dma_handle, size, DMA_BIDIRECTIONAL, flag, attrs); debug_dma_alloc_coherent(dev, size, *dma_handle, cpu_addr, attrs); return cpu_addr; } EXPORT_SYMBOL(dma_alloc_attrs); void dma_free_attrs(struct device *dev, size_t size, void *cpu_addr, dma_addr_t dma_handle, unsigned long attrs) { const struct dma_map_ops *ops = get_dma_ops(dev); if (dma_release_from_dev_coherent(dev, get_order(size), cpu_addr)) return; /* * On non-coherent platforms which implement DMA-coherent buffers via * non-cacheable remaps, ops->free() may call vunmap(). Thus getting * this far in IRQ context is a) at risk of a BUG_ON() or trying to * sleep on some machines, and b) an indication that the driver is * probably misusing the coherent API anyway. */ WARN_ON(irqs_disabled()); trace_dma_free(dev, cpu_addr, dma_handle, size, DMA_BIDIRECTIONAL, attrs); if (!cpu_addr) return; debug_dma_free_coherent(dev, size, cpu_addr, dma_handle); if (dma_alloc_direct(dev, ops)) dma_direct_free(dev, size, cpu_addr, dma_handle, attrs); else if (use_dma_iommu(dev)) iommu_dma_free(dev, size, cpu_addr, dma_handle, attrs); else if (ops->free) ops->free(dev, size, cpu_addr, dma_handle, attrs); } EXPORT_SYMBOL(dma_free_attrs); static struct page *__dma_alloc_pages(struct device *dev, size_t size, dma_addr_t *dma_handle, enum dma_data_direction dir, gfp_t gfp) { const struct dma_map_ops *ops = get_dma_ops(dev); if (WARN_ON_ONCE(!dev->coherent_dma_mask)) return NULL; if (WARN_ON_ONCE(gfp & (__GFP_DMA | __GFP_DMA32 | __GFP_HIGHMEM))) return NULL; if (WARN_ON_ONCE(gfp & __GFP_COMP)) return NULL; size = PAGE_ALIGN(size); if (dma_alloc_direct(dev, ops)) return dma_direct_alloc_pages(dev, size, dma_handle, dir, gfp); if (use_dma_iommu(dev)) return dma_common_alloc_pages(dev, size, dma_handle, dir, gfp); if (!ops->alloc_pages_op) return NULL; return ops->alloc_pages_op(dev, size, dma_handle, dir, gfp); } struct page *dma_alloc_pages(struct device *dev, size_t size, dma_addr_t *dma_handle, enum dma_data_direction dir, gfp_t gfp) { struct page *page = __dma_alloc_pages(dev, size, dma_handle, dir, gfp); if (page) { trace_dma_alloc_pages(dev, page_to_virt(page), *dma_handle, size, dir, gfp, 0); debug_dma_alloc_pages(dev, page, size, dir, *dma_handle, 0); } else { trace_dma_alloc_pages(dev, NULL, 0, size, dir, gfp, 0); } return page; } EXPORT_SYMBOL_GPL(dma_alloc_pages); static void __dma_free_pages(struct device *dev, size_t size, struct page *page, dma_addr_t dma_handle, enum dma_data_direction dir) { const struct dma_map_ops *ops = get_dma_ops(dev); size = PAGE_ALIGN(size); if (dma_alloc_direct(dev, ops)) dma_direct_free_pages(dev, size, page, dma_handle, dir); else if (use_dma_iommu(dev)) dma_common_free_pages(dev, size, page, dma_handle, dir); else if (ops->free_pages) ops->free_pages(dev, size, page, dma_handle, dir); } void dma_free_pages(struct device *dev, size_t size, struct page *page, dma_addr_t dma_handle, enum dma_data_direction dir) { trace_dma_free_pages(dev, page_to_virt(page), dma_handle, size, dir, 0); debug_dma_free_pages(dev, page, size, dir, dma_handle); __dma_free_pages(dev, size, page, dma_handle, dir); } EXPORT_SYMBOL_GPL(dma_free_pages); int dma_mmap_pages(struct device *dev, struct vm_area_struct *vma, size_t size, struct page *page) { unsigned long count = PAGE_ALIGN(size) >> PAGE_SHIFT; if (vma->vm_pgoff >= count || vma_pages(vma) > count - vma->vm_pgoff) return -ENXIO; return remap_pfn_range(vma, vma->vm_start, page_to_pfn(page) + vma->vm_pgoff, vma_pages(vma) << PAGE_SHIFT, vma->vm_page_prot); } EXPORT_SYMBOL_GPL(dma_mmap_pages); static struct sg_table *alloc_single_sgt(struct device *dev, size_t size, enum dma_data_direction dir, gfp_t gfp) { struct sg_table *sgt; struct page *page; sgt = kmalloc(sizeof(*sgt), gfp); if (!sgt) return NULL; if (sg_alloc_table(sgt, 1, gfp)) goto out_free_sgt; page = __dma_alloc_pages(dev, size, &sgt->sgl->dma_address, dir, gfp); if (!page) goto out_free_table; sg_set_page(sgt->sgl, page, PAGE_ALIGN(size), 0); sg_dma_len(sgt->sgl) = sgt->sgl->length; return sgt; out_free_table: sg_free_table(sgt); out_free_sgt: kfree(sgt); return NULL; } struct sg_table *dma_alloc_noncontiguous(struct device *dev, size_t size, enum dma_data_direction dir, gfp_t gfp, unsigned long attrs) { struct sg_table *sgt; if (WARN_ON_ONCE(attrs & ~DMA_ATTR_ALLOC_SINGLE_PAGES)) return NULL; if (WARN_ON_ONCE(gfp & __GFP_COMP)) return NULL; if (use_dma_iommu(dev)) sgt = iommu_dma_alloc_noncontiguous(dev, size, dir, gfp, attrs); else sgt = alloc_single_sgt(dev, size, dir, gfp); if (sgt) { sgt->nents = 1; trace_dma_alloc_sgt(dev, sgt, size, dir, gfp, attrs); debug_dma_map_sg(dev, sgt->sgl, sgt->orig_nents, 1, dir, attrs); } else { trace_dma_alloc_sgt_err(dev, NULL, 0, size, dir, gfp, attrs); } return sgt; } EXPORT_SYMBOL_GPL(dma_alloc_noncontiguous); static void free_single_sgt(struct device *dev, size_t size, struct sg_table *sgt, enum dma_data_direction dir) { __dma_free_pages(dev, size, sg_page(sgt->sgl), sgt->sgl->dma_address, dir); sg_free_table(sgt); kfree(sgt); } void dma_free_noncontiguous(struct device *dev, size_t size, struct sg_table *sgt, enum dma_data_direction dir) { trace_dma_free_sgt(dev, sgt, size, dir); debug_dma_unmap_sg(dev, sgt->sgl, sgt->orig_nents, dir); if (use_dma_iommu(dev)) iommu_dma_free_noncontiguous(dev, size, sgt, dir); else free_single_sgt(dev, size, sgt, dir); } EXPORT_SYMBOL_GPL(dma_free_noncontiguous); void *dma_vmap_noncontiguous(struct device *dev, size_t size, struct sg_table *sgt) { if (use_dma_iommu(dev)) return iommu_dma_vmap_noncontiguous(dev, size, sgt); return page_address(sg_page(sgt->sgl)); } EXPORT_SYMBOL_GPL(dma_vmap_noncontiguous); void dma_vunmap_noncontiguous(struct device *dev, void *vaddr) { if (use_dma_iommu(dev)) iommu_dma_vunmap_noncontiguous(dev, vaddr); } EXPORT_SYMBOL_GPL(dma_vunmap_noncontiguous); int dma_mmap_noncontiguous(struct device *dev, struct vm_area_struct *vma, size_t size, struct sg_table *sgt) { if (use_dma_iommu(dev)) return iommu_dma_mmap_noncontiguous(dev, vma, size, sgt); return dma_mmap_pages(dev, vma, size, sg_page(sgt->sgl)); } EXPORT_SYMBOL_GPL(dma_mmap_noncontiguous); static int dma_supported(struct device *dev, u64 mask) { const struct dma_map_ops *ops = get_dma_ops(dev); if (use_dma_iommu(dev)) { if (WARN_ON(ops)) return false; return true; } /* * ->dma_supported sets and clears the bypass flag, so ignore it here * and always call into the method if there is one. */ if (ops) { if (!ops->dma_supported) return true; return ops->dma_supported(dev, mask); } return dma_direct_supported(dev, mask); } bool dma_pci_p2pdma_supported(struct device *dev) { const struct dma_map_ops *ops = get_dma_ops(dev); /* * Note: dma_ops_bypass is not checked here because P2PDMA should * not be used with dma mapping ops that do not have support even * if the specific device is bypassing them. */ /* if ops is not set, dma direct and default IOMMU support P2PDMA */ return !ops; } EXPORT_SYMBOL_GPL(dma_pci_p2pdma_supported); int dma_set_mask(struct device *dev, u64 mask) { /* * Truncate the mask to the actually supported dma_addr_t width to * avoid generating unsupportable addresses. */ mask = (dma_addr_t)mask; if (!dev->dma_mask || !dma_supported(dev, mask)) return -EIO; arch_dma_set_mask(dev, mask); *dev->dma_mask = mask; dma_setup_need_sync(dev); return 0; } EXPORT_SYMBOL(dma_set_mask); int dma_set_coherent_mask(struct device *dev, u64 mask) { /* * Truncate the mask to the actually supported dma_addr_t width to * avoid generating unsupportable addresses. */ mask = (dma_addr_t)mask; if (!dma_supported(dev, mask)) return -EIO; dev->coherent_dma_mask = mask; return 0; } EXPORT_SYMBOL(dma_set_coherent_mask); static bool __dma_addressing_limited(struct device *dev) { const struct dma_map_ops *ops = get_dma_ops(dev); if (min_not_zero(dma_get_mask(dev), dev->bus_dma_limit) < dma_get_required_mask(dev)) return true; if (unlikely(ops) || use_dma_iommu(dev)) return false; return !dma_direct_all_ram_mapped(dev); } /** * dma_addressing_limited - return if the device is addressing limited * @dev: device to check * * Return %true if the devices DMA mask is too small to address all memory in * the system, else %false. Lack of addressing bits is the prime reason for * bounce buffering, but might not be the only one. */ bool dma_addressing_limited(struct device *dev) { if (!__dma_addressing_limited(dev)) return false; dev_dbg(dev, "device is DMA addressing limited\n"); return true; } EXPORT_SYMBOL_GPL(dma_addressing_limited); size_t dma_max_mapping_size(struct device *dev) { const struct dma_map_ops *ops = get_dma_ops(dev); size_t size = SIZE_MAX; if (dma_map_direct(dev, ops)) size = dma_direct_max_mapping_size(dev); else if (use_dma_iommu(dev)) size = iommu_dma_max_mapping_size(dev); else if (ops && ops->max_mapping_size) size = ops->max_mapping_size(dev); return size; } EXPORT_SYMBOL_GPL(dma_max_mapping_size); size_t dma_opt_mapping_size(struct device *dev) { const struct dma_map_ops *ops = get_dma_ops(dev); size_t size = SIZE_MAX; if (use_dma_iommu(dev)) size = iommu_dma_opt_mapping_size(); else if (ops && ops->opt_mapping_size) size = ops->opt_mapping_size(); return min(dma_max_mapping_size(dev), size); } EXPORT_SYMBOL_GPL(dma_opt_mapping_size); unsigned long dma_get_merge_boundary(struct device *dev) { const struct dma_map_ops *ops = get_dma_ops(dev); if (use_dma_iommu(dev)) return iommu_dma_get_merge_boundary(dev); if (!ops || !ops->get_merge_boundary) return 0; /* can't merge */ return ops->get_merge_boundary(dev); } EXPORT_SYMBOL_GPL(dma_get_merge_boundary);
161 111 110 6 1 5 5 2 3 10 10 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 // SPDX-License-Identifier: GPL-2.0-or-later /* * V4L2 controls framework Request API implementation. * * Copyright (C) 2018-2021 Hans Verkuil <hverkuil@kernel.org> */ #define pr_fmt(fmt) "v4l2-ctrls: " fmt #include <linux/export.h> #include <linux/slab.h> #include <media/v4l2-ctrls.h> #include <media/v4l2-dev.h> #include <media/v4l2-ioctl.h> #include "v4l2-ctrls-priv.h" /* Initialize the request-related fields in a control handler */ void v4l2_ctrl_handler_init_request(struct v4l2_ctrl_handler *hdl) { INIT_LIST_HEAD(&hdl->requests); INIT_LIST_HEAD(&hdl->requests_queued); hdl->request_is_queued = false; media_request_object_init(&hdl->req_obj); } /* Free the request-related fields in a control handler */ void v4l2_ctrl_handler_free_request(struct v4l2_ctrl_handler *hdl) { struct v4l2_ctrl_handler *req, *next_req; /* * Do nothing if this isn't the main handler or the main * handler is not used in any request. * * The main handler can be identified by having a NULL ops pointer in * the request object. */ if (hdl->req_obj.ops || list_empty(&hdl->requests)) return; /* * If the main handler is freed and it is used by handler objects in * outstanding requests, then unbind and put those objects before * freeing the main handler. */ list_for_each_entry_safe(req, next_req, &hdl->requests, requests) { media_request_object_unbind(&req->req_obj); media_request_object_put(&req->req_obj); } } static int v4l2_ctrl_request_clone(struct v4l2_ctrl_handler *hdl, const struct v4l2_ctrl_handler *from) { struct v4l2_ctrl_ref *ref; int err = 0; if (WARN_ON(!hdl || hdl == from)) return -EINVAL; if (hdl->error) return hdl->error; WARN_ON(hdl->lock != &hdl->_lock); mutex_lock(from->lock); list_for_each_entry(ref, &from->ctrl_refs, node) { struct v4l2_ctrl *ctrl = ref->ctrl; struct v4l2_ctrl_ref *new_ref; /* Skip refs inherited from other devices */ if (ref->from_other_dev) continue; err = handler_new_ref(hdl, ctrl, &new_ref, false, true); if (err) break; } mutex_unlock(from->lock); return err; } static void v4l2_ctrl_request_queue(struct media_request_object *obj) { struct v4l2_ctrl_handler *hdl = container_of(obj, struct v4l2_ctrl_handler, req_obj); struct v4l2_ctrl_handler *main_hdl = obj->priv; mutex_lock(main_hdl->lock); list_add_tail(&hdl->requests_queued, &main_hdl->requests_queued); hdl->request_is_queued = true; mutex_unlock(main_hdl->lock); } static void v4l2_ctrl_request_unbind(struct media_request_object *obj) { struct v4l2_ctrl_handler *hdl = container_of(obj, struct v4l2_ctrl_handler, req_obj); struct v4l2_ctrl_handler *main_hdl = obj->priv; mutex_lock(main_hdl->lock); list_del_init(&hdl->requests); if (hdl->request_is_queued) { list_del_init(&hdl->requests_queued); hdl->request_is_queued = false; } mutex_unlock(main_hdl->lock); } static void v4l2_ctrl_request_release(struct media_request_object *obj) { struct v4l2_ctrl_handler *hdl = container_of(obj, struct v4l2_ctrl_handler, req_obj); v4l2_ctrl_handler_free(hdl); kfree(hdl); } static const struct media_request_object_ops req_ops = { .queue = v4l2_ctrl_request_queue, .unbind = v4l2_ctrl_request_unbind, .release = v4l2_ctrl_request_release, }; struct v4l2_ctrl_handler *v4l2_ctrl_request_hdl_find(struct media_request *req, struct v4l2_ctrl_handler *parent) { struct media_request_object *obj; if (WARN_ON(req->state != MEDIA_REQUEST_STATE_VALIDATING && req->state != MEDIA_REQUEST_STATE_QUEUED)) return NULL; obj = media_request_object_find(req, &req_ops, parent); if (obj) return container_of(obj, struct v4l2_ctrl_handler, req_obj); return NULL; } EXPORT_SYMBOL_GPL(v4l2_ctrl_request_hdl_find); struct v4l2_ctrl * v4l2_ctrl_request_hdl_ctrl_find(struct v4l2_ctrl_handler *hdl, u32 id) { struct v4l2_ctrl_ref *ref = find_ref_lock(hdl, id); return (ref && ref->p_req_valid) ? ref->ctrl : NULL; } EXPORT_SYMBOL_GPL(v4l2_ctrl_request_hdl_ctrl_find); static int v4l2_ctrl_request_bind(struct media_request *req, struct v4l2_ctrl_handler *hdl, struct v4l2_ctrl_handler *from) { int ret; ret = v4l2_ctrl_request_clone(hdl, from); if (!ret) { ret = media_request_object_bind(req, &req_ops, from, false, &hdl->req_obj); if (!ret) { mutex_lock(from->lock); list_add_tail(&hdl->requests, &from->requests); mutex_unlock(from->lock); } } return ret; } static struct media_request_object * v4l2_ctrls_find_req_obj(struct v4l2_ctrl_handler *hdl, struct media_request *req, bool set) { struct media_request_object *obj; struct v4l2_ctrl_handler *new_hdl; int ret; if (IS_ERR(req)) return ERR_CAST(req); if (set && WARN_ON(req->state != MEDIA_REQUEST_STATE_UPDATING)) return ERR_PTR(-EBUSY); obj = media_request_object_find(req, &req_ops, hdl); if (obj) return obj; /* * If there are no controls in this completed request, * then that can only happen if: * * 1) no controls were present in the queued request, and * 2) v4l2_ctrl_request_complete() could not allocate a * control handler object to store the completed state in. * * So return ENOMEM to indicate that there was an out-of-memory * error. */ if (!set) return ERR_PTR(-ENOMEM); new_hdl = kzalloc(sizeof(*new_hdl), GFP_KERNEL); if (!new_hdl) return ERR_PTR(-ENOMEM); obj = &new_hdl->req_obj; ret = v4l2_ctrl_handler_init(new_hdl, (hdl->nr_of_buckets - 1) * 8); if (!ret) ret = v4l2_ctrl_request_bind(req, new_hdl, hdl); if (ret) { v4l2_ctrl_handler_free(new_hdl); kfree(new_hdl); return ERR_PTR(ret); } media_request_object_get(obj); return obj; } int v4l2_g_ext_ctrls_request(struct v4l2_ctrl_handler *hdl, struct video_device *vdev, struct media_device *mdev, struct v4l2_ext_controls *cs) { struct media_request_object *obj = NULL; struct media_request *req = NULL; int ret; if (!mdev || cs->request_fd < 0) return -EINVAL; req = media_request_get_by_fd(mdev, cs->request_fd); if (IS_ERR(req)) return PTR_ERR(req); if (req->state != MEDIA_REQUEST_STATE_COMPLETE) { media_request_put(req); return -EACCES; } ret = media_request_lock_for_access(req); if (ret) { media_request_put(req); return ret; } obj = v4l2_ctrls_find_req_obj(hdl, req, false); if (IS_ERR(obj)) { media_request_unlock_for_access(req); media_request_put(req); return PTR_ERR(obj); } hdl = container_of(obj, struct v4l2_ctrl_handler, req_obj); ret = v4l2_g_ext_ctrls_common(hdl, cs, vdev); media_request_unlock_for_access(req); media_request_object_put(obj); media_request_put(req); return ret; } int try_set_ext_ctrls_request(struct v4l2_fh *fh, struct v4l2_ctrl_handler *hdl, struct video_device *vdev, struct media_device *mdev, struct v4l2_ext_controls *cs, bool set) { struct media_request_object *obj = NULL; struct media_request *req = NULL; int ret; if (!mdev) { dprintk(vdev, "%s: missing media device\n", video_device_node_name(vdev)); return -EINVAL; } if (cs->request_fd < 0) { dprintk(vdev, "%s: invalid request fd %d\n", video_device_node_name(vdev), cs->request_fd); return -EINVAL; } req = media_request_get_by_fd(mdev, cs->request_fd); if (IS_ERR(req)) { dprintk(vdev, "%s: cannot find request fd %d\n", video_device_node_name(vdev), cs->request_fd); return PTR_ERR(req); } ret = media_request_lock_for_update(req); if (ret) { dprintk(vdev, "%s: cannot lock request fd %d\n", video_device_node_name(vdev), cs->request_fd); media_request_put(req); return ret; } obj = v4l2_ctrls_find_req_obj(hdl, req, set); if (IS_ERR(obj)) { dprintk(vdev, "%s: cannot find request object for request fd %d\n", video_device_node_name(vdev), cs->request_fd); media_request_unlock_for_update(req); media_request_put(req); return PTR_ERR(obj); } hdl = container_of(obj, struct v4l2_ctrl_handler, req_obj); ret = try_set_ext_ctrls_common(fh, hdl, cs, vdev, set); if (ret) dprintk(vdev, "%s: try_set_ext_ctrls_common failed (%d)\n", video_device_node_name(vdev), ret); media_request_unlock_for_update(req); media_request_object_put(obj); media_request_put(req); return ret; } void v4l2_ctrl_request_complete(struct media_request *req, struct v4l2_ctrl_handler *main_hdl) { struct media_request_object *obj; struct v4l2_ctrl_handler *hdl; struct v4l2_ctrl_ref *ref; if (!req || !main_hdl) return; /* * Note that it is valid if nothing was found. It means * that this request doesn't have any controls and so just * wants to leave the controls unchanged. */ obj = media_request_object_find(req, &req_ops, main_hdl); if (!obj) { int ret; /* Create a new request so the driver can return controls */ hdl = kzalloc(sizeof(*hdl), GFP_KERNEL); if (!hdl) return; ret = v4l2_ctrl_handler_init(hdl, (main_hdl->nr_of_buckets - 1) * 8); if (!ret) ret = v4l2_ctrl_request_bind(req, hdl, main_hdl); if (ret) { v4l2_ctrl_handler_free(hdl); kfree(hdl); return; } hdl->request_is_queued = true; obj = media_request_object_find(req, &req_ops, main_hdl); } hdl = container_of(obj, struct v4l2_ctrl_handler, req_obj); list_for_each_entry(ref, &hdl->ctrl_refs, node) { struct v4l2_ctrl *ctrl = ref->ctrl; struct v4l2_ctrl *master = ctrl->cluster[0]; unsigned int i; if (ctrl->flags & V4L2_CTRL_FLAG_VOLATILE) { v4l2_ctrl_lock(master); /* g_volatile_ctrl will update the current control values */ for (i = 0; i < master->ncontrols; i++) cur_to_new(master->cluster[i]); call_op(master, g_volatile_ctrl); new_to_req(ref); v4l2_ctrl_unlock(master); continue; } if (ref->p_req_valid) continue; /* Copy the current control value into the request */ v4l2_ctrl_lock(ctrl); cur_to_req(ref); v4l2_ctrl_unlock(ctrl); } mutex_lock(main_hdl->lock); WARN_ON(!hdl->request_is_queued); list_del_init(&hdl->requests_queued); hdl->request_is_queued = false; mutex_unlock(main_hdl->lock); media_request_object_complete(obj); media_request_object_put(obj); } EXPORT_SYMBOL(v4l2_ctrl_request_complete); int v4l2_ctrl_request_setup(struct media_request *req, struct v4l2_ctrl_handler *main_hdl) { struct media_request_object *obj; struct v4l2_ctrl_handler *hdl; struct v4l2_ctrl_ref *ref; int ret = 0; if (!req || !main_hdl) return 0; if (WARN_ON(req->state != MEDIA_REQUEST_STATE_QUEUED)) return -EBUSY; /* * Note that it is valid if nothing was found. It means * that this request doesn't have any controls and so just * wants to leave the controls unchanged. */ obj = media_request_object_find(req, &req_ops, main_hdl); if (!obj) return 0; if (obj->completed) { media_request_object_put(obj); return -EBUSY; } hdl = container_of(obj, struct v4l2_ctrl_handler, req_obj); list_for_each_entry(ref, &hdl->ctrl_refs, node) ref->req_done = false; list_for_each_entry(ref, &hdl->ctrl_refs, node) { struct v4l2_ctrl *ctrl = ref->ctrl; struct v4l2_ctrl *master = ctrl->cluster[0]; bool have_new_data = false; int i; /* * Skip if this control was already handled by a cluster. * Skip button controls and read-only controls. */ if (ref->req_done || (ctrl->flags & V4L2_CTRL_FLAG_READ_ONLY)) continue; v4l2_ctrl_lock(master); for (i = 0; i < master->ncontrols; i++) { if (master->cluster[i]) { struct v4l2_ctrl_ref *r = find_ref(hdl, master->cluster[i]->id); if (r->p_req_valid) { have_new_data = true; break; } } } if (!have_new_data) { v4l2_ctrl_unlock(master); continue; } for (i = 0; i < master->ncontrols; i++) { if (master->cluster[i]) { struct v4l2_ctrl_ref *r = find_ref(hdl, master->cluster[i]->id); ret = req_to_new(r); if (ret) { v4l2_ctrl_unlock(master); goto error; } master->cluster[i]->is_new = 1; r->req_done = true; } } /* * For volatile autoclusters that are currently in auto mode * we need to discover if it will be set to manual mode. * If so, then we have to copy the current volatile values * first since those will become the new manual values (which * may be overwritten by explicit new values from this set * of controls). */ if (master->is_auto && master->has_volatiles && !is_cur_manual(master)) { s32 new_auto_val = *master->p_new.p_s32; /* * If the new value == the manual value, then copy * the current volatile values. */ if (new_auto_val == master->manual_mode_value) update_from_auto_cluster(master); } ret = try_or_set_cluster(NULL, master, true, 0); v4l2_ctrl_unlock(master); if (ret) break; } error: media_request_object_put(obj); return ret; } EXPORT_SYMBOL(v4l2_ctrl_request_setup);
17 8 4 11 10 8 62 4 57 38 38 38 65 65 65 65 65 64 64 41 27 64 10 5 10 43 43 42 4 43 23 20 19 43 48 48 44 17 17 17 14 3 28 31 31 31 31 31 4 4 3 1 2 1 1 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 // SPDX-License-Identifier: GPL-2.0-or-later /* * IPV6 GSO/GRO offload support * Linux INET6 implementation */ #include <linux/kernel.h> #include <linux/socket.h> #include <linux/netdevice.h> #include <linux/skbuff.h> #include <linux/printk.h> #include <net/protocol.h> #include <net/ipv6.h> #include <net/inet_common.h> #include <net/tcp.h> #include <net/udp.h> #include <net/gro.h> #include <net/gso.h> #include "ip6_offload.h" /* All GRO functions are always builtin, except UDP over ipv6, which lays in * ipv6 module, as it depends on UDPv6 lookup function, so we need special care * when ipv6 is built as a module */ #if IS_BUILTIN(CONFIG_IPV6) #define INDIRECT_CALL_L4(f, f2, f1, ...) INDIRECT_CALL_2(f, f2, f1, __VA_ARGS__) #else #define INDIRECT_CALL_L4(f, f2, f1, ...) INDIRECT_CALL_1(f, f2, __VA_ARGS__) #endif #define indirect_call_gro_receive_l4(f2, f1, cb, head, skb) \ ({ \ unlikely(gro_recursion_inc_test(skb)) ? \ NAPI_GRO_CB(skb)->flush |= 1, NULL : \ INDIRECT_CALL_L4(cb, f2, f1, head, skb); \ }) static int ipv6_gro_pull_exthdrs(struct sk_buff *skb, int off, int proto) { const struct net_offload *ops = NULL; struct ipv6_opt_hdr *opth; for (;;) { int len; ops = rcu_dereference(inet6_offloads[proto]); if (unlikely(!ops)) break; if (!(ops->flags & INET6_PROTO_GSO_EXTHDR)) break; opth = skb_gro_header(skb, off + sizeof(*opth), off); if (unlikely(!opth)) break; len = ipv6_optlen(opth); opth = skb_gro_header(skb, off + len, off); if (unlikely(!opth)) break; proto = opth->nexthdr; off += len; } skb_gro_pull(skb, off - skb_gro_receive_network_offset(skb)); return proto; } static int ipv6_gso_pull_exthdrs(struct sk_buff *skb, int proto) { const struct net_offload *ops = NULL; for (;;) { struct ipv6_opt_hdr *opth; int len; ops = rcu_dereference(inet6_offloads[proto]); if (unlikely(!ops)) break; if (!(ops->flags & INET6_PROTO_GSO_EXTHDR)) break; if (unlikely(!pskb_may_pull(skb, 8))) break; opth = (void *)skb->data; len = ipv6_optlen(opth); if (unlikely(!pskb_may_pull(skb, len))) break; opth = (void *)skb->data; proto = opth->nexthdr; __skb_pull(skb, len); } return proto; } static struct sk_buff *ipv6_gso_segment(struct sk_buff *skb, netdev_features_t features) { struct sk_buff *segs = ERR_PTR(-EINVAL); struct ipv6hdr *ipv6h; const struct net_offload *ops; int proto, err; struct frag_hdr *fptr; unsigned int payload_len; u8 *prevhdr; int offset = 0; bool encap, udpfrag; int nhoff; bool gso_partial; skb_reset_network_header(skb); err = ipv6_hopopt_jumbo_remove(skb); if (err) return ERR_PTR(err); nhoff = skb_network_header(skb) - skb_mac_header(skb); if (unlikely(!pskb_may_pull(skb, sizeof(*ipv6h)))) goto out; encap = SKB_GSO_CB(skb)->encap_level > 0; if (encap) features &= skb->dev->hw_enc_features; SKB_GSO_CB(skb)->encap_level += sizeof(*ipv6h); ipv6h = ipv6_hdr(skb); __skb_pull(skb, sizeof(*ipv6h)); segs = ERR_PTR(-EPROTONOSUPPORT); proto = ipv6_gso_pull_exthdrs(skb, ipv6h->nexthdr); if (skb->encapsulation && skb_shinfo(skb)->gso_type & (SKB_GSO_IPXIP4 | SKB_GSO_IPXIP6)) udpfrag = proto == IPPROTO_UDP && encap && (skb_shinfo(skb)->gso_type & SKB_GSO_UDP); else udpfrag = proto == IPPROTO_UDP && !skb->encapsulation && (skb_shinfo(skb)->gso_type & SKB_GSO_UDP); ops = rcu_dereference(inet6_offloads[proto]); if (likely(ops && ops->callbacks.gso_segment)) { if (!skb_reset_transport_header_careful(skb)) goto out; segs = ops->callbacks.gso_segment(skb, features); if (!segs) skb->network_header = skb_mac_header(skb) + nhoff - skb->head; } if (IS_ERR_OR_NULL(segs)) goto out; gso_partial = !!(skb_shinfo(segs)->gso_type & SKB_GSO_PARTIAL); for (skb = segs; skb; skb = skb->next) { ipv6h = (struct ipv6hdr *)(skb_mac_header(skb) + nhoff); if (gso_partial && skb_is_gso(skb)) payload_len = skb_shinfo(skb)->gso_size + SKB_GSO_CB(skb)->data_offset + skb->head - (unsigned char *)(ipv6h + 1); else payload_len = skb->len - nhoff - sizeof(*ipv6h); ipv6h->payload_len = htons(payload_len); skb->network_header = (u8 *)ipv6h - skb->head; skb_reset_mac_len(skb); if (udpfrag) { int err = ip6_find_1stfragopt(skb, &prevhdr); if (err < 0) { kfree_skb_list(segs); return ERR_PTR(err); } fptr = (struct frag_hdr *)((u8 *)ipv6h + err); fptr->frag_off = htons(offset); if (skb->next) fptr->frag_off |= htons(IP6_MF); offset += (ntohs(ipv6h->payload_len) - sizeof(struct frag_hdr)); } if (encap) skb_reset_inner_headers(skb); } out: return segs; } /* Return the total length of all the extension hdrs, following the same * logic in ipv6_gso_pull_exthdrs() when parsing ext-hdrs. */ static int ipv6_exthdrs_len(struct ipv6hdr *iph, const struct net_offload **opps) { struct ipv6_opt_hdr *opth = (void *)iph; int len = 0, proto, optlen = sizeof(*iph); proto = iph->nexthdr; for (;;) { *opps = rcu_dereference(inet6_offloads[proto]); if (unlikely(!(*opps))) break; if (!((*opps)->flags & INET6_PROTO_GSO_EXTHDR)) break; opth = (void *)opth + optlen; optlen = ipv6_optlen(opth); len += optlen; proto = opth->nexthdr; } return len; } INDIRECT_CALLABLE_SCOPE struct sk_buff *ipv6_gro_receive(struct list_head *head, struct sk_buff *skb) { const struct net_offload *ops; struct sk_buff *pp = NULL; struct sk_buff *p; struct ipv6hdr *iph; unsigned int nlen; unsigned int hlen; unsigned int off; u16 flush = 1; int proto; off = skb_gro_offset(skb); hlen = off + sizeof(*iph); iph = skb_gro_header(skb, hlen, off); if (unlikely(!iph)) goto out; NAPI_GRO_CB(skb)->network_offsets[NAPI_GRO_CB(skb)->encap_mark] = off; flush += ntohs(iph->payload_len) != skb->len - hlen; proto = iph->nexthdr; ops = rcu_dereference(inet6_offloads[proto]); if (!ops || !ops->callbacks.gro_receive) { proto = ipv6_gro_pull_exthdrs(skb, hlen, proto); ops = rcu_dereference(inet6_offloads[proto]); if (!ops || !ops->callbacks.gro_receive) goto out; iph = skb_gro_network_header(skb); } else { skb_gro_pull(skb, sizeof(*iph)); } skb_set_transport_header(skb, skb_gro_offset(skb)); NAPI_GRO_CB(skb)->proto = proto; flush--; nlen = skb_gro_offset(skb) - off; list_for_each_entry(p, head, list) { const struct ipv6hdr *iph2; __be32 first_word; /* <Version:4><Traffic_Class:8><Flow_Label:20> */ if (!NAPI_GRO_CB(p)->same_flow) continue; iph2 = (struct ipv6hdr *)(p->data + off); first_word = *(__be32 *)iph ^ *(__be32 *)iph2; /* All fields must match except length and Traffic Class. * XXX skbs on the gro_list have all been parsed and pulled * already so we don't need to compare nlen * (nlen != (sizeof(*iph2) + ipv6_exthdrs_len(iph2, &ops))) * memcmp() alone below is sufficient, right? */ if ((first_word & htonl(0xF00FFFFF)) || !ipv6_addr_equal(&iph->saddr, &iph2->saddr) || !ipv6_addr_equal(&iph->daddr, &iph2->daddr) || iph->nexthdr != iph2->nexthdr) { not_same_flow: NAPI_GRO_CB(p)->same_flow = 0; continue; } if (unlikely(nlen > sizeof(struct ipv6hdr))) { if (memcmp(iph + 1, iph2 + 1, nlen - sizeof(struct ipv6hdr))) goto not_same_flow; } } NAPI_GRO_CB(skb)->flush |= flush; skb_gro_postpull_rcsum(skb, iph, nlen); pp = indirect_call_gro_receive_l4(tcp6_gro_receive, udp6_gro_receive, ops->callbacks.gro_receive, head, skb); out: skb_gro_flush_final(skb, pp, flush); return pp; } static struct sk_buff *sit_ip6ip6_gro_receive(struct list_head *head, struct sk_buff *skb) { /* Common GRO receive for SIT and IP6IP6 */ if (NAPI_GRO_CB(skb)->encap_mark) { NAPI_GRO_CB(skb)->flush = 1; return NULL; } NAPI_GRO_CB(skb)->encap_mark = 1; return ipv6_gro_receive(head, skb); } static struct sk_buff *ip4ip6_gro_receive(struct list_head *head, struct sk_buff *skb) { /* Common GRO receive for SIT and IP6IP6 */ if (NAPI_GRO_CB(skb)->encap_mark) { NAPI_GRO_CB(skb)->flush = 1; return NULL; } NAPI_GRO_CB(skb)->encap_mark = 1; return inet_gro_receive(head, skb); } INDIRECT_CALLABLE_SCOPE int ipv6_gro_complete(struct sk_buff *skb, int nhoff) { const struct net_offload *ops; struct ipv6hdr *iph; int err = -ENOSYS; u32 payload_len; if (skb->encapsulation) { skb_set_inner_protocol(skb, cpu_to_be16(ETH_P_IPV6)); skb_set_inner_network_header(skb, nhoff); } payload_len = skb->len - nhoff - sizeof(*iph); if (unlikely(payload_len > IPV6_MAXPLEN)) { struct hop_jumbo_hdr *hop_jumbo; int hoplen = sizeof(*hop_jumbo); /* Move network header left */ memmove(skb_mac_header(skb) - hoplen, skb_mac_header(skb), skb->transport_header - skb->mac_header); skb->data -= hoplen; skb->len += hoplen; skb->mac_header -= hoplen; skb->network_header -= hoplen; iph = (struct ipv6hdr *)(skb->data + nhoff); hop_jumbo = (struct hop_jumbo_hdr *)(iph + 1); /* Build hop-by-hop options */ hop_jumbo->nexthdr = iph->nexthdr; hop_jumbo->hdrlen = 0; hop_jumbo->tlv_type = IPV6_TLV_JUMBO; hop_jumbo->tlv_len = 4; hop_jumbo->jumbo_payload_len = htonl(payload_len + hoplen); iph->nexthdr = NEXTHDR_HOP; iph->payload_len = 0; } else { iph = (struct ipv6hdr *)(skb->data + nhoff); iph->payload_len = htons(payload_len); } nhoff += sizeof(*iph) + ipv6_exthdrs_len(iph, &ops); if (WARN_ON(!ops || !ops->callbacks.gro_complete)) goto out; err = INDIRECT_CALL_L4(ops->callbacks.gro_complete, tcp6_gro_complete, udp6_gro_complete, skb, nhoff); out: return err; } static int sit_gro_complete(struct sk_buff *skb, int nhoff) { skb->encapsulation = 1; skb_shinfo(skb)->gso_type |= SKB_GSO_IPXIP4; return ipv6_gro_complete(skb, nhoff); } static int ip6ip6_gro_complete(struct sk_buff *skb, int nhoff) { skb->encapsulation = 1; skb_shinfo(skb)->gso_type |= SKB_GSO_IPXIP6; return ipv6_gro_complete(skb, nhoff); } static int ip4ip6_gro_complete(struct sk_buff *skb, int nhoff) { skb->encapsulation = 1; skb_shinfo(skb)->gso_type |= SKB_GSO_IPXIP6; return inet_gro_complete(skb, nhoff); } static struct sk_buff *sit_gso_segment(struct sk_buff *skb, netdev_features_t features) { if (!(skb_shinfo(skb)->gso_type & SKB_GSO_IPXIP4)) return ERR_PTR(-EINVAL); return ipv6_gso_segment(skb, features); } static struct sk_buff *ip4ip6_gso_segment(struct sk_buff *skb, netdev_features_t features) { if (!(skb_shinfo(skb)->gso_type & SKB_GSO_IPXIP6)) return ERR_PTR(-EINVAL); return inet_gso_segment(skb, features); } static struct sk_buff *ip6ip6_gso_segment(struct sk_buff *skb, netdev_features_t features) { if (!(skb_shinfo(skb)->gso_type & SKB_GSO_IPXIP6)) return ERR_PTR(-EINVAL); return ipv6_gso_segment(skb, features); } static const struct net_offload sit_offload = { .callbacks = { .gso_segment = sit_gso_segment, .gro_receive = sit_ip6ip6_gro_receive, .gro_complete = sit_gro_complete, }, }; static const struct net_offload ip4ip6_offload = { .callbacks = { .gso_segment = ip4ip6_gso_segment, .gro_receive = ip4ip6_gro_receive, .gro_complete = ip4ip6_gro_complete, }, }; static const struct net_offload ip6ip6_offload = { .callbacks = { .gso_segment = ip6ip6_gso_segment, .gro_receive = sit_ip6ip6_gro_receive, .gro_complete = ip6ip6_gro_complete, }, }; static int __init ipv6_offload_init(void) { if (tcpv6_offload_init() < 0) pr_crit("%s: Cannot add TCP protocol offload\n", __func__); if (ipv6_exthdrs_offload_init() < 0) pr_crit("%s: Cannot add EXTHDRS protocol offload\n", __func__); net_hotdata.ipv6_packet_offload = (struct packet_offload) { .type = cpu_to_be16(ETH_P_IPV6), .callbacks = { .gso_segment = ipv6_gso_segment, .gro_receive = ipv6_gro_receive, .gro_complete = ipv6_gro_complete, }, }; dev_add_offload(&net_hotdata.ipv6_packet_offload); inet_add_offload(&sit_offload, IPPROTO_IPV6); inet6_add_offload(&ip6ip6_offload, IPPROTO_IPV6); inet6_add_offload(&ip4ip6_offload, IPPROTO_IPIP); return 0; } fs_initcall(ipv6_offload_init);
31 31 31 31 31 31 27 4 31 29 18 11 3 26 29 6 5 5 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 // SPDX-License-Identifier: GPL-2.0-only /* * LED Class Core * * Copyright (C) 2005 John Lenz <lenz@cs.wisc.edu> * Copyright (C) 2005-2007 Richard Purdie <rpurdie@openedhand.com> */ #include <linux/ctype.h> #include <linux/device.h> #include <linux/err.h> #include <linux/init.h> #include <linux/kernel.h> #include <linux/leds.h> #include <linux/list.h> #include <linux/module.h> #include <linux/property.h> #include <linux/slab.h> #include <linux/spinlock.h> #include <linux/timer.h> #include <uapi/linux/uleds.h> #include <linux/of.h> #include "leds.h" static DEFINE_MUTEX(leds_lookup_lock); static LIST_HEAD(leds_lookup_list); static struct workqueue_struct *leds_wq; static ssize_t brightness_show(struct device *dev, struct device_attribute *attr, char *buf) { struct led_classdev *led_cdev = dev_get_drvdata(dev); unsigned int brightness; mutex_lock(&led_cdev->led_access); led_update_brightness(led_cdev); brightness = led_cdev->brightness; mutex_unlock(&led_cdev->led_access); return sprintf(buf, "%u\n", brightness); } static ssize_t brightness_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t size) { struct led_classdev *led_cdev = dev_get_drvdata(dev); unsigned long state; ssize_t ret; mutex_lock(&led_cdev->led_access); if (led_sysfs_is_disabled(led_cdev)) { ret = -EBUSY; goto unlock; } ret = kstrtoul(buf, 10, &state); if (ret) goto unlock; if (state == LED_OFF) led_trigger_remove(led_cdev); led_set_brightness(led_cdev, state); ret = size; unlock: mutex_unlock(&led_cdev->led_access); return ret; } static DEVICE_ATTR_RW(brightness); static ssize_t max_brightness_show(struct device *dev, struct device_attribute *attr, char *buf) { struct led_classdev *led_cdev = dev_get_drvdata(dev); unsigned int max_brightness; mutex_lock(&led_cdev->led_access); max_brightness = led_cdev->max_brightness; mutex_unlock(&led_cdev->led_access); return sprintf(buf, "%u\n", max_brightness); } static DEVICE_ATTR_RO(max_brightness); #ifdef CONFIG_LEDS_TRIGGERS static const BIN_ATTR(trigger, 0644, led_trigger_read, led_trigger_write, 0); static const struct bin_attribute *const led_trigger_bin_attrs[] = { &bin_attr_trigger, NULL, }; static const struct attribute_group led_trigger_group = { .bin_attrs = led_trigger_bin_attrs, }; #endif static struct attribute *led_class_attrs[] = { &dev_attr_brightness.attr, &dev_attr_max_brightness.attr, NULL, }; static const struct attribute_group led_group = { .attrs = led_class_attrs, }; static const struct attribute_group *led_groups[] = { &led_group, #ifdef CONFIG_LEDS_TRIGGERS &led_trigger_group, #endif NULL, }; #ifdef CONFIG_LEDS_BRIGHTNESS_HW_CHANGED static ssize_t brightness_hw_changed_show(struct device *dev, struct device_attribute *attr, char *buf) { struct led_classdev *led_cdev = dev_get_drvdata(dev); if (led_cdev->brightness_hw_changed == -1) return -ENODATA; return sprintf(buf, "%u\n", led_cdev->brightness_hw_changed); } static DEVICE_ATTR_RO(brightness_hw_changed); static int led_add_brightness_hw_changed(struct led_classdev *led_cdev) { struct device *dev = led_cdev->dev; int ret; ret = device_create_file(dev, &dev_attr_brightness_hw_changed); if (ret) { dev_err(dev, "Error creating brightness_hw_changed\n"); return ret; } led_cdev->brightness_hw_changed_kn = sysfs_get_dirent(dev->kobj.sd, "brightness_hw_changed"); if (!led_cdev->brightness_hw_changed_kn) { dev_err(dev, "Error getting brightness_hw_changed kn\n"); device_remove_file(dev, &dev_attr_brightness_hw_changed); return -ENXIO; } return 0; } static void led_remove_brightness_hw_changed(struct led_classdev *led_cdev) { sysfs_put(led_cdev->brightness_hw_changed_kn); device_remove_file(led_cdev->dev, &dev_attr_brightness_hw_changed); } void led_classdev_notify_brightness_hw_changed(struct led_classdev *led_cdev, unsigned int brightness) { if (WARN_ON(!led_cdev->brightness_hw_changed_kn)) return; led_cdev->brightness_hw_changed = brightness; sysfs_notify_dirent(led_cdev->brightness_hw_changed_kn); } EXPORT_SYMBOL_GPL(led_classdev_notify_brightness_hw_changed); #else static int led_add_brightness_hw_changed(struct led_classdev *led_cdev) { return 0; } static void led_remove_brightness_hw_changed(struct led_classdev *led_cdev) { } #endif /** * led_classdev_suspend - suspend an led_classdev. * @led_cdev: the led_classdev to suspend. */ void led_classdev_suspend(struct led_classdev *led_cdev) { led_cdev->flags |= LED_SUSPENDED; led_set_brightness_nopm(led_cdev, 0); flush_work(&led_cdev->set_brightness_work); } EXPORT_SYMBOL_GPL(led_classdev_suspend); /** * led_classdev_resume - resume an led_classdev. * @led_cdev: the led_classdev to resume. */ void led_classdev_resume(struct led_classdev *led_cdev) { led_set_brightness_nopm(led_cdev, led_cdev->brightness); if (led_cdev->flash_resume) led_cdev->flash_resume(led_cdev); led_cdev->flags &= ~LED_SUSPENDED; } EXPORT_SYMBOL_GPL(led_classdev_resume); #ifdef CONFIG_PM_SLEEP static int led_suspend(struct device *dev) { struct led_classdev *led_cdev = dev_get_drvdata(dev); if (led_cdev->flags & LED_CORE_SUSPENDRESUME) led_classdev_suspend(led_cdev); return 0; } static int led_resume(struct device *dev) { struct led_classdev *led_cdev = dev_get_drvdata(dev); if (led_cdev->flags & LED_CORE_SUSPENDRESUME) led_classdev_resume(led_cdev); return 0; } #endif static SIMPLE_DEV_PM_OPS(leds_class_dev_pm_ops, led_suspend, led_resume); static struct led_classdev *led_module_get(struct device *led_dev) { struct led_classdev *led_cdev; if (!led_dev) return ERR_PTR(-EPROBE_DEFER); led_cdev = dev_get_drvdata(led_dev); if (!try_module_get(led_cdev->dev->parent->driver->owner)) { put_device(led_cdev->dev); return ERR_PTR(-ENODEV); } return led_cdev; } static const struct class leds_class = { .name = "leds", .dev_groups = led_groups, .pm = &leds_class_dev_pm_ops, }; /** * of_led_get() - request a LED device via the LED framework * @np: device node to get the LED device from * @index: the index of the LED * @name: the name of the LED used to map it to its function, if present * * Returns the LED device parsed from the phandle specified in the "leds" * property of a device tree node or a negative error-code on failure. */ static struct led_classdev *of_led_get(struct device_node *np, int index, const char *name) { struct device *led_dev; struct device_node *led_node; /* * For named LEDs, first look up the name in the "led-names" property. * If it cannot be found, then of_parse_phandle() will propagate the error. */ if (name) index = of_property_match_string(np, "led-names", name); led_node = of_parse_phandle(np, "leds", index); if (!led_node) return ERR_PTR(-ENOENT); led_dev = class_find_device_by_of_node(&leds_class, led_node); of_node_put(led_node); return led_module_get(led_dev); } /** * led_put() - release a LED device * @led_cdev: LED device */ void led_put(struct led_classdev *led_cdev) { module_put(led_cdev->dev->parent->driver->owner); put_device(led_cdev->dev); } EXPORT_SYMBOL_GPL(led_put); static void devm_led_release(struct device *dev, void *res) { struct led_classdev **p = res; led_put(*p); } static struct led_classdev *__devm_led_get(struct device *dev, struct led_classdev *led) { struct led_classdev **dr; dr = devres_alloc(devm_led_release, sizeof(struct led_classdev *), GFP_KERNEL); if (!dr) { led_put(led); return ERR_PTR(-ENOMEM); } *dr = led; devres_add(dev, dr); return led; } /** * devm_of_led_get - Resource-managed request of a LED device * @dev: LED consumer * @index: index of the LED to obtain in the consumer * * The device node of the device is parse to find the request LED device. * The LED device returned from this function is automatically released * on driver detach. * * @return a pointer to a LED device or ERR_PTR(errno) on failure. */ struct led_classdev *__must_check devm_of_led_get(struct device *dev, int index) { struct led_classdev *led; if (!dev) return ERR_PTR(-EINVAL); led = of_led_get(dev->of_node, index, NULL); if (IS_ERR(led)) return led; return __devm_led_get(dev, led); } EXPORT_SYMBOL_GPL(devm_of_led_get); /** * led_get() - request a LED device via the LED framework * @dev: device for which to get the LED device * @con_id: name of the LED from the device's point of view * * @return a pointer to a LED device or ERR_PTR(errno) on failure. */ struct led_classdev *led_get(struct device *dev, char *con_id) { struct led_lookup_data *lookup; struct led_classdev *led_cdev; const char *provider = NULL; struct device *led_dev; led_cdev = of_led_get(dev->of_node, -1, con_id); if (!IS_ERR(led_cdev) || PTR_ERR(led_cdev) != -ENOENT) return led_cdev; mutex_lock(&leds_lookup_lock); list_for_each_entry(lookup, &leds_lookup_list, list) { if (!strcmp(lookup->dev_id, dev_name(dev)) && !strcmp(lookup->con_id, con_id)) { provider = kstrdup_const(lookup->provider, GFP_KERNEL); break; } } mutex_unlock(&leds_lookup_lock); if (!provider) return ERR_PTR(-ENOENT); led_dev = class_find_device_by_name(&leds_class, provider); kfree_const(provider); return led_module_get(led_dev); } EXPORT_SYMBOL_GPL(led_get); /** * devm_led_get() - request a LED device via the LED framework * @dev: device for which to get the LED device * @con_id: name of the LED from the device's point of view * * The LED device returned from this function is automatically released * on driver detach. * * @return a pointer to a LED device or ERR_PTR(errno) on failure. */ struct led_classdev *devm_led_get(struct device *dev, char *con_id) { struct led_classdev *led; led = led_get(dev, con_id); if (IS_ERR(led)) return led; return __devm_led_get(dev, led); } EXPORT_SYMBOL_GPL(devm_led_get); /** * led_add_lookup() - Add a LED lookup table entry * @led_lookup: the lookup table entry to add * * Add a LED lookup table entry. On systems without devicetree the lookup table * is used by led_get() to find LEDs. */ void led_add_lookup(struct led_lookup_data *led_lookup) { mutex_lock(&leds_lookup_lock); list_add_tail(&led_lookup->list, &leds_lookup_list); mutex_unlock(&leds_lookup_lock); } EXPORT_SYMBOL_GPL(led_add_lookup); /** * led_remove_lookup() - Remove a LED lookup table entry * @led_lookup: the lookup table entry to remove */ void led_remove_lookup(struct led_lookup_data *led_lookup) { mutex_lock(&leds_lookup_lock); list_del(&led_lookup->list); mutex_unlock(&leds_lookup_lock); } EXPORT_SYMBOL_GPL(led_remove_lookup); /** * devm_of_led_get_optional - Resource-managed request of an optional LED device * @dev: LED consumer * @index: index of the LED to obtain in the consumer * * The device node of the device is parsed to find the requested LED device. * The LED device returned from this function is automatically released * on driver detach. * * @return a pointer to a LED device, ERR_PTR(errno) on failure and NULL if the * led was not found. */ struct led_classdev *__must_check devm_of_led_get_optional(struct device *dev, int index) { struct led_classdev *led; led = devm_of_led_get(dev, index); if (IS_ERR(led) && PTR_ERR(led) == -ENOENT) return NULL; return led; } EXPORT_SYMBOL_GPL(devm_of_led_get_optional); static int led_classdev_next_name(const char *init_name, char *name, size_t len) { unsigned int i = 0; int ret = 0; struct device *dev; strscpy(name, init_name, len); while ((ret < len) && (dev = class_find_device_by_name(&leds_class, name))) { put_device(dev); ret = snprintf(name, len, "%s_%u", init_name, ++i); } if (ret >= len) return -ENOMEM; return i; } /** * led_classdev_register_ext - register a new object of led_classdev class * with init data. * * @parent: parent of LED device * @led_cdev: the led_classdev structure for this device. * @init_data: LED class device initialization data */ int led_classdev_register_ext(struct device *parent, struct led_classdev *led_cdev, struct led_init_data *init_data) { char composed_name[LED_MAX_NAME_SIZE]; char final_name[LED_MAX_NAME_SIZE]; const char *proposed_name = composed_name; int ret; if (init_data) { if (init_data->devname_mandatory && !init_data->devicename) { dev_err(parent, "Mandatory device name is missing"); return -EINVAL; } ret = led_compose_name(parent, init_data, composed_name); if (ret < 0) return ret; if (init_data->fwnode) { fwnode_property_read_string(init_data->fwnode, "linux,default-trigger", &led_cdev->default_trigger); if (fwnode_property_present(init_data->fwnode, "retain-state-shutdown")) led_cdev->flags |= LED_RETAIN_AT_SHUTDOWN; fwnode_property_read_u32(init_data->fwnode, "max-brightness", &led_cdev->max_brightness); if (fwnode_property_present(init_data->fwnode, "color")) fwnode_property_read_u32(init_data->fwnode, "color", &led_cdev->color); } } else { proposed_name = led_cdev->name; } ret = led_classdev_next_name(proposed_name, final_name, sizeof(final_name)); if (ret < 0) return ret; else if (ret && led_cdev->flags & LED_REJECT_NAME_CONFLICT) return -EEXIST; else if (ret) dev_warn(parent, "Led %s renamed to %s due to name collision\n", proposed_name, final_name); if (led_cdev->color >= LED_COLOR_ID_MAX) dev_warn(parent, "LED %s color identifier out of range\n", final_name); mutex_init(&led_cdev->led_access); mutex_lock(&led_cdev->led_access); led_cdev->dev = device_create_with_groups(&leds_class, parent, 0, led_cdev, led_cdev->groups, "%s", final_name); if (IS_ERR(led_cdev->dev)) { mutex_unlock(&led_cdev->led_access); return PTR_ERR(led_cdev->dev); } if (init_data && init_data->fwnode) device_set_node(led_cdev->dev, init_data->fwnode); if (led_cdev->flags & LED_BRIGHT_HW_CHANGED) { ret = led_add_brightness_hw_changed(led_cdev); if (ret) { device_unregister(led_cdev->dev); led_cdev->dev = NULL; mutex_unlock(&led_cdev->led_access); return ret; } } led_cdev->work_flags = 0; #ifdef CONFIG_LEDS_TRIGGERS init_rwsem(&led_cdev->trigger_lock); #endif #ifdef CONFIG_LEDS_BRIGHTNESS_HW_CHANGED led_cdev->brightness_hw_changed = -1; #endif /* add to the list of leds */ down_write(&leds_list_lock); list_add_tail(&led_cdev->node, &leds_list); up_write(&leds_list_lock); if (!led_cdev->max_brightness) led_cdev->max_brightness = LED_FULL; led_update_brightness(led_cdev); led_cdev->wq = leds_wq; led_init_core(led_cdev); #ifdef CONFIG_LEDS_TRIGGERS led_trigger_set_default(led_cdev); #endif mutex_unlock(&led_cdev->led_access); dev_dbg(parent, "Registered led device: %s\n", led_cdev->name); return 0; } EXPORT_SYMBOL_GPL(led_classdev_register_ext); /** * led_classdev_unregister - unregisters a object of led_properties class. * @led_cdev: the led device to unregister * * Unregisters a previously registered via led_classdev_register object. */ void led_classdev_unregister(struct led_classdev *led_cdev) { if (IS_ERR_OR_NULL(led_cdev->dev)) return; #ifdef CONFIG_LEDS_TRIGGERS down_write(&led_cdev->trigger_lock); if (led_cdev->trigger) led_trigger_set(led_cdev, NULL); up_write(&led_cdev->trigger_lock); #endif led_cdev->flags |= LED_UNREGISTERING; /* Stop blinking */ led_stop_software_blink(led_cdev); if (!(led_cdev->flags & LED_RETAIN_AT_SHUTDOWN)) led_set_brightness(led_cdev, LED_OFF); flush_work(&led_cdev->set_brightness_work); if (led_cdev->flags & LED_BRIGHT_HW_CHANGED) led_remove_brightness_hw_changed(led_cdev); device_unregister(led_cdev->dev); down_write(&leds_list_lock); list_del(&led_cdev->node); up_write(&leds_list_lock); mutex_destroy(&led_cdev->led_access); } EXPORT_SYMBOL_GPL(led_classdev_unregister); static void devm_led_classdev_release(struct device *dev, void *res) { led_classdev_unregister(*(struct led_classdev **)res); } /** * devm_led_classdev_register_ext - resource managed led_classdev_register_ext() * * @parent: parent of LED device * @led_cdev: the led_classdev structure for this device. * @init_data: LED class device initialization data */ int devm_led_classdev_register_ext(struct device *parent, struct led_classdev *led_cdev, struct led_init_data *init_data) { struct led_classdev **dr; int rc; dr = devres_alloc(devm_led_classdev_release, sizeof(*dr), GFP_KERNEL); if (!dr) return -ENOMEM; rc = led_classdev_register_ext(parent, led_cdev, init_data); if (rc) { devres_free(dr); return rc; } *dr = led_cdev; devres_add(parent, dr); return 0; } EXPORT_SYMBOL_GPL(devm_led_classdev_register_ext); static int devm_led_classdev_match(struct device *dev, void *res, void *data) { struct led_classdev **p = res; if (WARN_ON(!p || !*p)) return 0; return *p == data; } /** * devm_led_classdev_unregister() - resource managed led_classdev_unregister() * @dev: The device to unregister. * @led_cdev: the led_classdev structure for this device. */ void devm_led_classdev_unregister(struct device *dev, struct led_classdev *led_cdev) { WARN_ON(devres_release(dev, devm_led_classdev_release, devm_led_classdev_match, led_cdev)); } EXPORT_SYMBOL_GPL(devm_led_classdev_unregister); static int __init leds_init(void) { leds_wq = alloc_ordered_workqueue("leds", 0); if (!leds_wq) { pr_err("Failed to create LEDs ordered workqueue\n"); return -ENOMEM; } return class_register(&leds_class); } static void __exit leds_exit(void) { class_unregister(&leds_class); destroy_workqueue(leds_wq); } subsys_initcall(leds_init); module_exit(leds_exit); MODULE_AUTHOR("John Lenz, Richard Purdie"); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("LED Class Interface");
32513 32518 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_LOCAL_LOCK_H # error "Do not include directly, include linux/local_lock.h" #endif #include <linux/percpu-defs.h> #include <linux/lockdep.h> #ifndef CONFIG_PREEMPT_RT typedef struct { #ifdef CONFIG_DEBUG_LOCK_ALLOC struct lockdep_map dep_map; struct task_struct *owner; #endif } local_lock_t; /* local_trylock() and local_trylock_irqsave() only work with local_trylock_t */ typedef struct { #ifdef CONFIG_DEBUG_LOCK_ALLOC struct lockdep_map dep_map; struct task_struct *owner; #endif u8 acquired; } local_trylock_t; #ifdef CONFIG_DEBUG_LOCK_ALLOC # define LOCAL_LOCK_DEBUG_INIT(lockname) \ .dep_map = { \ .name = #lockname, \ .wait_type_inner = LD_WAIT_CONFIG, \ .lock_type = LD_LOCK_PERCPU, \ }, \ .owner = NULL, # define LOCAL_TRYLOCK_DEBUG_INIT(lockname) \ LOCAL_LOCK_DEBUG_INIT(lockname) static inline void local_lock_acquire(local_lock_t *l) { lock_map_acquire(&l->dep_map); DEBUG_LOCKS_WARN_ON(l->owner); l->owner = current; } static inline void local_trylock_acquire(local_lock_t *l) { lock_map_acquire_try(&l->dep_map); DEBUG_LOCKS_WARN_ON(l->owner); l->owner = current; } static inline void local_lock_release(local_lock_t *l) { DEBUG_LOCKS_WARN_ON(l->owner != current); l->owner = NULL; lock_map_release(&l->dep_map); } static inline void local_lock_debug_init(local_lock_t *l) { l->owner = NULL; } #else /* CONFIG_DEBUG_LOCK_ALLOC */ # define LOCAL_LOCK_DEBUG_INIT(lockname) # define LOCAL_TRYLOCK_DEBUG_INIT(lockname) static inline void local_lock_acquire(local_lock_t *l) { } static inline void local_trylock_acquire(local_lock_t *l) { } static inline void local_lock_release(local_lock_t *l) { } static inline void local_lock_debug_init(local_lock_t *l) { } #endif /* !CONFIG_DEBUG_LOCK_ALLOC */ #define INIT_LOCAL_LOCK(lockname) { LOCAL_LOCK_DEBUG_INIT(lockname) } #define INIT_LOCAL_TRYLOCK(lockname) { LOCAL_TRYLOCK_DEBUG_INIT(lockname) } #define __local_lock_init(lock) \ do { \ static struct lock_class_key __key; \ \ debug_check_no_locks_freed((void *)lock, sizeof(*lock));\ lockdep_init_map_type(&(lock)->dep_map, #lock, &__key, \ 0, LD_WAIT_CONFIG, LD_WAIT_INV, \ LD_LOCK_PERCPU); \ local_lock_debug_init(lock); \ } while (0) #define __local_trylock_init(lock) __local_lock_init((local_lock_t *)lock) #define __spinlock_nested_bh_init(lock) \ do { \ static struct lock_class_key __key; \ \ debug_check_no_locks_freed((void *)lock, sizeof(*lock));\ lockdep_init_map_type(&(lock)->dep_map, #lock, &__key, \ 0, LD_WAIT_CONFIG, LD_WAIT_INV, \ LD_LOCK_NORMAL); \ local_lock_debug_init(lock); \ } while (0) #define __local_lock_acquire(lock) \ do { \ local_trylock_t *tl; \ local_lock_t *l; \ \ l = (local_lock_t *)(lock); \ tl = (local_trylock_t *)l; \ _Generic((lock), \ local_trylock_t *: ({ \ lockdep_assert(tl->acquired == 0); \ WRITE_ONCE(tl->acquired, 1); \ }), \ local_lock_t *: (void)0); \ local_lock_acquire(l); \ } while (0) #define __local_lock(lock) \ do { \ preempt_disable(); \ __local_lock_acquire(lock); \ } while (0) #define __local_lock_irq(lock) \ do { \ local_irq_disable(); \ __local_lock_acquire(lock); \ } while (0) #define __local_lock_irqsave(lock, flags) \ do { \ local_irq_save(flags); \ __local_lock_acquire(lock); \ } while (0) #define __local_trylock(lock) \ ({ \ local_trylock_t *tl; \ \ preempt_disable(); \ tl = (lock); \ if (READ_ONCE(tl->acquired)) { \ preempt_enable(); \ tl = NULL; \ } else { \ WRITE_ONCE(tl->acquired, 1); \ local_trylock_acquire( \ (local_lock_t *)tl); \ } \ !!tl; \ }) #define __local_trylock_irqsave(lock, flags) \ ({ \ local_trylock_t *tl; \ \ local_irq_save(flags); \ tl = (lock); \ if (READ_ONCE(tl->acquired)) { \ local_irq_restore(flags); \ tl = NULL; \ } else { \ WRITE_ONCE(tl->acquired, 1); \ local_trylock_acquire( \ (local_lock_t *)tl); \ } \ !!tl; \ }) /* preemption or migration must be disabled before calling __local_lock_is_locked */ #define __local_lock_is_locked(lock) READ_ONCE(this_cpu_ptr(lock)->acquired) #define __local_lock_release(lock) \ do { \ local_trylock_t *tl; \ local_lock_t *l; \ \ l = (local_lock_t *)(lock); \ tl = (local_trylock_t *)l; \ local_lock_release(l); \ _Generic((lock), \ local_trylock_t *: ({ \ lockdep_assert(tl->acquired == 1); \ WRITE_ONCE(tl->acquired, 0); \ }), \ local_lock_t *: (void)0); \ } while (0) #define __local_unlock(lock) \ do { \ __local_lock_release(lock); \ preempt_enable(); \ } while (0) #define __local_unlock_irq(lock) \ do { \ __local_lock_release(lock); \ local_irq_enable(); \ } while (0) #define __local_unlock_irqrestore(lock, flags) \ do { \ __local_lock_release(lock); \ local_irq_restore(flags); \ } while (0) #define __local_lock_nested_bh(lock) \ do { \ lockdep_assert_in_softirq(); \ local_lock_acquire((lock)); \ } while (0) #define __local_unlock_nested_bh(lock) \ local_lock_release((lock)) #else /* !CONFIG_PREEMPT_RT */ /* * On PREEMPT_RT local_lock maps to a per CPU spinlock, which protects the * critical section while staying preemptible. */ typedef spinlock_t local_lock_t; typedef spinlock_t local_trylock_t; #define INIT_LOCAL_LOCK(lockname) __LOCAL_SPIN_LOCK_UNLOCKED((lockname)) #define INIT_LOCAL_TRYLOCK(lockname) __LOCAL_SPIN_LOCK_UNLOCKED((lockname)) #define __local_lock_init(l) \ do { \ local_spin_lock_init((l)); \ } while (0) #define __local_trylock_init(l) __local_lock_init(l) #define __local_lock(__lock) \ do { \ migrate_disable(); \ spin_lock((__lock)); \ } while (0) #define __local_lock_irq(lock) __local_lock(lock) #define __local_lock_irqsave(lock, flags) \ do { \ typecheck(unsigned long, flags); \ flags = 0; \ __local_lock(lock); \ } while (0) #define __local_unlock(__lock) \ do { \ spin_unlock((__lock)); \ migrate_enable(); \ } while (0) #define __local_unlock_irq(lock) __local_unlock(lock) #define __local_unlock_irqrestore(lock, flags) __local_unlock(lock) #define __local_lock_nested_bh(lock) \ do { \ lockdep_assert_in_softirq_func(); \ spin_lock((lock)); \ } while (0) #define __local_unlock_nested_bh(lock) \ do { \ spin_unlock((lock)); \ } while (0) #define __local_trylock(lock) \ ({ \ int __locked; \ \ if (in_nmi() | in_hardirq()) { \ __locked = 0; \ } else { \ migrate_disable(); \ __locked = spin_trylock((lock)); \ if (!__locked) \ migrate_enable(); \ } \ __locked; \ }) #define __local_trylock_irqsave(lock, flags) \ ({ \ typecheck(unsigned long, flags); \ flags = 0; \ __local_trylock(lock); \ }) /* migration must be disabled before calling __local_lock_is_locked */ #define __local_lock_is_locked(__lock) \ (rt_mutex_owner(&this_cpu_ptr(__lock)->lock) == current) #endif /* CONFIG_PREEMPT_RT */
5 5 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ELEVATOR_H #define _ELEVATOR_H #include <linux/percpu.h> #include <linux/hashtable.h> #include "blk-mq.h" struct io_cq; struct elevator_type; struct blk_mq_debugfs_attr; /* * Return values from elevator merger */ enum elv_merge { ELEVATOR_NO_MERGE = 0, ELEVATOR_FRONT_MERGE = 1, ELEVATOR_BACK_MERGE = 2, ELEVATOR_DISCARD_MERGE = 3, }; struct blk_mq_alloc_data; struct blk_mq_hw_ctx; struct elevator_tags { /* num. of hardware queues for which tags are allocated */ unsigned int nr_hw_queues; /* depth used while allocating tags */ unsigned int nr_requests; /* shared tag is stored at index 0 */ struct blk_mq_tags *tags[]; }; struct elevator_mq_ops { int (*init_sched)(struct request_queue *, struct elevator_queue *); void (*exit_sched)(struct elevator_queue *); int (*init_hctx)(struct blk_mq_hw_ctx *, unsigned int); void (*exit_hctx)(struct blk_mq_hw_ctx *, unsigned int); void (*depth_updated)(struct request_queue *); bool (*allow_merge)(struct request_queue *, struct request *, struct bio *); bool (*bio_merge)(struct request_queue *, struct bio *, unsigned int); int (*request_merge)(struct request_queue *q, struct request **, struct bio *); void (*request_merged)(struct request_queue *, struct request *, enum elv_merge); void (*requests_merged)(struct request_queue *, struct request *, struct request *); void (*limit_depth)(blk_opf_t, struct blk_mq_alloc_data *); void (*prepare_request)(struct request *); void (*finish_request)(struct request *); void (*insert_requests)(struct blk_mq_hw_ctx *hctx, struct list_head *list, blk_insert_t flags); struct request *(*dispatch_request)(struct blk_mq_hw_ctx *); bool (*has_work)(struct blk_mq_hw_ctx *); void (*completed_request)(struct request *, u64); void (*requeue_request)(struct request *); struct request *(*former_request)(struct request_queue *, struct request *); struct request *(*next_request)(struct request_queue *, struct request *); void (*init_icq)(struct io_cq *); void (*exit_icq)(struct io_cq *); }; #define ELV_NAME_MAX (16) struct elv_fs_entry { struct attribute attr; ssize_t (*show)(struct elevator_queue *, char *); ssize_t (*store)(struct elevator_queue *, const char *, size_t); }; /* * identifies an elevator type, such as AS or deadline */ struct elevator_type { /* managed by elevator core */ struct kmem_cache *icq_cache; /* fields provided by elevator implementation */ struct elevator_mq_ops ops; size_t icq_size; /* see iocontext.h */ size_t icq_align; /* ditto */ const struct elv_fs_entry *elevator_attrs; const char *elevator_name; const char *elevator_alias; struct module *elevator_owner; #ifdef CONFIG_BLK_DEBUG_FS const struct blk_mq_debugfs_attr *queue_debugfs_attrs; const struct blk_mq_debugfs_attr *hctx_debugfs_attrs; #endif /* managed by elevator core */ char icq_cache_name[ELV_NAME_MAX + 6]; /* elvname + "_io_cq" */ struct list_head list; }; static inline bool elevator_tryget(struct elevator_type *e) { return try_module_get(e->elevator_owner); } static inline void __elevator_get(struct elevator_type *e) { __module_get(e->elevator_owner); } static inline void elevator_put(struct elevator_type *e) { module_put(e->elevator_owner); } #define ELV_HASH_BITS 6 void elv_rqhash_del(struct request_queue *q, struct request *rq); void elv_rqhash_add(struct request_queue *q, struct request *rq); void elv_rqhash_reposition(struct request_queue *q, struct request *rq); struct request *elv_rqhash_find(struct request_queue *q, sector_t offset); /* * each queue has an elevator_queue associated with it */ struct elevator_queue { struct elevator_type *type; struct elevator_tags *et; void *elevator_data; struct kobject kobj; struct mutex sysfs_lock; unsigned long flags; DECLARE_HASHTABLE(hash, ELV_HASH_BITS); }; #define ELEVATOR_FLAG_REGISTERED 0 #define ELEVATOR_FLAG_DYING 1 #define ELEVATOR_FLAG_ENABLE_WBT_ON_EXIT 2 /* * block elevator interface */ extern enum elv_merge elv_merge(struct request_queue *, struct request **, struct bio *); extern void elv_merge_requests(struct request_queue *, struct request *, struct request *); extern void elv_merged_request(struct request_queue *, struct request *, enum elv_merge); extern bool elv_attempt_insert_merge(struct request_queue *, struct request *, struct list_head *); extern struct request *elv_former_request(struct request_queue *, struct request *); extern struct request *elv_latter_request(struct request_queue *, struct request *); void elevator_init_mq(struct request_queue *q); /* * io scheduler registration */ extern int elv_register(struct elevator_type *); extern void elv_unregister(struct elevator_type *); /* * io scheduler sysfs switching */ ssize_t elv_iosched_show(struct gendisk *disk, char *page); ssize_t elv_iosched_store(struct gendisk *disk, const char *page, size_t count); extern bool elv_bio_merge_ok(struct request *, struct bio *); struct elevator_queue *elevator_alloc(struct request_queue *, struct elevator_type *, struct elevator_tags *); /* * Helper functions. */ extern struct request *elv_rb_former_request(struct request_queue *, struct request *); extern struct request *elv_rb_latter_request(struct request_queue *, struct request *); /* * rb support functions. */ extern void elv_rb_add(struct rb_root *, struct request *); extern void elv_rb_del(struct rb_root *, struct request *); extern struct request *elv_rb_find(struct rb_root *, sector_t); /* * Insertion selection */ #define ELEVATOR_INSERT_FRONT 1 #define ELEVATOR_INSERT_BACK 2 #define ELEVATOR_INSERT_SORT 3 #define ELEVATOR_INSERT_REQUEUE 4 #define ELEVATOR_INSERT_FLUSH 5 #define ELEVATOR_INSERT_SORT_MERGE 6 #define rb_entry_rq(node) rb_entry((node), struct request, rb_node) #define rq_entry_fifo(ptr) list_entry((ptr), struct request, queuelist) #define rq_fifo_clear(rq) list_del_init(&(rq)->queuelist) void blk_mq_sched_reg_debugfs(struct request_queue *q); void blk_mq_sched_unreg_debugfs(struct request_queue *q); #endif /* _ELEVATOR_H */
4 180 89 1 259 261 9 70 78 79 79 79 260 79 261 79 263 264 182 181 1 259 261 3 264 263 262 6965 6916 79 6912 6925 24 78 79 5 5 5 5 5 5 5 5 5 5 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 // SPDX-License-Identifier: GPL-2.0 /* * blk-mq scheduling framework * * Copyright (C) 2016 Jens Axboe */ #include <linux/kernel.h> #include <linux/module.h> #include <linux/list_sort.h> #include <trace/events/block.h> #include "blk.h" #include "blk-mq.h" #include "blk-mq-debugfs.h" #include "blk-mq-sched.h" #include "blk-wbt.h" /* * Mark a hardware queue as needing a restart. */ void blk_mq_sched_mark_restart_hctx(struct blk_mq_hw_ctx *hctx) { if (test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state)) return; set_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state); } EXPORT_SYMBOL_GPL(blk_mq_sched_mark_restart_hctx); void __blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx) { clear_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state); /* * Order clearing SCHED_RESTART and list_empty_careful(&hctx->dispatch) * in blk_mq_run_hw_queue(). Its pair is the barrier in * blk_mq_dispatch_rq_list(). So dispatch code won't see SCHED_RESTART, * meantime new request added to hctx->dispatch is missed to check in * blk_mq_run_hw_queue(). */ smp_mb(); blk_mq_run_hw_queue(hctx, true); } static int sched_rq_cmp(void *priv, const struct list_head *a, const struct list_head *b) { struct request *rqa = container_of(a, struct request, queuelist); struct request *rqb = container_of(b, struct request, queuelist); return rqa->mq_hctx > rqb->mq_hctx; } static bool blk_mq_dispatch_hctx_list(struct list_head *rq_list) { struct blk_mq_hw_ctx *hctx = list_first_entry(rq_list, struct request, queuelist)->mq_hctx; struct request *rq; LIST_HEAD(hctx_list); list_for_each_entry(rq, rq_list, queuelist) { if (rq->mq_hctx != hctx) { list_cut_before(&hctx_list, rq_list, &rq->queuelist); goto dispatch; } } list_splice_tail_init(rq_list, &hctx_list); dispatch: return blk_mq_dispatch_rq_list(hctx, &hctx_list, false); } #define BLK_MQ_BUDGET_DELAY 3 /* ms units */ /* * Only SCSI implements .get_budget and .put_budget, and SCSI restarts * its queue by itself in its completion handler, so we don't need to * restart queue if .get_budget() fails to get the budget. * * Returns -EAGAIN if hctx->dispatch was found non-empty and run_work has to * be run again. This is necessary to avoid starving flushes. */ static int __blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx) { struct request_queue *q = hctx->queue; struct elevator_queue *e = q->elevator; bool multi_hctxs = false, run_queue = false; bool dispatched = false, busy = false; unsigned int max_dispatch; LIST_HEAD(rq_list); int count = 0; if (hctx->dispatch_busy) max_dispatch = 1; else max_dispatch = hctx->queue->nr_requests; do { struct request *rq; int budget_token; if (e->type->ops.has_work && !e->type->ops.has_work(hctx)) break; if (!list_empty_careful(&hctx->dispatch)) { busy = true; break; } budget_token = blk_mq_get_dispatch_budget(q); if (budget_token < 0) break; rq = e->type->ops.dispatch_request(hctx); if (!rq) { blk_mq_put_dispatch_budget(q, budget_token); /* * We're releasing without dispatching. Holding the * budget could have blocked any "hctx"s with the * same queue and if we didn't dispatch then there's * no guarantee anyone will kick the queue. Kick it * ourselves. */ run_queue = true; break; } blk_mq_set_rq_budget_token(rq, budget_token); /* * Now this rq owns the budget which has to be released * if this rq won't be queued to driver via .queue_rq() * in blk_mq_dispatch_rq_list(). */ list_add_tail(&rq->queuelist, &rq_list); count++; if (rq->mq_hctx != hctx) multi_hctxs = true; /* * If we cannot get tag for the request, stop dequeueing * requests from the IO scheduler. We are unlikely to be able * to submit them anyway and it creates false impression for * scheduling heuristics that the device can take more IO. */ if (!blk_mq_get_driver_tag(rq)) break; } while (count < max_dispatch); if (!count) { if (run_queue) blk_mq_delay_run_hw_queues(q, BLK_MQ_BUDGET_DELAY); } else if (multi_hctxs) { /* * Requests from different hctx may be dequeued from some * schedulers, such as bfq and deadline. * * Sort the requests in the list according to their hctx, * dispatch batching requests from same hctx at a time. */ list_sort(NULL, &rq_list, sched_rq_cmp); do { dispatched |= blk_mq_dispatch_hctx_list(&rq_list); } while (!list_empty(&rq_list)); } else { dispatched = blk_mq_dispatch_rq_list(hctx, &rq_list, false); } if (busy) return -EAGAIN; return !!dispatched; } static int blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx) { unsigned long end = jiffies + HZ; int ret; do { ret = __blk_mq_do_dispatch_sched(hctx); if (ret != 1) break; if (need_resched() || time_is_before_jiffies(end)) { blk_mq_delay_run_hw_queue(hctx, 0); break; } } while (1); return ret; } static struct blk_mq_ctx *blk_mq_next_ctx(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx) { unsigned short idx = ctx->index_hw[hctx->type]; if (++idx == hctx->nr_ctx) idx = 0; return hctx->ctxs[idx]; } /* * Only SCSI implements .get_budget and .put_budget, and SCSI restarts * its queue by itself in its completion handler, so we don't need to * restart queue if .get_budget() fails to get the budget. * * Returns -EAGAIN if hctx->dispatch was found non-empty and run_work has to * be run again. This is necessary to avoid starving flushes. */ static int blk_mq_do_dispatch_ctx(struct blk_mq_hw_ctx *hctx) { struct request_queue *q = hctx->queue; LIST_HEAD(rq_list); struct blk_mq_ctx *ctx = READ_ONCE(hctx->dispatch_from); int ret = 0; struct request *rq; do { int budget_token; if (!list_empty_careful(&hctx->dispatch)) { ret = -EAGAIN; break; } if (!sbitmap_any_bit_set(&hctx->ctx_map)) break; budget_token = blk_mq_get_dispatch_budget(q); if (budget_token < 0) break; rq = blk_mq_dequeue_from_ctx(hctx, ctx); if (!rq) { blk_mq_put_dispatch_budget(q, budget_token); /* * We're releasing without dispatching. Holding the * budget could have blocked any "hctx"s with the * same queue and if we didn't dispatch then there's * no guarantee anyone will kick the queue. Kick it * ourselves. */ blk_mq_delay_run_hw_queues(q, BLK_MQ_BUDGET_DELAY); break; } blk_mq_set_rq_budget_token(rq, budget_token); /* * Now this rq owns the budget which has to be released * if this rq won't be queued to driver via .queue_rq() * in blk_mq_dispatch_rq_list(). */ list_add(&rq->queuelist, &rq_list); /* round robin for fair dispatch */ ctx = blk_mq_next_ctx(hctx, rq->mq_ctx); } while (blk_mq_dispatch_rq_list(rq->mq_hctx, &rq_list, false)); WRITE_ONCE(hctx->dispatch_from, ctx); return ret; } static int __blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx) { bool need_dispatch = false; LIST_HEAD(rq_list); /* * If we have previous entries on our dispatch list, grab them first for * more fair dispatch. */ if (!list_empty_careful(&hctx->dispatch)) { spin_lock(&hctx->lock); if (!list_empty(&hctx->dispatch)) list_splice_init(&hctx->dispatch, &rq_list); spin_unlock(&hctx->lock); } /* * Only ask the scheduler for requests, if we didn't have residual * requests from the dispatch list. This is to avoid the case where * we only ever dispatch a fraction of the requests available because * of low device queue depth. Once we pull requests out of the IO * scheduler, we can no longer merge or sort them. So it's best to * leave them there for as long as we can. Mark the hw queue as * needing a restart in that case. * * We want to dispatch from the scheduler if there was nothing * on the dispatch list or we were able to dispatch from the * dispatch list. */ if (!list_empty(&rq_list)) { blk_mq_sched_mark_restart_hctx(hctx); if (!blk_mq_dispatch_rq_list(hctx, &rq_list, true)) return 0; need_dispatch = true; } else { need_dispatch = hctx->dispatch_busy; } if (hctx->queue->elevator) return blk_mq_do_dispatch_sched(hctx); /* dequeue request one by one from sw queue if queue is busy */ if (need_dispatch) return blk_mq_do_dispatch_ctx(hctx); blk_mq_flush_busy_ctxs(hctx, &rq_list); blk_mq_dispatch_rq_list(hctx, &rq_list, true); return 0; } void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx) { struct request_queue *q = hctx->queue; /* RCU or SRCU read lock is needed before checking quiesced flag */ if (unlikely(blk_mq_hctx_stopped(hctx) || blk_queue_quiesced(q))) return; /* * A return of -EAGAIN is an indication that hctx->dispatch is not * empty and we must run again in order to avoid starving flushes. */ if (__blk_mq_sched_dispatch_requests(hctx) == -EAGAIN) { if (__blk_mq_sched_dispatch_requests(hctx) == -EAGAIN) blk_mq_run_hw_queue(hctx, true); } } bool blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio, unsigned int nr_segs) { struct elevator_queue *e = q->elevator; struct blk_mq_ctx *ctx; struct blk_mq_hw_ctx *hctx; bool ret = false; enum hctx_type type; if (e && e->type->ops.bio_merge) { ret = e->type->ops.bio_merge(q, bio, nr_segs); goto out_put; } ctx = blk_mq_get_ctx(q); hctx = blk_mq_map_queue(bio->bi_opf, ctx); type = hctx->type; if (list_empty_careful(&ctx->rq_lists[type])) goto out_put; /* default per sw-queue merge */ spin_lock(&ctx->lock); /* * Reverse check our software queue for entries that we could * potentially merge with. Currently includes a hand-wavy stop * count of 8, to not spend too much time checking for merges. */ if (blk_bio_list_merge(q, &ctx->rq_lists[type], bio, nr_segs)) ret = true; spin_unlock(&ctx->lock); out_put: return ret; } bool blk_mq_sched_try_insert_merge(struct request_queue *q, struct request *rq, struct list_head *free) { return rq_mergeable(rq) && elv_attempt_insert_merge(q, rq, free); } EXPORT_SYMBOL_GPL(blk_mq_sched_try_insert_merge); /* called in queue's release handler, tagset has gone away */ static void blk_mq_sched_tags_teardown(struct request_queue *q, unsigned int flags) { struct blk_mq_hw_ctx *hctx; unsigned long i; queue_for_each_hw_ctx(q, hctx, i) hctx->sched_tags = NULL; if (blk_mq_is_shared_tags(flags)) q->sched_shared_tags = NULL; } void blk_mq_sched_reg_debugfs(struct request_queue *q) { struct blk_mq_hw_ctx *hctx; unsigned long i; mutex_lock(&q->debugfs_mutex); blk_mq_debugfs_register_sched(q); queue_for_each_hw_ctx(q, hctx, i) blk_mq_debugfs_register_sched_hctx(q, hctx); mutex_unlock(&q->debugfs_mutex); } void blk_mq_sched_unreg_debugfs(struct request_queue *q) { struct blk_mq_hw_ctx *hctx; unsigned long i; mutex_lock(&q->debugfs_mutex); queue_for_each_hw_ctx(q, hctx, i) blk_mq_debugfs_unregister_sched_hctx(hctx); blk_mq_debugfs_unregister_sched(q); mutex_unlock(&q->debugfs_mutex); } void blk_mq_free_sched_tags(struct elevator_tags *et, struct blk_mq_tag_set *set) { unsigned long i; /* Shared tags are stored at index 0 in @tags. */ if (blk_mq_is_shared_tags(set->flags)) blk_mq_free_map_and_rqs(set, et->tags[0], BLK_MQ_NO_HCTX_IDX); else { for (i = 0; i < et->nr_hw_queues; i++) blk_mq_free_map_and_rqs(set, et->tags[i], i); } kfree(et); } void blk_mq_free_sched_tags_batch(struct xarray *et_table, struct blk_mq_tag_set *set) { struct request_queue *q; struct elevator_tags *et; lockdep_assert_held_write(&set->update_nr_hwq_lock); list_for_each_entry(q, &set->tag_list, tag_set_list) { /* * Accessing q->elevator without holding q->elevator_lock is * safe because we're holding here set->update_nr_hwq_lock in * the writer context. So, scheduler update/switch code (which * acquires the same lock but in the reader context) can't run * concurrently. */ if (q->elevator) { et = xa_load(et_table, q->id); if (unlikely(!et)) WARN_ON_ONCE(1); else blk_mq_free_sched_tags(et, set); } } } struct elevator_tags *blk_mq_alloc_sched_tags(struct blk_mq_tag_set *set, unsigned int nr_hw_queues, unsigned int nr_requests) { unsigned int nr_tags; int i; struct elevator_tags *et; gfp_t gfp = GFP_NOIO | __GFP_ZERO | __GFP_NOWARN | __GFP_NORETRY; if (blk_mq_is_shared_tags(set->flags)) nr_tags = 1; else nr_tags = nr_hw_queues; et = kmalloc(sizeof(struct elevator_tags) + nr_tags * sizeof(struct blk_mq_tags *), gfp); if (!et) return NULL; et->nr_requests = nr_requests; et->nr_hw_queues = nr_hw_queues; if (blk_mq_is_shared_tags(set->flags)) { /* Shared tags are stored at index 0 in @tags. */ et->tags[0] = blk_mq_alloc_map_and_rqs(set, BLK_MQ_NO_HCTX_IDX, MAX_SCHED_RQ); if (!et->tags[0]) goto out; } else { for (i = 0; i < et->nr_hw_queues; i++) { et->tags[i] = blk_mq_alloc_map_and_rqs(set, i, et->nr_requests); if (!et->tags[i]) goto out_unwind; } } return et; out_unwind: while (--i >= 0) blk_mq_free_map_and_rqs(set, et->tags[i], i); out: kfree(et); return NULL; } int blk_mq_alloc_sched_tags_batch(struct xarray *et_table, struct blk_mq_tag_set *set, unsigned int nr_hw_queues) { struct request_queue *q; struct elevator_tags *et; gfp_t gfp = GFP_NOIO | __GFP_ZERO | __GFP_NOWARN | __GFP_NORETRY; lockdep_assert_held_write(&set->update_nr_hwq_lock); list_for_each_entry(q, &set->tag_list, tag_set_list) { /* * Accessing q->elevator without holding q->elevator_lock is * safe because we're holding here set->update_nr_hwq_lock in * the writer context. So, scheduler update/switch code (which * acquires the same lock but in the reader context) can't run * concurrently. */ if (q->elevator) { et = blk_mq_alloc_sched_tags(set, nr_hw_queues, blk_mq_default_nr_requests(set)); if (!et) goto out_unwind; if (xa_insert(et_table, q->id, et, gfp)) goto out_free_tags; } } return 0; out_free_tags: blk_mq_free_sched_tags(et, set); out_unwind: list_for_each_entry_continue_reverse(q, &set->tag_list, tag_set_list) { if (q->elevator) { et = xa_load(et_table, q->id); if (et) blk_mq_free_sched_tags(et, set); } } return -ENOMEM; } /* caller must have a reference to @e, will grab another one if successful */ int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e, struct elevator_tags *et) { unsigned int flags = q->tag_set->flags; struct blk_mq_hw_ctx *hctx; struct elevator_queue *eq; unsigned long i; int ret; eq = elevator_alloc(q, e, et); if (!eq) return -ENOMEM; q->nr_requests = et->nr_requests; if (blk_mq_is_shared_tags(flags)) { /* Shared tags are stored at index 0 in @et->tags. */ q->sched_shared_tags = et->tags[0]; blk_mq_tag_update_sched_shared_tags(q); } queue_for_each_hw_ctx(q, hctx, i) { if (blk_mq_is_shared_tags(flags)) hctx->sched_tags = q->sched_shared_tags; else hctx->sched_tags = et->tags[i]; } ret = e->ops.init_sched(q, eq); if (ret) goto out; queue_for_each_hw_ctx(q, hctx, i) { if (e->ops.init_hctx) { ret = e->ops.init_hctx(hctx, i); if (ret) { blk_mq_exit_sched(q, eq); kobject_put(&eq->kobj); return ret; } } } return 0; out: blk_mq_sched_tags_teardown(q, flags); kobject_put(&eq->kobj); q->elevator = NULL; return ret; } /* * called in either blk_queue_cleanup or elevator_switch, tagset * is required for freeing requests */ void blk_mq_sched_free_rqs(struct request_queue *q) { struct blk_mq_hw_ctx *hctx; unsigned long i; if (blk_mq_is_shared_tags(q->tag_set->flags)) { blk_mq_free_rqs(q->tag_set, q->sched_shared_tags, BLK_MQ_NO_HCTX_IDX); } else { queue_for_each_hw_ctx(q, hctx, i) { if (hctx->sched_tags) blk_mq_free_rqs(q->tag_set, hctx->sched_tags, i); } } } void blk_mq_exit_sched(struct request_queue *q, struct elevator_queue *e) { struct blk_mq_hw_ctx *hctx; unsigned long i; unsigned int flags = 0; queue_for_each_hw_ctx(q, hctx, i) { if (e->type->ops.exit_hctx && hctx->sched_data) { e->type->ops.exit_hctx(hctx, i); hctx->sched_data = NULL; } flags = hctx->flags; } if (e->type->ops.exit_sched) e->type->ops.exit_sched(e); blk_mq_sched_tags_teardown(q, flags); set_bit(ELEVATOR_FLAG_DYING, &q->elevator->flags); q->elevator = NULL; }
5 5 5 24 1 8 15 16 8 1 7 17 17 9 8 8 8 8 24 24 24 24 24 1 16 8 4 12 7 3 12 15 15 3 1 2 2 2 1 2 1 1 2 2 2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 // SPDX-License-Identifier: GPL-2.0-or-later /* * net/sched/cls_matchll.c Match-all classifier * * Copyright (c) 2016 Jiri Pirko <jiri@mellanox.com> */ #include <linux/kernel.h> #include <linux/init.h> #include <linux/module.h> #include <linux/percpu.h> #include <net/sch_generic.h> #include <net/pkt_cls.h> #include <net/tc_wrapper.h> struct cls_mall_head { struct tcf_exts exts; struct tcf_result res; u32 handle; u32 flags; unsigned int in_hw_count; struct tc_matchall_pcnt __percpu *pf; struct rcu_work rwork; bool deleting; }; TC_INDIRECT_SCOPE int mall_classify(struct sk_buff *skb, const struct tcf_proto *tp, struct tcf_result *res) { struct cls_mall_head *head = rcu_dereference_bh(tp->root); if (unlikely(!head)) return -1; if (tc_skip_sw(head->flags)) return -1; *res = head->res; __this_cpu_inc(head->pf->rhit); return tcf_exts_exec(skb, &head->exts, res); } static int mall_init(struct tcf_proto *tp) { return 0; } static void __mall_destroy(struct cls_mall_head *head) { tcf_exts_destroy(&head->exts); tcf_exts_put_net(&head->exts); free_percpu(head->pf); kfree(head); } static void mall_destroy_work(struct work_struct *work) { struct cls_mall_head *head = container_of(to_rcu_work(work), struct cls_mall_head, rwork); rtnl_lock(); __mall_destroy(head); rtnl_unlock(); } static void mall_destroy_hw_filter(struct tcf_proto *tp, struct cls_mall_head *head, unsigned long cookie, struct netlink_ext_ack *extack) { struct tc_cls_matchall_offload cls_mall = {}; struct tcf_block *block = tp->chain->block; tc_cls_common_offload_init(&cls_mall.common, tp, head->flags, extack); cls_mall.command = TC_CLSMATCHALL_DESTROY; cls_mall.cookie = cookie; tc_setup_cb_destroy(block, tp, TC_SETUP_CLSMATCHALL, &cls_mall, false, &head->flags, &head->in_hw_count, true); } static int mall_replace_hw_filter(struct tcf_proto *tp, struct cls_mall_head *head, unsigned long cookie, struct netlink_ext_ack *extack) { struct tc_cls_matchall_offload cls_mall = {}; struct tcf_block *block = tp->chain->block; bool skip_sw = tc_skip_sw(head->flags); int err; cls_mall.rule = flow_rule_alloc(tcf_exts_num_actions(&head->exts)); if (!cls_mall.rule) return -ENOMEM; tc_cls_common_offload_init(&cls_mall.common, tp, head->flags, extack); cls_mall.command = TC_CLSMATCHALL_REPLACE; cls_mall.cookie = cookie; err = tc_setup_offload_action(&cls_mall.rule->action, &head->exts, cls_mall.common.extack); if (err) { kfree(cls_mall.rule); mall_destroy_hw_filter(tp, head, cookie, NULL); return skip_sw ? err : 0; } err = tc_setup_cb_add(block, tp, TC_SETUP_CLSMATCHALL, &cls_mall, skip_sw, &head->flags, &head->in_hw_count, true); tc_cleanup_offload_action(&cls_mall.rule->action); kfree(cls_mall.rule); if (err) { mall_destroy_hw_filter(tp, head, cookie, NULL); return err; } if (skip_sw && !(head->flags & TCA_CLS_FLAGS_IN_HW)) return -EINVAL; return 0; } static void mall_destroy(struct tcf_proto *tp, bool rtnl_held, struct netlink_ext_ack *extack) { struct cls_mall_head *head = rtnl_dereference(tp->root); if (!head) return; tcf_unbind_filter(tp, &head->res); if (!tc_skip_hw(head->flags)) mall_destroy_hw_filter(tp, head, (unsigned long) head, extack); if (tcf_exts_get_net(&head->exts)) tcf_queue_work(&head->rwork, mall_destroy_work); else __mall_destroy(head); } static void *mall_get(struct tcf_proto *tp, u32 handle) { struct cls_mall_head *head = rtnl_dereference(tp->root); if (head && head->handle == handle) return head; return NULL; } static const struct nla_policy mall_policy[TCA_MATCHALL_MAX + 1] = { [TCA_MATCHALL_UNSPEC] = { .type = NLA_UNSPEC }, [TCA_MATCHALL_CLASSID] = { .type = NLA_U32 }, [TCA_MATCHALL_FLAGS] = { .type = NLA_U32 }, }; static int mall_change(struct net *net, struct sk_buff *in_skb, struct tcf_proto *tp, unsigned long base, u32 handle, struct nlattr **tca, void **arg, u32 flags, struct netlink_ext_ack *extack) { struct cls_mall_head *head = rtnl_dereference(tp->root); struct nlattr *tb[TCA_MATCHALL_MAX + 1]; bool bound_to_filter = false; struct cls_mall_head *new; u32 userflags = 0; int err; if (!tca[TCA_OPTIONS]) return -EINVAL; if (head) return -EEXIST; err = nla_parse_nested_deprecated(tb, TCA_MATCHALL_MAX, tca[TCA_OPTIONS], mall_policy, NULL); if (err < 0) return err; if (tb[TCA_MATCHALL_FLAGS]) { userflags = nla_get_u32(tb[TCA_MATCHALL_FLAGS]); if (!tc_flags_valid(userflags)) return -EINVAL; } new = kzalloc(sizeof(*new), GFP_KERNEL); if (!new) return -ENOBUFS; err = tcf_exts_init(&new->exts, net, TCA_MATCHALL_ACT, 0); if (err) goto err_exts_init; if (!handle) handle = 1; new->handle = handle; new->flags = userflags; new->pf = alloc_percpu(struct tc_matchall_pcnt); if (!new->pf) { err = -ENOMEM; goto err_alloc_percpu; } err = tcf_exts_validate_ex(net, tp, tb, tca[TCA_RATE], &new->exts, flags, new->flags, extack); if (err < 0) goto err_set_parms; if (tb[TCA_MATCHALL_CLASSID]) { new->res.classid = nla_get_u32(tb[TCA_MATCHALL_CLASSID]); tcf_bind_filter(tp, &new->res, base); bound_to_filter = true; } if (!tc_skip_hw(new->flags)) { err = mall_replace_hw_filter(tp, new, (unsigned long)new, extack); if (err) goto err_replace_hw_filter; } if (!tc_in_hw(new->flags)) new->flags |= TCA_CLS_FLAGS_NOT_IN_HW; tcf_proto_update_usesw(tp, new->flags); *arg = head; rcu_assign_pointer(tp->root, new); return 0; err_replace_hw_filter: if (bound_to_filter) tcf_unbind_filter(tp, &new->res); err_set_parms: free_percpu(new->pf); err_alloc_percpu: tcf_exts_destroy(&new->exts); err_exts_init: kfree(new); return err; } static int mall_delete(struct tcf_proto *tp, void *arg, bool *last, bool rtnl_held, struct netlink_ext_ack *extack) { struct cls_mall_head *head = rtnl_dereference(tp->root); head->deleting = true; *last = true; return 0; } static void mall_walk(struct tcf_proto *tp, struct tcf_walker *arg, bool rtnl_held) { struct cls_mall_head *head = rtnl_dereference(tp->root); if (arg->count < arg->skip) goto skip; if (!head || head->deleting) return; if (arg->fn(tp, head, arg) < 0) arg->stop = 1; skip: arg->count++; } static int mall_reoffload(struct tcf_proto *tp, bool add, flow_setup_cb_t *cb, void *cb_priv, struct netlink_ext_ack *extack) { struct cls_mall_head *head = rtnl_dereference(tp->root); struct tc_cls_matchall_offload cls_mall = {}; struct tcf_block *block = tp->chain->block; int err; if (tc_skip_hw(head->flags)) return 0; cls_mall.rule = flow_rule_alloc(tcf_exts_num_actions(&head->exts)); if (!cls_mall.rule) return -ENOMEM; tc_cls_common_offload_init(&cls_mall.common, tp, head->flags, extack); cls_mall.command = add ? TC_CLSMATCHALL_REPLACE : TC_CLSMATCHALL_DESTROY; cls_mall.cookie = (unsigned long)head; err = tc_setup_offload_action(&cls_mall.rule->action, &head->exts, cls_mall.common.extack); if (err) { kfree(cls_mall.rule); return add && tc_skip_sw(head->flags) ? err : 0; } err = tc_setup_cb_reoffload(block, tp, add, cb, TC_SETUP_CLSMATCHALL, &cls_mall, cb_priv, &head->flags, &head->in_hw_count); tc_cleanup_offload_action(&cls_mall.rule->action); kfree(cls_mall.rule); return err; } static void mall_stats_hw_filter(struct tcf_proto *tp, struct cls_mall_head *head, unsigned long cookie) { struct tc_cls_matchall_offload cls_mall = {}; struct tcf_block *block = tp->chain->block; tc_cls_common_offload_init(&cls_mall.common, tp, head->flags, NULL); cls_mall.command = TC_CLSMATCHALL_STATS; cls_mall.cookie = cookie; tc_setup_cb_call(block, TC_SETUP_CLSMATCHALL, &cls_mall, false, true); tcf_exts_hw_stats_update(&head->exts, &cls_mall.stats, cls_mall.use_act_stats); } static int mall_dump(struct net *net, struct tcf_proto *tp, void *fh, struct sk_buff *skb, struct tcmsg *t, bool rtnl_held) { struct tc_matchall_pcnt gpf = {}; struct cls_mall_head *head = fh; struct nlattr *nest; int cpu; if (!head) return skb->len; if (!tc_skip_hw(head->flags)) mall_stats_hw_filter(tp, head, (unsigned long)head); t->tcm_handle = head->handle; nest = nla_nest_start_noflag(skb, TCA_OPTIONS); if (!nest) goto nla_put_failure; if (head->res.classid && nla_put_u32(skb, TCA_MATCHALL_CLASSID, head->res.classid)) goto nla_put_failure; if (head->flags && nla_put_u32(skb, TCA_MATCHALL_FLAGS, head->flags)) goto nla_put_failure; for_each_possible_cpu(cpu) { struct tc_matchall_pcnt *pf = per_cpu_ptr(head->pf, cpu); gpf.rhit += pf->rhit; } if (nla_put_64bit(skb, TCA_MATCHALL_PCNT, sizeof(struct tc_matchall_pcnt), &gpf, TCA_MATCHALL_PAD)) goto nla_put_failure; if (tcf_exts_dump(skb, &head->exts)) goto nla_put_failure; nla_nest_end(skb, nest); if (tcf_exts_dump_stats(skb, &head->exts) < 0) goto nla_put_failure; return skb->len; nla_put_failure: nla_nest_cancel(skb, nest); return -1; } static void mall_bind_class(void *fh, u32 classid, unsigned long cl, void *q, unsigned long base) { struct cls_mall_head *head = fh; tc_cls_bind_class(classid, cl, q, &head->res, base); } static struct tcf_proto_ops cls_mall_ops __read_mostly = { .kind = "matchall", .classify = mall_classify, .init = mall_init, .destroy = mall_destroy, .get = mall_get, .change = mall_change, .delete = mall_delete, .walk = mall_walk, .reoffload = mall_reoffload, .dump = mall_dump, .bind_class = mall_bind_class, .owner = THIS_MODULE, }; MODULE_ALIAS_NET_CLS("matchall"); static int __init cls_mall_init(void) { return register_tcf_proto_ops(&cls_mall_ops); } static void __exit cls_mall_exit(void) { unregister_tcf_proto_ops(&cls_mall_ops); } module_init(cls_mall_init); module_exit(cls_mall_exit); MODULE_AUTHOR("Jiri Pirko <jiri@mellanox.com>"); MODULE_DESCRIPTION("Match-all classifier"); MODULE_LICENSE("GPL v2");
24 16 24 24 6 16 16 8 16 6 16 16 7 16 7 16 15 15 10 15 9 9 9 9 9 8 8 8 8 8 7 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 /* * hw_random/core.c: HWRNG core API * * Copyright 2006 Michael Buesch <m@bues.ch> * Copyright 2005 (c) MontaVista Software, Inc. * * Please read Documentation/admin-guide/hw_random.rst for details on use. * * This software may be used and distributed according to the terms * of the GNU General Public License, incorporated herein by reference. */ #include <linux/delay.h> #include <linux/device.h> #include <linux/err.h> #include <linux/fs.h> #include <linux/hw_random.h> #include <linux/kernel.h> #include <linux/kthread.h> #include <linux/miscdevice.h> #include <linux/module.h> #include <linux/random.h> #include <linux/sched.h> #include <linux/sched/signal.h> #include <linux/slab.h> #include <linux/string.h> #include <linux/uaccess.h> #define RNG_MODULE_NAME "hw_random" #define RNG_BUFFER_SIZE (SMP_CACHE_BYTES < 32 ? 32 : SMP_CACHE_BYTES) static struct hwrng *current_rng; /* the current rng has been explicitly chosen by user via sysfs */ static int cur_rng_set_by_user; static struct task_struct *hwrng_fill; /* list of registered rngs */ static LIST_HEAD(rng_list); /* Protects rng_list and current_rng */ static DEFINE_MUTEX(rng_mutex); /* Protects rng read functions, data_avail, rng_buffer and rng_fillbuf */ static DEFINE_MUTEX(reading_mutex); static int data_avail; static u8 *rng_buffer, *rng_fillbuf; static unsigned short current_quality; static unsigned short default_quality = 1024; /* default to maximum */ module_param(current_quality, ushort, 0644); MODULE_PARM_DESC(current_quality, "current hwrng entropy estimation per 1024 bits of input -- obsolete, use rng_quality instead"); module_param(default_quality, ushort, 0644); MODULE_PARM_DESC(default_quality, "default maximum entropy content of hwrng per 1024 bits of input"); static void drop_current_rng(void); static int hwrng_init(struct hwrng *rng); static int hwrng_fillfn(void *unused); static inline int rng_get_data(struct hwrng *rng, u8 *buffer, size_t size, int wait); static size_t rng_buffer_size(void) { return RNG_BUFFER_SIZE; } static inline void cleanup_rng(struct kref *kref) { struct hwrng *rng = container_of(kref, struct hwrng, ref); if (rng->cleanup) rng->cleanup(rng); complete(&rng->cleanup_done); } static int set_current_rng(struct hwrng *rng) { int err; BUG_ON(!mutex_is_locked(&rng_mutex)); err = hwrng_init(rng); if (err) return err; drop_current_rng(); current_rng = rng; /* if necessary, start hwrng thread */ if (!hwrng_fill) { hwrng_fill = kthread_run(hwrng_fillfn, NULL, "hwrng"); if (IS_ERR(hwrng_fill)) { pr_err("hwrng_fill thread creation failed\n"); hwrng_fill = NULL; } } return 0; } static void drop_current_rng(void) { BUG_ON(!mutex_is_locked(&rng_mutex)); if (!current_rng) return; /* decrease last reference for triggering the cleanup */ kref_put(&current_rng->ref, cleanup_rng); current_rng = NULL; } /* Returns ERR_PTR(), NULL or refcounted hwrng */ static struct hwrng *get_current_rng_nolock(void) { if (current_rng) kref_get(&current_rng->ref); return current_rng; } static struct hwrng *get_current_rng(void) { struct hwrng *rng; if (mutex_lock_interruptible(&rng_mutex)) return ERR_PTR(-ERESTARTSYS); rng = get_current_rng_nolock(); mutex_unlock(&rng_mutex); return rng; } static void put_rng(struct hwrng *rng) { /* * Hold rng_mutex here so we serialize in case they set_current_rng * on rng again immediately. */ mutex_lock(&rng_mutex); if (rng) kref_put(&rng->ref, cleanup_rng); mutex_unlock(&rng_mutex); } static int hwrng_init(struct hwrng *rng) { if (kref_get_unless_zero(&rng->ref)) goto skip_init; if (rng->init) { int ret; ret = rng->init(rng); if (ret) return ret; } kref_init(&rng->ref); reinit_completion(&rng->cleanup_done); skip_init: current_quality = rng->quality; /* obsolete */ return 0; } static int rng_dev_open(struct inode *inode, struct file *filp) { /* enforce read-only access to this chrdev */ if ((filp->f_mode & FMODE_READ) == 0) return -EINVAL; if (filp->f_mode & FMODE_WRITE) return -EINVAL; return 0; } static inline int rng_get_data(struct hwrng *rng, u8 *buffer, size_t size, int wait) { int present; BUG_ON(!mutex_is_locked(&reading_mutex)); if (rng->read) { int err; err = rng->read(rng, buffer, size, wait); if (WARN_ON_ONCE(err > 0 && err > size)) err = size; return err; } if (rng->data_present) present = rng->data_present(rng, wait); else present = 1; if (present) return rng->data_read(rng, (u32 *)buffer); return 0; } static ssize_t rng_dev_read(struct file *filp, char __user *buf, size_t size, loff_t *offp) { u8 buffer[RNG_BUFFER_SIZE]; ssize_t ret = 0; int err = 0; int bytes_read, len; struct hwrng *rng; while (size) { rng = get_current_rng(); if (IS_ERR(rng)) { err = PTR_ERR(rng); goto out; } if (!rng) { err = -ENODEV; goto out; } if (mutex_lock_interruptible(&reading_mutex)) { err = -ERESTARTSYS; goto out_put; } if (!data_avail) { bytes_read = rng_get_data(rng, rng_buffer, rng_buffer_size(), !(filp->f_flags & O_NONBLOCK)); if (bytes_read < 0) { err = bytes_read; goto out_unlock_reading; } else if (bytes_read == 0 && (filp->f_flags & O_NONBLOCK)) { err = -EAGAIN; goto out_unlock_reading; } data_avail = bytes_read; } len = data_avail; if (len) { if (len > size) len = size; data_avail -= len; memcpy(buffer, rng_buffer + data_avail, len); } mutex_unlock(&reading_mutex); put_rng(rng); if (len) { if (copy_to_user(buf + ret, buffer, len)) { err = -EFAULT; goto out; } size -= len; ret += len; } if (need_resched()) schedule_timeout_interruptible(1); if (signal_pending(current)) { err = -ERESTARTSYS; goto out; } } out: memzero_explicit(buffer, sizeof(buffer)); return ret ? : err; out_unlock_reading: mutex_unlock(&reading_mutex); out_put: put_rng(rng); goto out; } static const struct file_operations rng_chrdev_ops = { .owner = THIS_MODULE, .open = rng_dev_open, .read = rng_dev_read, .llseek = noop_llseek, }; static const struct attribute_group *rng_dev_groups[]; static struct miscdevice rng_miscdev = { .minor = HWRNG_MINOR, .name = RNG_MODULE_NAME, .nodename = "hwrng", .fops = &rng_chrdev_ops, .groups = rng_dev_groups, }; static int enable_best_rng(void) { struct hwrng *rng, *new_rng = NULL; int ret = -ENODEV; BUG_ON(!mutex_is_locked(&rng_mutex)); /* no rng to use? */ if (list_empty(&rng_list)) { drop_current_rng(); cur_rng_set_by_user = 0; return 0; } /* use the rng which offers the best quality */ list_for_each_entry(rng, &rng_list, list) { if (!new_rng || rng->quality > new_rng->quality) new_rng = rng; } ret = ((new_rng == current_rng) ? 0 : set_current_rng(new_rng)); if (!ret) cur_rng_set_by_user = 0; return ret; } static ssize_t rng_current_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t len) { int err; struct hwrng *rng, *new_rng; err = mutex_lock_interruptible(&rng_mutex); if (err) return -ERESTARTSYS; if (sysfs_streq(buf, "")) { err = enable_best_rng(); } else { list_for_each_entry(rng, &rng_list, list) { if (sysfs_streq(rng->name, buf)) { err = set_current_rng(rng); if (!err) cur_rng_set_by_user = 1; break; } } } new_rng = get_current_rng_nolock(); mutex_unlock(&rng_mutex); if (new_rng) put_rng(new_rng); return err ? : len; } static ssize_t rng_current_show(struct device *dev, struct device_attribute *attr, char *buf) { ssize_t ret; struct hwrng *rng; rng = get_current_rng(); if (IS_ERR(rng)) return PTR_ERR(rng); ret = sysfs_emit(buf, "%s\n", rng ? rng->name : "none"); put_rng(rng); return ret; } static ssize_t rng_available_show(struct device *dev, struct device_attribute *attr, char *buf) { int err; struct hwrng *rng; err = mutex_lock_interruptible(&rng_mutex); if (err) return -ERESTARTSYS; buf[0] = '\0'; list_for_each_entry(rng, &rng_list, list) { strlcat(buf, rng->name, PAGE_SIZE); strlcat(buf, " ", PAGE_SIZE); } strlcat(buf, "\n", PAGE_SIZE); mutex_unlock(&rng_mutex); return strlen(buf); } static ssize_t rng_selected_show(struct device *dev, struct device_attribute *attr, char *buf) { return sysfs_emit(buf, "%d\n", cur_rng_set_by_user); } static ssize_t rng_quality_show(struct device *dev, struct device_attribute *attr, char *buf) { ssize_t ret; struct hwrng *rng; rng = get_current_rng(); if (IS_ERR(rng)) return PTR_ERR(rng); if (!rng) /* no need to put_rng */ return -ENODEV; ret = sysfs_emit(buf, "%hu\n", rng->quality); put_rng(rng); return ret; } static ssize_t rng_quality_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t len) { u16 quality; int ret = -EINVAL; if (len < 2) return -EINVAL; ret = mutex_lock_interruptible(&rng_mutex); if (ret) return -ERESTARTSYS; ret = kstrtou16(buf, 0, &quality); if (ret || quality > 1024) { ret = -EINVAL; goto out; } if (!current_rng) { ret = -ENODEV; goto out; } current_rng->quality = quality; current_quality = quality; /* obsolete */ /* the best available RNG may have changed */ ret = enable_best_rng(); out: mutex_unlock(&rng_mutex); return ret ? ret : len; } static DEVICE_ATTR_RW(rng_current); static DEVICE_ATTR_RO(rng_available); static DEVICE_ATTR_RO(rng_selected); static DEVICE_ATTR_RW(rng_quality); static struct attribute *rng_dev_attrs[] = { &dev_attr_rng_current.attr, &dev_attr_rng_available.attr, &dev_attr_rng_selected.attr, &dev_attr_rng_quality.attr, NULL }; ATTRIBUTE_GROUPS(rng_dev); static int hwrng_fillfn(void *unused) { size_t entropy, entropy_credit = 0; /* in 1/1024 of a bit */ long rc; while (!kthread_should_stop()) { unsigned short quality; struct hwrng *rng; rng = get_current_rng(); if (IS_ERR(rng) || !rng) break; mutex_lock(&reading_mutex); rc = rng_get_data(rng, rng_fillbuf, rng_buffer_size(), 1); if (current_quality != rng->quality) rng->quality = current_quality; /* obsolete */ quality = rng->quality; mutex_unlock(&reading_mutex); if (rc <= 0) hwrng_msleep(rng, 10000); put_rng(rng); if (rc <= 0) continue; /* If we cannot credit at least one bit of entropy, * keep track of the remainder for the next iteration */ entropy = rc * quality * 8 + entropy_credit; if ((entropy >> 10) == 0) entropy_credit = entropy; /* Outside lock, sure, but y'know: randomness. */ add_hwgenerator_randomness((void *)rng_fillbuf, rc, entropy >> 10, true); } hwrng_fill = NULL; return 0; } int hwrng_register(struct hwrng *rng) { int err = -EINVAL; struct hwrng *tmp; if (!rng->name || (!rng->data_read && !rng->read)) goto out; mutex_lock(&rng_mutex); /* Must not register two RNGs with the same name. */ err = -EEXIST; list_for_each_entry(tmp, &rng_list, list) { if (strcmp(tmp->name, rng->name) == 0) goto out_unlock; } list_add_tail(&rng->list, &rng_list); init_completion(&rng->cleanup_done); complete(&rng->cleanup_done); init_completion(&rng->dying); /* Adjust quality field to always have a proper value */ rng->quality = min_t(u16, min_t(u16, default_quality, 1024), rng->quality ?: 1024); if (!current_rng || (!cur_rng_set_by_user && rng->quality > current_rng->quality)) { /* * Set new rng as current as the new rng source * provides better entropy quality and was not * chosen by userspace. */ err = set_current_rng(rng); if (err) goto out_unlock; } mutex_unlock(&rng_mutex); return 0; out_unlock: mutex_unlock(&rng_mutex); out: return err; } EXPORT_SYMBOL_GPL(hwrng_register); void hwrng_unregister(struct hwrng *rng) { struct hwrng *new_rng; int err; mutex_lock(&rng_mutex); list_del(&rng->list); complete_all(&rng->dying); if (current_rng == rng) { err = enable_best_rng(); if (err) { drop_current_rng(); cur_rng_set_by_user = 0; } } new_rng = get_current_rng_nolock(); if (list_empty(&rng_list)) { mutex_unlock(&rng_mutex); if (hwrng_fill) kthread_stop(hwrng_fill); } else mutex_unlock(&rng_mutex); if (new_rng) put_rng(new_rng); wait_for_completion(&rng->cleanup_done); } EXPORT_SYMBOL_GPL(hwrng_unregister); static void devm_hwrng_release(struct device *dev, void *res) { hwrng_unregister(*(struct hwrng **)res); } static int devm_hwrng_match(struct device *dev, void *res, void *data) { struct hwrng **r = res; if (WARN_ON(!r || !*r)) return 0; return *r == data; } int devm_hwrng_register(struct device *dev, struct hwrng *rng) { struct hwrng **ptr; int error; ptr = devres_alloc(devm_hwrng_release, sizeof(*ptr), GFP_KERNEL); if (!ptr) return -ENOMEM; error = hwrng_register(rng); if (error) { devres_free(ptr); return error; } *ptr = rng; devres_add(dev, ptr); return 0; } EXPORT_SYMBOL_GPL(devm_hwrng_register); void devm_hwrng_unregister(struct device *dev, struct hwrng *rng) { devres_release(dev, devm_hwrng_release, devm_hwrng_match, rng); } EXPORT_SYMBOL_GPL(devm_hwrng_unregister); long hwrng_msleep(struct hwrng *rng, unsigned int msecs) { unsigned long timeout = msecs_to_jiffies(msecs) + 1; return wait_for_completion_interruptible_timeout(&rng->dying, timeout); } EXPORT_SYMBOL_GPL(hwrng_msleep); long hwrng_yield(struct hwrng *rng) { return wait_for_completion_interruptible_timeout(&rng->dying, 1); } EXPORT_SYMBOL_GPL(hwrng_yield); static int __init hwrng_modinit(void) { int ret; /* kmalloc makes this safe for virt_to_page() in virtio_rng.c */ rng_buffer = kmalloc(rng_buffer_size(), GFP_KERNEL); if (!rng_buffer) return -ENOMEM; rng_fillbuf = kmalloc(rng_buffer_size(), GFP_KERNEL); if (!rng_fillbuf) { kfree(rng_buffer); return -ENOMEM; } ret = misc_register(&rng_miscdev); if (ret) { kfree(rng_fillbuf); kfree(rng_buffer); } return ret; } static void __exit hwrng_modexit(void) { mutex_lock(&rng_mutex); BUG_ON(current_rng); kfree(rng_buffer); kfree(rng_fillbuf); mutex_unlock(&rng_mutex); misc_deregister(&rng_miscdev); } fs_initcall(hwrng_modinit); /* depends on misc_register() */ module_exit(hwrng_modexit); MODULE_DESCRIPTION("H/W Random Number Generator (RNG) driver"); MODULE_LICENSE("GPL");
62 62 62 62 62 67 1 2 2 62 5 5 5 2 1 1 1 7 2 5 5 3 2 70 17 53 53 7 7 1 650 15 5 630 630 27 170 539 528 11 307 237 117 117 321 321 190 37 69 123 264 47 70 17 83 49 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 #include <linux/bpf.h> #include <linux/btf.h> #include <linux/err.h> #include <linux/irq_work.h> #include <linux/slab.h> #include <linux/filter.h> #include <linux/mm.h> #include <linux/vmalloc.h> #include <linux/wait.h> #include <linux/poll.h> #include <linux/kmemleak.h> #include <uapi/linux/btf.h> #include <linux/btf_ids.h> #include <asm/rqspinlock.h> #define RINGBUF_CREATE_FLAG_MASK (BPF_F_NUMA_NODE) /* non-mmap()'able part of bpf_ringbuf (everything up to consumer page) */ #define RINGBUF_PGOFF \ (offsetof(struct bpf_ringbuf, consumer_pos) >> PAGE_SHIFT) /* consumer page and producer page */ #define RINGBUF_POS_PAGES 2 #define RINGBUF_NR_META_PAGES (RINGBUF_PGOFF + RINGBUF_POS_PAGES) #define RINGBUF_MAX_RECORD_SZ (UINT_MAX/4) struct bpf_ringbuf { wait_queue_head_t waitq; struct irq_work work; u64 mask; struct page **pages; int nr_pages; rqspinlock_t spinlock ____cacheline_aligned_in_smp; /* For user-space producer ring buffers, an atomic_t busy bit is used * to synchronize access to the ring buffers in the kernel, rather than * the spinlock that is used for kernel-producer ring buffers. This is * done because the ring buffer must hold a lock across a BPF program's * callback: * * __bpf_user_ringbuf_peek() // lock acquired * -> program callback_fn() * -> __bpf_user_ringbuf_sample_release() // lock released * * It is unsafe and incorrect to hold an IRQ spinlock across what could * be a long execution window, so we instead simply disallow concurrent * access to the ring buffer by kernel consumers, and return -EBUSY from * __bpf_user_ringbuf_peek() if the busy bit is held by another task. */ atomic_t busy ____cacheline_aligned_in_smp; /* Consumer and producer counters are put into separate pages to * allow each position to be mapped with different permissions. * This prevents a user-space application from modifying the * position and ruining in-kernel tracking. The permissions of the * pages depend on who is producing samples: user-space or the * kernel. Note that the pending counter is placed in the same * page as the producer, so that it shares the same cache line. * * Kernel-producer * --------------- * The producer position and data pages are mapped as r/o in * userspace. For this approach, bits in the header of samples are * used to signal to user-space, and to other producers, whether a * sample is currently being written. * * User-space producer * ------------------- * Only the page containing the consumer position is mapped r/o in * user-space. User-space producers also use bits of the header to * communicate to the kernel, but the kernel must carefully check and * validate each sample to ensure that they're correctly formatted, and * fully contained within the ring buffer. */ unsigned long consumer_pos __aligned(PAGE_SIZE); unsigned long producer_pos __aligned(PAGE_SIZE); unsigned long pending_pos; char data[] __aligned(PAGE_SIZE); }; struct bpf_ringbuf_map { struct bpf_map map; struct bpf_ringbuf *rb; }; /* 8-byte ring buffer record header structure */ struct bpf_ringbuf_hdr { u32 len; u32 pg_off; }; static struct bpf_ringbuf *bpf_ringbuf_area_alloc(size_t data_sz, int numa_node) { const gfp_t flags = GFP_KERNEL_ACCOUNT | __GFP_RETRY_MAYFAIL | __GFP_NOWARN | __GFP_ZERO; int nr_meta_pages = RINGBUF_NR_META_PAGES; int nr_data_pages = data_sz >> PAGE_SHIFT; int nr_pages = nr_meta_pages + nr_data_pages; struct page **pages, *page; struct bpf_ringbuf *rb; size_t array_size; int i; /* Each data page is mapped twice to allow "virtual" * continuous read of samples wrapping around the end of ring * buffer area: * ------------------------------------------------------ * | meta pages | real data pages | same data pages | * ------------------------------------------------------ * | | 1 2 3 4 5 6 7 8 9 | 1 2 3 4 5 6 7 8 9 | * ------------------------------------------------------ * | | TA DA | TA DA | * ------------------------------------------------------ * ^^^^^^^ * | * Here, no need to worry about special handling of wrapped-around * data due to double-mapped data pages. This works both in kernel and * when mmap()'ed in user-space, simplifying both kernel and * user-space implementations significantly. */ array_size = (nr_meta_pages + 2 * nr_data_pages) * sizeof(*pages); pages = bpf_map_area_alloc(array_size, numa_node); if (!pages) return NULL; for (i = 0; i < nr_pages; i++) { page = alloc_pages_node(numa_node, flags, 0); if (!page) { nr_pages = i; goto err_free_pages; } pages[i] = page; if (i >= nr_meta_pages) pages[nr_data_pages + i] = page; } rb = vmap(pages, nr_meta_pages + 2 * nr_data_pages, VM_MAP | VM_USERMAP, PAGE_KERNEL); if (rb) { kmemleak_not_leak(pages); rb->pages = pages; rb->nr_pages = nr_pages; return rb; } err_free_pages: for (i = 0; i < nr_pages; i++) __free_page(pages[i]); bpf_map_area_free(pages); return NULL; } static void bpf_ringbuf_notify(struct irq_work *work) { struct bpf_ringbuf *rb = container_of(work, struct bpf_ringbuf, work); wake_up_all(&rb->waitq); } /* Maximum size of ring buffer area is limited by 32-bit page offset within * record header, counted in pages. Reserve 8 bits for extensibility, and * take into account few extra pages for consumer/producer pages and * non-mmap()'able parts, the current maximum size would be: * * (((1ULL << 24) - RINGBUF_POS_PAGES - RINGBUF_PGOFF) * PAGE_SIZE) * * This gives 64GB limit, which seems plenty for single ring buffer. Now * considering that the maximum value of data_sz is (4GB - 1), there * will be no overflow, so just note the size limit in the comments. */ static struct bpf_ringbuf *bpf_ringbuf_alloc(size_t data_sz, int numa_node) { struct bpf_ringbuf *rb; rb = bpf_ringbuf_area_alloc(data_sz, numa_node); if (!rb) return NULL; raw_res_spin_lock_init(&rb->spinlock); atomic_set(&rb->busy, 0); init_waitqueue_head(&rb->waitq); init_irq_work(&rb->work, bpf_ringbuf_notify); rb->mask = data_sz - 1; rb->consumer_pos = 0; rb->producer_pos = 0; rb->pending_pos = 0; return rb; } static struct bpf_map *ringbuf_map_alloc(union bpf_attr *attr) { struct bpf_ringbuf_map *rb_map; if (attr->map_flags & ~RINGBUF_CREATE_FLAG_MASK) return ERR_PTR(-EINVAL); if (attr->key_size || attr->value_size || !is_power_of_2(attr->max_entries) || !PAGE_ALIGNED(attr->max_entries)) return ERR_PTR(-EINVAL); rb_map = bpf_map_area_alloc(sizeof(*rb_map), NUMA_NO_NODE); if (!rb_map) return ERR_PTR(-ENOMEM); bpf_map_init_from_attr(&rb_map->map, attr); rb_map->rb = bpf_ringbuf_alloc(attr->max_entries, rb_map->map.numa_node); if (!rb_map->rb) { bpf_map_area_free(rb_map); return ERR_PTR(-ENOMEM); } return &rb_map->map; } static void bpf_ringbuf_free(struct bpf_ringbuf *rb) { /* copy pages pointer and nr_pages to local variable, as we are going * to unmap rb itself with vunmap() below */ struct page **pages = rb->pages; int i, nr_pages = rb->nr_pages; vunmap(rb); for (i = 0; i < nr_pages; i++) __free_page(pages[i]); bpf_map_area_free(pages); } static void ringbuf_map_free(struct bpf_map *map) { struct bpf_ringbuf_map *rb_map; rb_map = container_of(map, struct bpf_ringbuf_map, map); bpf_ringbuf_free(rb_map->rb); bpf_map_area_free(rb_map); } static void *ringbuf_map_lookup_elem(struct bpf_map *map, void *key) { return ERR_PTR(-ENOTSUPP); } static long ringbuf_map_update_elem(struct bpf_map *map, void *key, void *value, u64 flags) { return -ENOTSUPP; } static long ringbuf_map_delete_elem(struct bpf_map *map, void *key) { return -ENOTSUPP; } static int ringbuf_map_get_next_key(struct bpf_map *map, void *key, void *next_key) { return -ENOTSUPP; } static int ringbuf_map_mmap_kern(struct bpf_map *map, struct vm_area_struct *vma) { struct bpf_ringbuf_map *rb_map; rb_map = container_of(map, struct bpf_ringbuf_map, map); if (vma->vm_flags & VM_WRITE) { /* allow writable mapping for the consumer_pos only */ if (vma->vm_pgoff != 0 || vma->vm_end - vma->vm_start != PAGE_SIZE) return -EPERM; } /* remap_vmalloc_range() checks size and offset constraints */ return remap_vmalloc_range(vma, rb_map->rb, vma->vm_pgoff + RINGBUF_PGOFF); } static int ringbuf_map_mmap_user(struct bpf_map *map, struct vm_area_struct *vma) { struct bpf_ringbuf_map *rb_map; rb_map = container_of(map, struct bpf_ringbuf_map, map); if (vma->vm_flags & VM_WRITE) { if (vma->vm_pgoff == 0) /* Disallow writable mappings to the consumer pointer, * and allow writable mappings to both the producer * position, and the ring buffer data itself. */ return -EPERM; } /* remap_vmalloc_range() checks size and offset constraints */ return remap_vmalloc_range(vma, rb_map->rb, vma->vm_pgoff + RINGBUF_PGOFF); } static unsigned long ringbuf_avail_data_sz(struct bpf_ringbuf *rb) { unsigned long cons_pos, prod_pos; cons_pos = smp_load_acquire(&rb->consumer_pos); prod_pos = smp_load_acquire(&rb->producer_pos); return prod_pos - cons_pos; } static u32 ringbuf_total_data_sz(const struct bpf_ringbuf *rb) { return rb->mask + 1; } static __poll_t ringbuf_map_poll_kern(struct bpf_map *map, struct file *filp, struct poll_table_struct *pts) { struct bpf_ringbuf_map *rb_map; rb_map = container_of(map, struct bpf_ringbuf_map, map); poll_wait(filp, &rb_map->rb->waitq, pts); if (ringbuf_avail_data_sz(rb_map->rb)) return EPOLLIN | EPOLLRDNORM; return 0; } static __poll_t ringbuf_map_poll_user(struct bpf_map *map, struct file *filp, struct poll_table_struct *pts) { struct bpf_ringbuf_map *rb_map; rb_map = container_of(map, struct bpf_ringbuf_map, map); poll_wait(filp, &rb_map->rb->waitq, pts); if (ringbuf_avail_data_sz(rb_map->rb) < ringbuf_total_data_sz(rb_map->rb)) return EPOLLOUT | EPOLLWRNORM; return 0; } static u64 ringbuf_map_mem_usage(const struct bpf_map *map) { struct bpf_ringbuf *rb; int nr_data_pages; int nr_meta_pages; u64 usage = sizeof(struct bpf_ringbuf_map); rb = container_of(map, struct bpf_ringbuf_map, map)->rb; usage += (u64)rb->nr_pages << PAGE_SHIFT; nr_meta_pages = RINGBUF_NR_META_PAGES; nr_data_pages = map->max_entries >> PAGE_SHIFT; usage += (nr_meta_pages + 2 * nr_data_pages) * sizeof(struct page *); return usage; } BTF_ID_LIST_SINGLE(ringbuf_map_btf_ids, struct, bpf_ringbuf_map) const struct bpf_map_ops ringbuf_map_ops = { .map_meta_equal = bpf_map_meta_equal, .map_alloc = ringbuf_map_alloc, .map_free = ringbuf_map_free, .map_mmap = ringbuf_map_mmap_kern, .map_poll = ringbuf_map_poll_kern, .map_lookup_elem = ringbuf_map_lookup_elem, .map_update_elem = ringbuf_map_update_elem, .map_delete_elem = ringbuf_map_delete_elem, .map_get_next_key = ringbuf_map_get_next_key, .map_mem_usage = ringbuf_map_mem_usage, .map_btf_id = &ringbuf_map_btf_ids[0], }; BTF_ID_LIST_SINGLE(user_ringbuf_map_btf_ids, struct, bpf_ringbuf_map) const struct bpf_map_ops user_ringbuf_map_ops = { .map_meta_equal = bpf_map_meta_equal, .map_alloc = ringbuf_map_alloc, .map_free = ringbuf_map_free, .map_mmap = ringbuf_map_mmap_user, .map_poll = ringbuf_map_poll_user, .map_lookup_elem = ringbuf_map_lookup_elem, .map_update_elem = ringbuf_map_update_elem, .map_delete_elem = ringbuf_map_delete_elem, .map_get_next_key = ringbuf_map_get_next_key, .map_mem_usage = ringbuf_map_mem_usage, .map_btf_id = &user_ringbuf_map_btf_ids[0], }; /* Given pointer to ring buffer record metadata and struct bpf_ringbuf itself, * calculate offset from record metadata to ring buffer in pages, rounded * down. This page offset is stored as part of record metadata and allows to * restore struct bpf_ringbuf * from record pointer. This page offset is * stored at offset 4 of record metadata header. */ static size_t bpf_ringbuf_rec_pg_off(struct bpf_ringbuf *rb, struct bpf_ringbuf_hdr *hdr) { return ((void *)hdr - (void *)rb) >> PAGE_SHIFT; } /* Given pointer to ring buffer record header, restore pointer to struct * bpf_ringbuf itself by using page offset stored at offset 4 */ static struct bpf_ringbuf * bpf_ringbuf_restore_from_rec(struct bpf_ringbuf_hdr *hdr) { unsigned long addr = (unsigned long)(void *)hdr; unsigned long off = (unsigned long)hdr->pg_off << PAGE_SHIFT; return (void*)((addr & PAGE_MASK) - off); } static void *__bpf_ringbuf_reserve(struct bpf_ringbuf *rb, u64 size) { unsigned long cons_pos, prod_pos, new_prod_pos, pend_pos, flags; struct bpf_ringbuf_hdr *hdr; u32 len, pg_off, tmp_size, hdr_len; if (unlikely(size > RINGBUF_MAX_RECORD_SZ)) return NULL; len = round_up(size + BPF_RINGBUF_HDR_SZ, 8); if (len > ringbuf_total_data_sz(rb)) return NULL; cons_pos = smp_load_acquire(&rb->consumer_pos); if (raw_res_spin_lock_irqsave(&rb->spinlock, flags)) return NULL; pend_pos = rb->pending_pos; prod_pos = rb->producer_pos; new_prod_pos = prod_pos + len; while (pend_pos < prod_pos) { hdr = (void *)rb->data + (pend_pos & rb->mask); hdr_len = READ_ONCE(hdr->len); if (hdr_len & BPF_RINGBUF_BUSY_BIT) break; tmp_size = hdr_len & ~BPF_RINGBUF_DISCARD_BIT; tmp_size = round_up(tmp_size + BPF_RINGBUF_HDR_SZ, 8); pend_pos += tmp_size; } rb->pending_pos = pend_pos; /* check for out of ringbuf space: * - by ensuring producer position doesn't advance more than * (ringbuf_size - 1) ahead * - by ensuring oldest not yet committed record until newest * record does not span more than (ringbuf_size - 1) */ if (new_prod_pos - cons_pos > rb->mask || new_prod_pos - pend_pos > rb->mask) { raw_res_spin_unlock_irqrestore(&rb->spinlock, flags); return NULL; } hdr = (void *)rb->data + (prod_pos & rb->mask); pg_off = bpf_ringbuf_rec_pg_off(rb, hdr); hdr->len = size | BPF_RINGBUF_BUSY_BIT; hdr->pg_off = pg_off; /* pairs with consumer's smp_load_acquire() */ smp_store_release(&rb->producer_pos, new_prod_pos); raw_res_spin_unlock_irqrestore(&rb->spinlock, flags); return (void *)hdr + BPF_RINGBUF_HDR_SZ; } BPF_CALL_3(bpf_ringbuf_reserve, struct bpf_map *, map, u64, size, u64, flags) { struct bpf_ringbuf_map *rb_map; if (unlikely(flags)) return 0; rb_map = container_of(map, struct bpf_ringbuf_map, map); return (unsigned long)__bpf_ringbuf_reserve(rb_map->rb, size); } const struct bpf_func_proto bpf_ringbuf_reserve_proto = { .func = bpf_ringbuf_reserve, .ret_type = RET_PTR_TO_RINGBUF_MEM_OR_NULL, .arg1_type = ARG_CONST_MAP_PTR, .arg2_type = ARG_CONST_ALLOC_SIZE_OR_ZERO, .arg3_type = ARG_ANYTHING, }; static void bpf_ringbuf_commit(void *sample, u64 flags, bool discard) { unsigned long rec_pos, cons_pos; struct bpf_ringbuf_hdr *hdr; struct bpf_ringbuf *rb; u32 new_len; hdr = sample - BPF_RINGBUF_HDR_SZ; rb = bpf_ringbuf_restore_from_rec(hdr); new_len = hdr->len ^ BPF_RINGBUF_BUSY_BIT; if (discard) new_len |= BPF_RINGBUF_DISCARD_BIT; /* update record header with correct final size prefix */ xchg(&hdr->len, new_len); /* if consumer caught up and is waiting for our record, notify about * new data availability */ rec_pos = (void *)hdr - (void *)rb->data; cons_pos = smp_load_acquire(&rb->consumer_pos) & rb->mask; if (flags & BPF_RB_FORCE_WAKEUP) irq_work_queue(&rb->work); else if (cons_pos == rec_pos && !(flags & BPF_RB_NO_WAKEUP)) irq_work_queue(&rb->work); } BPF_CALL_2(bpf_ringbuf_submit, void *, sample, u64, flags) { bpf_ringbuf_commit(sample, flags, false /* discard */); return 0; } const struct bpf_func_proto bpf_ringbuf_submit_proto = { .func = bpf_ringbuf_submit, .ret_type = RET_VOID, .arg1_type = ARG_PTR_TO_RINGBUF_MEM | OBJ_RELEASE, .arg2_type = ARG_ANYTHING, }; BPF_CALL_2(bpf_ringbuf_discard, void *, sample, u64, flags) { bpf_ringbuf_commit(sample, flags, true /* discard */); return 0; } const struct bpf_func_proto bpf_ringbuf_discard_proto = { .func = bpf_ringbuf_discard, .ret_type = RET_VOID, .arg1_type = ARG_PTR_TO_RINGBUF_MEM | OBJ_RELEASE, .arg2_type = ARG_ANYTHING, }; BPF_CALL_4(bpf_ringbuf_output, struct bpf_map *, map, void *, data, u64, size, u64, flags) { struct bpf_ringbuf_map *rb_map; void *rec; if (unlikely(flags & ~(BPF_RB_NO_WAKEUP | BPF_RB_FORCE_WAKEUP))) return -EINVAL; rb_map = container_of(map, struct bpf_ringbuf_map, map); rec = __bpf_ringbuf_reserve(rb_map->rb, size); if (!rec) return -EAGAIN; memcpy(rec, data, size); bpf_ringbuf_commit(rec, flags, false /* discard */); return 0; } const struct bpf_func_proto bpf_ringbuf_output_proto = { .func = bpf_ringbuf_output, .ret_type = RET_INTEGER, .arg1_type = ARG_CONST_MAP_PTR, .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE_OR_ZERO, .arg4_type = ARG_ANYTHING, }; BPF_CALL_2(bpf_ringbuf_query, struct bpf_map *, map, u64, flags) { struct bpf_ringbuf *rb; rb = container_of(map, struct bpf_ringbuf_map, map)->rb; switch (flags) { case BPF_RB_AVAIL_DATA: return ringbuf_avail_data_sz(rb); case BPF_RB_RING_SIZE: return ringbuf_total_data_sz(rb); case BPF_RB_CONS_POS: return smp_load_acquire(&rb->consumer_pos); case BPF_RB_PROD_POS: return smp_load_acquire(&rb->producer_pos); default: return 0; } } const struct bpf_func_proto bpf_ringbuf_query_proto = { .func = bpf_ringbuf_query, .ret_type = RET_INTEGER, .arg1_type = ARG_CONST_MAP_PTR, .arg2_type = ARG_ANYTHING, }; BPF_CALL_4(bpf_ringbuf_reserve_dynptr, struct bpf_map *, map, u32, size, u64, flags, struct bpf_dynptr_kern *, ptr) { struct bpf_ringbuf_map *rb_map; void *sample; int err; if (unlikely(flags)) { bpf_dynptr_set_null(ptr); return -EINVAL; } err = bpf_dynptr_check_size(size); if (err) { bpf_dynptr_set_null(ptr); return err; } rb_map = container_of(map, struct bpf_ringbuf_map, map); sample = __bpf_ringbuf_reserve(rb_map->rb, size); if (!sample) { bpf_dynptr_set_null(ptr); return -EINVAL; } bpf_dynptr_init(ptr, sample, BPF_DYNPTR_TYPE_RINGBUF, 0, size); return 0; } const struct bpf_func_proto bpf_ringbuf_reserve_dynptr_proto = { .func = bpf_ringbuf_reserve_dynptr, .ret_type = RET_INTEGER, .arg1_type = ARG_CONST_MAP_PTR, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_ANYTHING, .arg4_type = ARG_PTR_TO_DYNPTR | DYNPTR_TYPE_RINGBUF | MEM_UNINIT | MEM_WRITE, }; BPF_CALL_2(bpf_ringbuf_submit_dynptr, struct bpf_dynptr_kern *, ptr, u64, flags) { if (!ptr->data) return 0; bpf_ringbuf_commit(ptr->data, flags, false /* discard */); bpf_dynptr_set_null(ptr); return 0; } const struct bpf_func_proto bpf_ringbuf_submit_dynptr_proto = { .func = bpf_ringbuf_submit_dynptr, .ret_type = RET_VOID, .arg1_type = ARG_PTR_TO_DYNPTR | DYNPTR_TYPE_RINGBUF | OBJ_RELEASE, .arg2_type = ARG_ANYTHING, }; BPF_CALL_2(bpf_ringbuf_discard_dynptr, struct bpf_dynptr_kern *, ptr, u64, flags) { if (!ptr->data) return 0; bpf_ringbuf_commit(ptr->data, flags, true /* discard */); bpf_dynptr_set_null(ptr); return 0; } const struct bpf_func_proto bpf_ringbuf_discard_dynptr_proto = { .func = bpf_ringbuf_discard_dynptr, .ret_type = RET_VOID, .arg1_type = ARG_PTR_TO_DYNPTR | DYNPTR_TYPE_RINGBUF | OBJ_RELEASE, .arg2_type = ARG_ANYTHING, }; static int __bpf_user_ringbuf_peek(struct bpf_ringbuf *rb, void **sample, u32 *size) { int err; u32 hdr_len, sample_len, total_len, flags, *hdr; u64 cons_pos, prod_pos; /* Synchronizes with smp_store_release() in user-space producer. */ prod_pos = smp_load_acquire(&rb->producer_pos); if (prod_pos % 8) return -EINVAL; /* Synchronizes with smp_store_release() in __bpf_user_ringbuf_sample_release() */ cons_pos = smp_load_acquire(&rb->consumer_pos); if (cons_pos >= prod_pos) return -ENODATA; hdr = (u32 *)((uintptr_t)rb->data + (uintptr_t)(cons_pos & rb->mask)); /* Synchronizes with smp_store_release() in user-space producer. */ hdr_len = smp_load_acquire(hdr); flags = hdr_len & (BPF_RINGBUF_BUSY_BIT | BPF_RINGBUF_DISCARD_BIT); sample_len = hdr_len & ~flags; total_len = round_up(sample_len + BPF_RINGBUF_HDR_SZ, 8); /* The sample must fit within the region advertised by the producer position. */ if (total_len > prod_pos - cons_pos) return -EINVAL; /* The sample must fit within the data region of the ring buffer. */ if (total_len > ringbuf_total_data_sz(rb)) return -E2BIG; /* The sample must fit into a struct bpf_dynptr. */ err = bpf_dynptr_check_size(sample_len); if (err) return -E2BIG; if (flags & BPF_RINGBUF_DISCARD_BIT) { /* If the discard bit is set, the sample should be skipped. * * Update the consumer pos, and return -EAGAIN so the caller * knows to skip this sample and try to read the next one. */ smp_store_release(&rb->consumer_pos, cons_pos + total_len); return -EAGAIN; } if (flags & BPF_RINGBUF_BUSY_BIT) return -ENODATA; *sample = (void *)((uintptr_t)rb->data + (uintptr_t)((cons_pos + BPF_RINGBUF_HDR_SZ) & rb->mask)); *size = sample_len; return 0; } static void __bpf_user_ringbuf_sample_release(struct bpf_ringbuf *rb, size_t size, u64 flags) { u64 consumer_pos; u32 rounded_size = round_up(size + BPF_RINGBUF_HDR_SZ, 8); /* Using smp_load_acquire() is unnecessary here, as the busy-bit * prevents another task from writing to consumer_pos after it was read * by this task with smp_load_acquire() in __bpf_user_ringbuf_peek(). */ consumer_pos = rb->consumer_pos; /* Synchronizes with smp_load_acquire() in user-space producer. */ smp_store_release(&rb->consumer_pos, consumer_pos + rounded_size); } BPF_CALL_4(bpf_user_ringbuf_drain, struct bpf_map *, map, void *, callback_fn, void *, callback_ctx, u64, flags) { struct bpf_ringbuf *rb; long samples, discarded_samples = 0, ret = 0; bpf_callback_t callback = (bpf_callback_t)callback_fn; u64 wakeup_flags = BPF_RB_NO_WAKEUP | BPF_RB_FORCE_WAKEUP; int busy = 0; if (unlikely(flags & ~wakeup_flags)) return -EINVAL; rb = container_of(map, struct bpf_ringbuf_map, map)->rb; /* If another consumer is already consuming a sample, wait for them to finish. */ if (!atomic_try_cmpxchg(&rb->busy, &busy, 1)) return -EBUSY; for (samples = 0; samples < BPF_MAX_USER_RINGBUF_SAMPLES && ret == 0; samples++) { int err; u32 size; void *sample; struct bpf_dynptr_kern dynptr; err = __bpf_user_ringbuf_peek(rb, &sample, &size); if (err) { if (err == -ENODATA) { break; } else if (err == -EAGAIN) { discarded_samples++; continue; } else { ret = err; goto schedule_work_return; } } bpf_dynptr_init(&dynptr, sample, BPF_DYNPTR_TYPE_LOCAL, 0, size); ret = callback((uintptr_t)&dynptr, (uintptr_t)callback_ctx, 0, 0, 0); __bpf_user_ringbuf_sample_release(rb, size, flags); } ret = samples - discarded_samples; schedule_work_return: /* Prevent the clearing of the busy-bit from being reordered before the * storing of any rb consumer or producer positions. */ atomic_set_release(&rb->busy, 0); if (flags & BPF_RB_FORCE_WAKEUP) irq_work_queue(&rb->work); else if (!(flags & BPF_RB_NO_WAKEUP) && samples > 0) irq_work_queue(&rb->work); return ret; } const struct bpf_func_proto bpf_user_ringbuf_drain_proto = { .func = bpf_user_ringbuf_drain, .ret_type = RET_INTEGER, .arg1_type = ARG_CONST_MAP_PTR, .arg2_type = ARG_PTR_TO_FUNC, .arg3_type = ARG_PTR_TO_STACK_OR_NULL, .arg4_type = ARG_ANYTHING, };
110 1 110 2 144 146 146 1 81 61 1 1 1 140 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 // SPDX-License-Identifier: GPL-2.0 /* * linux/fs/hfsplus/wrapper.c * * Copyright (C) 2001 * Brad Boyer (flar@allandria.com) * (C) 2003 Ardis Technologies <roman@ardistech.com> * * Handling of HFS wrappers around HFS+ volumes */ #include <linux/fs.h> #include <linux/blkdev.h> #include <linux/cdrom.h> #include <linux/unaligned.h> #include "hfsplus_fs.h" #include "hfsplus_raw.h" struct hfsplus_wd { u32 ablk_size; u16 ablk_start; u16 embed_start; u16 embed_count; }; /** * hfsplus_submit_bio - Perform block I/O * @sb: super block of volume for I/O * @sector: block to read or write, for blocks of HFSPLUS_SECTOR_SIZE bytes * @buf: buffer for I/O * @data: output pointer for location of requested data * @opf: I/O operation type and flags * * The unit of I/O is hfsplus_min_io_size(sb), which may be bigger than * HFSPLUS_SECTOR_SIZE, and @buf must be sized accordingly. On reads * @data will return a pointer to the start of the requested sector, * which may not be the same location as @buf. * * If @sector is not aligned to the bdev logical block size it will * be rounded down. For writes this means that @buf should contain data * that starts at the rounded-down address. As long as the data was * read using hfsplus_submit_bio() and the same buffer is used things * will work correctly. * * Returns: %0 on success else -errno code */ int hfsplus_submit_bio(struct super_block *sb, sector_t sector, void *buf, void **data, blk_opf_t opf) { u64 io_size = hfsplus_min_io_size(sb); loff_t start = (loff_t)sector << HFSPLUS_SECTOR_SHIFT; int offset = start & (io_size - 1); if ((opf & REQ_OP_MASK) != REQ_OP_WRITE && data) *data = (u8 *)buf + offset; /* * Align sector to hardware sector size and find offset. We assume that * io_size is a power of two, which _should_ be true. */ sector &= ~((io_size >> HFSPLUS_SECTOR_SHIFT) - 1); return bdev_rw_virt(sb->s_bdev, sector, buf, io_size, opf); } static int hfsplus_read_mdb(void *bufptr, struct hfsplus_wd *wd) { u32 extent; u16 attrib; __be16 sig; sig = *(__be16 *)(bufptr + HFSP_WRAPOFF_EMBEDSIG); if (sig != cpu_to_be16(HFSPLUS_VOLHEAD_SIG) && sig != cpu_to_be16(HFSPLUS_VOLHEAD_SIGX)) return 0; attrib = be16_to_cpu(*(__be16 *)(bufptr + HFSP_WRAPOFF_ATTRIB)); if (!(attrib & HFSP_WRAP_ATTRIB_SLOCK) || !(attrib & HFSP_WRAP_ATTRIB_SPARED)) return 0; wd->ablk_size = be32_to_cpu(*(__be32 *)(bufptr + HFSP_WRAPOFF_ABLKSIZE)); if (wd->ablk_size < HFSPLUS_SECTOR_SIZE) return 0; if (wd->ablk_size % HFSPLUS_SECTOR_SIZE) return 0; wd->ablk_start = be16_to_cpu(*(__be16 *)(bufptr + HFSP_WRAPOFF_ABLKSTART)); extent = get_unaligned_be32(bufptr + HFSP_WRAPOFF_EMBEDEXT); wd->embed_start = (extent >> 16) & 0xFFFF; wd->embed_count = extent & 0xFFFF; return 1; } static int hfsplus_get_last_session(struct super_block *sb, sector_t *start, sector_t *size) { struct cdrom_device_info *cdi = disk_to_cdi(sb->s_bdev->bd_disk); /* default values */ *start = 0; *size = bdev_nr_sectors(sb->s_bdev); if (HFSPLUS_SB(sb)->session >= 0) { struct cdrom_tocentry te; if (!cdi) return -EINVAL; te.cdte_track = HFSPLUS_SB(sb)->session; te.cdte_format = CDROM_LBA; if (cdrom_read_tocentry(cdi, &te) || (te.cdte_ctrl & CDROM_DATA_TRACK) != 4) { pr_err("invalid session number or type of track\n"); return -EINVAL; } *start = (sector_t)te.cdte_addr.lba << 2; } else if (cdi) { struct cdrom_multisession ms_info; ms_info.addr_format = CDROM_LBA; if (cdrom_multisession(cdi, &ms_info) == 0 && ms_info.xa_flag) *start = (sector_t)ms_info.addr.lba << 2; } return 0; } /* Find the volume header and fill in some minimum bits in superblock */ /* Takes in super block, returns true if good data read */ int hfsplus_read_wrapper(struct super_block *sb) { struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb); struct hfsplus_wd wd; sector_t part_start, part_size; u32 blocksize; int error = 0; error = -EINVAL; blocksize = sb_min_blocksize(sb, HFSPLUS_SECTOR_SIZE); if (!blocksize) goto out; sbi->min_io_size = blocksize; if (hfsplus_get_last_session(sb, &part_start, &part_size)) goto out; error = -ENOMEM; sbi->s_vhdr_buf = kmalloc(hfsplus_min_io_size(sb), GFP_KERNEL); if (!sbi->s_vhdr_buf) goto out; sbi->s_backup_vhdr_buf = kmalloc(hfsplus_min_io_size(sb), GFP_KERNEL); if (!sbi->s_backup_vhdr_buf) goto out_free_vhdr; reread: error = hfsplus_submit_bio(sb, part_start + HFSPLUS_VOLHEAD_SECTOR, sbi->s_vhdr_buf, (void **)&sbi->s_vhdr, REQ_OP_READ); if (error) goto out_free_backup_vhdr; error = -EINVAL; switch (sbi->s_vhdr->signature) { case cpu_to_be16(HFSPLUS_VOLHEAD_SIGX): set_bit(HFSPLUS_SB_HFSX, &sbi->flags); fallthrough; case cpu_to_be16(HFSPLUS_VOLHEAD_SIG): break; case cpu_to_be16(HFSP_WRAP_MAGIC): if (!hfsplus_read_mdb(sbi->s_vhdr, &wd)) goto out_free_backup_vhdr; wd.ablk_size >>= HFSPLUS_SECTOR_SHIFT; part_start += (sector_t)wd.ablk_start + (sector_t)wd.embed_start * wd.ablk_size; part_size = (sector_t)wd.embed_count * wd.ablk_size; goto reread; default: /* * Check for a partition block. * * (should do this only for cdrom/loop though) */ if (hfs_part_find(sb, &part_start, &part_size)) goto out_free_backup_vhdr; goto reread; } error = hfsplus_submit_bio(sb, part_start + part_size - 2, sbi->s_backup_vhdr_buf, (void **)&sbi->s_backup_vhdr, REQ_OP_READ); if (error) goto out_free_backup_vhdr; error = -EINVAL; if (sbi->s_backup_vhdr->signature != sbi->s_vhdr->signature) { pr_warn("invalid secondary volume header\n"); goto out_free_backup_vhdr; } blocksize = be32_to_cpu(sbi->s_vhdr->blocksize); /* * Block size must be at least as large as a sector and a multiple of 2. */ if (blocksize < HFSPLUS_SECTOR_SIZE || ((blocksize - 1) & blocksize)) goto out_free_backup_vhdr; sbi->alloc_blksz = blocksize; sbi->alloc_blksz_shift = ilog2(blocksize); blocksize = min_t(u32, sbi->alloc_blksz, PAGE_SIZE); /* * Align block size to block offset. */ while (part_start & ((blocksize >> HFSPLUS_SECTOR_SHIFT) - 1)) blocksize >>= 1; if (sb_set_blocksize(sb, blocksize) != blocksize) { pr_err("unable to set blocksize to %u!\n", blocksize); goto out_free_backup_vhdr; } sbi->blockoffset = part_start >> (sb->s_blocksize_bits - HFSPLUS_SECTOR_SHIFT); sbi->part_start = part_start; sbi->sect_count = part_size; sbi->fs_shift = sbi->alloc_blksz_shift - sb->s_blocksize_bits; return 0; out_free_backup_vhdr: kfree(sbi->s_backup_vhdr_buf); out_free_vhdr: kfree(sbi->s_vhdr_buf); out: return error; }
94 74 37 16534 541 2911 480 32 26 38 30 2317 215 49 50 2252 5689 496 2465 2412 18 18 2018 2038 295 15154 1 1 4 10 576 6 34 8 1268 46 38 421 855 350 276 29 172 297 6 10 1 327 43 9 84 56 5 82 109 4 4 4 82 4 187 12 4 6 198 181 1 303 304 17715 19592 62 175 1169 62 1459 224 19591 43 377 17711 50 17748 222 17411 62 62 62 62 130 9 269 22 9 268 4 1420 313 152 275 15957 274 108 1232 872 872 1 905 906 71 92 101 14 1382 12 5 68 16177 123 97 27 156 4 16218 1290 271 94 1321 260 64 941 110 814 78 12 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 4716 4717 4718 4719 4720 4721 4722 4723 4724 4725 4726 4727 4728 4729 4730 4731 4732 4733 4734 4735 4736 4737 4738 4739 4740 4741 4742 4743 4744 4745 4746 4747 4748 4749 4750 4751 4752 4753 4754 4755 4756 4757 4758 4759 4760 4761 4762 4763 4764 4765 4766 4767 4768 4769 4770 4771 4772 4773 4774 4775 4776 4777 4778 4779 4780 4781 4782 4783 4784 4785 4786 4787 4788 4789 4790 4791 4792 4793 4794 4795 4796 4797 4798 4799 4800 4801 4802 4803 4804 4805 4806 4807 4808 4809 4810 4811 4812 4813 4814 4815 4816 4817 4818 4819 4820 4821 4822 4823 4824 4825 4826 4827 4828 4829 4830 4831 4832 4833 4834 4835 4836 4837 4838 4839 4840 4841 4842 4843 4844 4845 4846 4847 4848 4849 4850 4851 4852 4853 4854 4855 4856 4857 4858 4859 4860 4861 4862 4863 4864 4865 4866 4867 4868 4869 4870 4871 4872 4873 4874 4875 4876 4877 4878 4879 4880 4881 4882 4883 4884 4885 4886 4887 4888 4889 4890 4891 4892 4893 4894 4895 4896 4897 4898 4899 4900 4901 4902 4903 4904 4905 4906 4907 4908 4909 4910 4911 4912 4913 4914 4915 4916 4917 4918 4919 4920 4921 4922 4923 4924 4925 4926 4927 4928 4929 4930 4931 4932 4933 4934 4935 4936 4937 4938 4939 4940 4941 4942 4943 4944 4945 4946 4947 4948 4949 4950 4951 4952 4953 4954 4955 4956 4957 4958 4959 4960 4961 4962 4963 4964 4965 4966 4967 4968 4969 4970 4971 4972 4973 4974 4975 4976 4977 4978 4979 4980 4981 4982 4983 4984 4985 4986 4987 4988 4989 4990 4991 4992 4993 4994 4995 4996 4997 4998 4999 5000 5001 5002 5003 5004 5005 5006 5007 5008 5009 5010 5011 5012 5013 5014 5015 5016 5017 5018 5019 5020 5021 5022 5023 5024 5025 5026 5027 5028 5029 5030 5031 5032 5033 5034 5035 5036 5037 5038 5039 5040 5041 5042 5043 5044 5045 5046 5047 5048 5049 5050 5051 5052 5053 5054 5055 5056 5057 5058 5059 5060 5061 5062 5063 5064 5065 5066 5067 5068 5069 5070 5071 5072 5073 5074 5075 5076 5077 5078 5079 5080 5081 5082 5083 5084 5085 5086 5087 5088 5089 5090 5091 5092 5093 5094 5095 5096 5097 5098 5099 5100 5101 5102 5103 5104 5105 5106 5107 5108 5109 5110 5111 5112 5113 5114 5115 5116 5117 5118 5119 5120 5121 5122 5123 5124 5125 5126 5127 5128 5129 5130 5131 5132 5133 5134 5135 5136 5137 5138 5139 5140 5141 5142 5143 5144 5145 5146 5147 5148 5149 5150 5151 5152 5153 5154 5155 5156 5157 5158 5159 5160 5161 5162 5163 5164 5165 5166 5167 5168 5169 5170 5171 5172 5173 5174 5175 5176 5177 5178 5179 5180 5181 5182 5183 5184 5185 5186 5187 5188 5189 5190 5191 5192 5193 5194 5195 5196 5197 5198 5199 5200 5201 5202 5203 5204 5205 5206 5207 5208 5209 5210 5211 5212 5213 5214 5215 5216 5217 5218 5219 5220 5221 5222 5223 5224 5225 5226 5227 5228 5229 5230 5231 5232 5233 5234 5235 5236 5237 5238 5239 5240 5241 5242 5243 5244 5245 5246 5247 5248 5249 5250 5251 5252 5253 5254 5255 5256 5257 5258 5259 5260 5261 5262 5263 5264 5265 5266 5267 5268 5269 5270 5271 5272 5273 5274 5275 5276 5277 5278 5279 5280 5281 5282 5283 5284 5285 5286 5287 5288 5289 5290 5291 5292 5293 5294 5295 5296 5297 5298 5299 5300 5301 5302 5303 5304 5305 5306 5307 5308 5309 5310 5311 5312 5313 5314 5315 5316 5317 5318 5319 5320 5321 5322 5323 5324 5325 5326 5327 5328 5329 5330 5331 5332 5333 5334 5335 5336 5337 5338 5339 5340 5341 5342 5343 5344 5345 5346 5347 5348 5349 5350 5351 5352 5353 5354 5355 5356 5357 5358 5359 5360 5361 5362 5363 5364 5365 5366 5367 5368 5369 5370 5371 5372 5373 5374 5375 5376 5377 5378 5379 5380 5381 5382 5383 5384 5385 5386 5387 5388 5389 5390 5391 5392 5393 5394 5395 5396 5397 5398 5399 5400 5401 5402 5403 5404 5405 5406 5407 5408 5409 5410 5411 5412 5413 5414 5415 5416 5417 5418 5419 5420 5421 5422 5423 5424 5425 5426 5427 5428 5429 5430 5431 5432 5433 5434 5435 5436 5437 5438 5439 5440 5441 5442 5443 5444 5445 5446 5447 5448 5449 5450 5451 5452 5453 5454 5455 5456 5457 5458 5459 5460 5461 5462 5463 5464 5465 5466 5467 5468 5469 5470 5471 5472 5473 5474 5475 5476 5477 5478 5479 5480 5481 5482 5483 5484 5485 5486 5487 5488 5489 5490 5491 5492 5493 5494 5495 5496 5497 5498 5499 5500 5501 5502 5503 5504 5505 5506 5507 5508 5509 5510 5511 5512 5513 5514 5515 5516 5517 5518 5519 5520 5521 5522 5523 5524 5525 5526 5527 5528 5529 5530 5531 5532 5533 5534 5535 5536 5537 5538 5539 5540 5541 5542 5543 5544 5545 5546 5547 5548 5549 5550 5551 5552 5553 5554 5555 5556 5557 5558 5559 5560 5561 5562 5563 5564 5565 5566 5567 5568 5569 5570 5571 5572 5573 5574 5575 5576 5577 5578 5579 5580 5581 5582 5583 5584 5585 5586 5587 5588 5589 5590 5591 5592 5593 5594 5595 5596 5597 5598 5599 5600 5601 5602 5603 5604 5605 5606 5607 5608 5609 5610 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket * interface as the means of communication with the user level. * * Definitions for the Interfaces handler. * * Version: @(#)dev.h 1.0.10 08/12/93 * * Authors: Ross Biro * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> * Corey Minyard <wf-rch!minyard@relay.EU.net> * Donald J. Becker, <becker@cesdis.gsfc.nasa.gov> * Alan Cox, <alan@lxorguk.ukuu.org.uk> * Bjorn Ekwall. <bj0rn@blox.se> * Pekka Riikonen <priikone@poseidon.pspt.fi> * * Moved to /usr/include/linux for NET3 */ #ifndef _LINUX_NETDEVICE_H #define _LINUX_NETDEVICE_H #include <linux/timer.h> #include <linux/bug.h> #include <linux/delay.h> #include <linux/atomic.h> #include <linux/prefetch.h> #include <asm/cache.h> #include <asm/byteorder.h> #include <asm/local.h> #include <linux/percpu.h> #include <linux/rculist.h> #include <linux/workqueue.h> #include <linux/dynamic_queue_limits.h> #include <net/net_namespace.h> #ifdef CONFIG_DCB #include <net/dcbnl.h> #endif #include <net/netprio_cgroup.h> #include <linux/netdev_features.h> #include <linux/neighbour.h> #include <linux/netdevice_xmit.h> #include <uapi/linux/netdevice.h> #include <uapi/linux/if_bonding.h> #include <uapi/linux/pkt_cls.h> #include <uapi/linux/netdev.h> #include <linux/hashtable.h> #include <linux/rbtree.h> #include <net/net_trackers.h> #include <net/net_debug.h> #include <net/dropreason-core.h> #include <net/neighbour_tables.h> struct netpoll_info; struct device; struct ethtool_ops; struct kernel_hwtstamp_config; struct phy_device; struct dsa_port; struct ip_tunnel_parm_kern; struct macsec_context; struct macsec_ops; struct netdev_config; struct netdev_name_node; struct sd_flow_limit; struct sfp_bus; /* 802.11 specific */ struct wireless_dev; /* 802.15.4 specific */ struct wpan_dev; struct mpls_dev; /* UDP Tunnel offloads */ struct udp_tunnel_info; struct udp_tunnel_nic_info; struct udp_tunnel_nic; struct bpf_prog; struct xdp_buff; struct xdp_frame; struct xdp_metadata_ops; struct xdp_md; struct ethtool_netdev_state; struct phy_link_topology; struct hwtstamp_provider; typedef u32 xdp_features_t; void synchronize_net(void); void netdev_set_default_ethtool_ops(struct net_device *dev, const struct ethtool_ops *ops); void netdev_sw_irq_coalesce_default_on(struct net_device *dev); /* Backlog congestion levels */ #define NET_RX_SUCCESS 0 /* keep 'em coming, baby */ #define NET_RX_DROP 1 /* packet dropped */ #define MAX_NEST_DEV 8 /* * Transmit return codes: transmit return codes originate from three different * namespaces: * * - qdisc return codes * - driver transmit return codes * - errno values * * Drivers are allowed to return any one of those in their hard_start_xmit() * function. Real network devices commonly used with qdiscs should only return * the driver transmit return codes though - when qdiscs are used, the actual * transmission happens asynchronously, so the value is not propagated to * higher layers. Virtual network devices transmit synchronously; in this case * the driver transmit return codes are consumed by dev_queue_xmit(), and all * others are propagated to higher layers. */ /* qdisc ->enqueue() return codes. */ #define NET_XMIT_SUCCESS 0x00 #define NET_XMIT_DROP 0x01 /* skb dropped */ #define NET_XMIT_CN 0x02 /* congestion notification */ #define NET_XMIT_MASK 0x0f /* qdisc flags in net/sch_generic.h */ /* NET_XMIT_CN is special. It does not guarantee that this packet is lost. It * indicates that the device will soon be dropping packets, or already drops * some packets of the same priority; prompting us to send less aggressively. */ #define net_xmit_eval(e) ((e) == NET_XMIT_CN ? 0 : (e)) #define net_xmit_errno(e) ((e) != NET_XMIT_CN ? -ENOBUFS : 0) /* Driver transmit return codes */ #define NETDEV_TX_MASK 0xf0 enum netdev_tx { __NETDEV_TX_MIN = INT_MIN, /* make sure enum is signed */ NETDEV_TX_OK = 0x00, /* driver took care of packet */ NETDEV_TX_BUSY = 0x10, /* driver tx path was busy*/ }; typedef enum netdev_tx netdev_tx_t; /* * Current order: NETDEV_TX_MASK > NET_XMIT_MASK >= 0 is significant; * hard_start_xmit() return < NET_XMIT_MASK means skb was consumed. */ static inline bool dev_xmit_complete(int rc) { /* * Positive cases with an skb consumed by a driver: * - successful transmission (rc == NETDEV_TX_OK) * - error while transmitting (rc < 0) * - error while queueing to a different device (rc & NET_XMIT_MASK) */ if (likely(rc < NET_XMIT_MASK)) return true; return false; } /* * Compute the worst-case header length according to the protocols * used. */ #if defined(CONFIG_HYPERV_NET) # define LL_MAX_HEADER 128 #elif defined(CONFIG_WLAN) || IS_ENABLED(CONFIG_AX25) # if defined(CONFIG_MAC80211_MESH) # define LL_MAX_HEADER 128 # else # define LL_MAX_HEADER 96 # endif #else # define LL_MAX_HEADER 32 #endif #if !IS_ENABLED(CONFIG_NET_IPIP) && !IS_ENABLED(CONFIG_NET_IPGRE) && \ !IS_ENABLED(CONFIG_IPV6_SIT) && !IS_ENABLED(CONFIG_IPV6_TUNNEL) #define MAX_HEADER LL_MAX_HEADER #else #define MAX_HEADER (LL_MAX_HEADER + 48) #endif /* * Old network device statistics. Fields are native words * (unsigned long) so they can be read and written atomically. */ #define NET_DEV_STAT(FIELD) \ union { \ unsigned long FIELD; \ atomic_long_t __##FIELD; \ } struct net_device_stats { NET_DEV_STAT(rx_packets); NET_DEV_STAT(tx_packets); NET_DEV_STAT(rx_bytes); NET_DEV_STAT(tx_bytes); NET_DEV_STAT(rx_errors); NET_DEV_STAT(tx_errors); NET_DEV_STAT(rx_dropped); NET_DEV_STAT(tx_dropped); NET_DEV_STAT(multicast); NET_DEV_STAT(collisions); NET_DEV_STAT(rx_length_errors); NET_DEV_STAT(rx_over_errors); NET_DEV_STAT(rx_crc_errors); NET_DEV_STAT(rx_frame_errors); NET_DEV_STAT(rx_fifo_errors); NET_DEV_STAT(rx_missed_errors); NET_DEV_STAT(tx_aborted_errors); NET_DEV_STAT(tx_carrier_errors); NET_DEV_STAT(tx_fifo_errors); NET_DEV_STAT(tx_heartbeat_errors); NET_DEV_STAT(tx_window_errors); NET_DEV_STAT(rx_compressed); NET_DEV_STAT(tx_compressed); }; #undef NET_DEV_STAT /* per-cpu stats, allocated on demand. * Try to fit them in a single cache line, for dev_get_stats() sake. */ struct net_device_core_stats { unsigned long rx_dropped; unsigned long tx_dropped; unsigned long rx_nohandler; unsigned long rx_otherhost_dropped; } __aligned(4 * sizeof(unsigned long)); #include <linux/cache.h> #include <linux/skbuff.h> struct neighbour; struct neigh_parms; struct sk_buff; struct netdev_hw_addr { struct list_head list; struct rb_node node; unsigned char addr[MAX_ADDR_LEN]; unsigned char type; #define NETDEV_HW_ADDR_T_LAN 1 #define NETDEV_HW_ADDR_T_SAN 2 #define NETDEV_HW_ADDR_T_UNICAST 3 #define NETDEV_HW_ADDR_T_MULTICAST 4 bool global_use; int sync_cnt; int refcount; int synced; struct rcu_head rcu_head; }; struct netdev_hw_addr_list { struct list_head list; int count; /* Auxiliary tree for faster lookup on addition and deletion */ struct rb_root tree; }; #define netdev_hw_addr_list_count(l) ((l)->count) #define netdev_hw_addr_list_empty(l) (netdev_hw_addr_list_count(l) == 0) #define netdev_hw_addr_list_for_each(ha, l) \ list_for_each_entry(ha, &(l)->list, list) #define netdev_uc_count(dev) netdev_hw_addr_list_count(&(dev)->uc) #define netdev_uc_empty(dev) netdev_hw_addr_list_empty(&(dev)->uc) #define netdev_for_each_uc_addr(ha, dev) \ netdev_hw_addr_list_for_each(ha, &(dev)->uc) #define netdev_for_each_synced_uc_addr(_ha, _dev) \ netdev_for_each_uc_addr((_ha), (_dev)) \ if ((_ha)->sync_cnt) #define netdev_mc_count(dev) netdev_hw_addr_list_count(&(dev)->mc) #define netdev_mc_empty(dev) netdev_hw_addr_list_empty(&(dev)->mc) #define netdev_for_each_mc_addr(ha, dev) \ netdev_hw_addr_list_for_each(ha, &(dev)->mc) #define netdev_for_each_synced_mc_addr(_ha, _dev) \ netdev_for_each_mc_addr((_ha), (_dev)) \ if ((_ha)->sync_cnt) struct hh_cache { unsigned int hh_len; seqlock_t hh_lock; /* cached hardware header; allow for machine alignment needs. */ #define HH_DATA_MOD 16 #define HH_DATA_OFF(__len) \ (HH_DATA_MOD - (((__len - 1) & (HH_DATA_MOD - 1)) + 1)) #define HH_DATA_ALIGN(__len) \ (((__len)+(HH_DATA_MOD-1))&~(HH_DATA_MOD - 1)) unsigned long hh_data[HH_DATA_ALIGN(LL_MAX_HEADER) / sizeof(long)]; }; /* Reserve HH_DATA_MOD byte-aligned hard_header_len, but at least that much. * Alternative is: * dev->hard_header_len ? (dev->hard_header_len + * (HH_DATA_MOD - 1)) & ~(HH_DATA_MOD - 1) : 0 * * We could use other alignment values, but we must maintain the * relationship HH alignment <= LL alignment. */ #define LL_RESERVED_SPACE(dev) \ ((((dev)->hard_header_len + READ_ONCE((dev)->needed_headroom)) \ & ~(HH_DATA_MOD - 1)) + HH_DATA_MOD) #define LL_RESERVED_SPACE_EXTRA(dev,extra) \ ((((dev)->hard_header_len + READ_ONCE((dev)->needed_headroom) + (extra)) \ & ~(HH_DATA_MOD - 1)) + HH_DATA_MOD) struct header_ops { int (*create) (struct sk_buff *skb, struct net_device *dev, unsigned short type, const void *daddr, const void *saddr, unsigned int len); int (*parse)(const struct sk_buff *skb, unsigned char *haddr); int (*cache)(const struct neighbour *neigh, struct hh_cache *hh, __be16 type); void (*cache_update)(struct hh_cache *hh, const struct net_device *dev, const unsigned char *haddr); bool (*validate)(const char *ll_header, unsigned int len); __be16 (*parse_protocol)(const struct sk_buff *skb); }; /* These flag bits are private to the generic network queueing * layer; they may not be explicitly referenced by any other * code. */ enum netdev_state_t { __LINK_STATE_START, __LINK_STATE_PRESENT, __LINK_STATE_NOCARRIER, __LINK_STATE_LINKWATCH_PENDING, __LINK_STATE_DORMANT, __LINK_STATE_TESTING, }; struct gro_list { struct list_head list; int count; }; /* * size of gro hash buckets, must be <= the number of bits in * gro_node::bitmask */ #define GRO_HASH_BUCKETS 8 /** * struct gro_node - structure to support Generic Receive Offload * @bitmask: bitmask to indicate used buckets in @hash * @hash: hashtable of pending aggregated skbs, separated by flows * @rx_list: list of pending ``GRO_NORMAL`` skbs * @rx_count: cached current length of @rx_list * @cached_napi_id: napi_struct::napi_id cached for hotpath, 0 for standalone */ struct gro_node { unsigned long bitmask; struct gro_list hash[GRO_HASH_BUCKETS]; struct list_head rx_list; u32 rx_count; u32 cached_napi_id; }; /* * Structure for per-NAPI config */ struct napi_config { u64 gro_flush_timeout; u64 irq_suspend_timeout; u32 defer_hard_irqs; cpumask_t affinity_mask; u8 threaded; unsigned int napi_id; }; /* * Structure for NAPI scheduling similar to tasklet but with weighting */ struct napi_struct { /* The poll_list must only be managed by the entity which * changes the state of the NAPI_STATE_SCHED bit. This means * whoever atomically sets that bit can add this napi_struct * to the per-CPU poll_list, and whoever clears that bit * can remove from the list right before clearing the bit. */ struct list_head poll_list; unsigned long state; int weight; u32 defer_hard_irqs_count; int (*poll)(struct napi_struct *, int); #ifdef CONFIG_NETPOLL /* CPU actively polling if netpoll is configured */ int poll_owner; #endif /* CPU on which NAPI has been scheduled for processing */ int list_owner; struct net_device *dev; struct sk_buff *skb; struct gro_node gro; struct hrtimer timer; /* all fields past this point are write-protected by netdev_lock */ struct task_struct *thread; unsigned long gro_flush_timeout; unsigned long irq_suspend_timeout; u32 defer_hard_irqs; /* control-path-only fields follow */ u32 napi_id; struct list_head dev_list; struct hlist_node napi_hash_node; int irq; struct irq_affinity_notify notify; int napi_rmap_idx; int index; struct napi_config *config; }; enum { NAPI_STATE_SCHED, /* Poll is scheduled */ NAPI_STATE_MISSED, /* reschedule a napi */ NAPI_STATE_DISABLE, /* Disable pending */ NAPI_STATE_NPSVC, /* Netpoll - don't dequeue from poll_list */ NAPI_STATE_LISTED, /* NAPI added to system lists */ NAPI_STATE_NO_BUSY_POLL, /* Do not add in napi_hash, no busy polling */ NAPI_STATE_IN_BUSY_POLL, /* sk_busy_loop() owns this NAPI */ NAPI_STATE_PREFER_BUSY_POLL, /* prefer busy-polling over softirq processing*/ NAPI_STATE_THREADED, /* The poll is performed inside its own thread*/ NAPI_STATE_SCHED_THREADED, /* Napi is currently scheduled in threaded mode */ NAPI_STATE_HAS_NOTIFIER, /* Napi has an IRQ notifier */ }; enum { NAPIF_STATE_SCHED = BIT(NAPI_STATE_SCHED), NAPIF_STATE_MISSED = BIT(NAPI_STATE_MISSED), NAPIF_STATE_DISABLE = BIT(NAPI_STATE_DISABLE), NAPIF_STATE_NPSVC = BIT(NAPI_STATE_NPSVC), NAPIF_STATE_LISTED = BIT(NAPI_STATE_LISTED), NAPIF_STATE_NO_BUSY_POLL = BIT(NAPI_STATE_NO_BUSY_POLL), NAPIF_STATE_IN_BUSY_POLL = BIT(NAPI_STATE_IN_BUSY_POLL), NAPIF_STATE_PREFER_BUSY_POLL = BIT(NAPI_STATE_PREFER_BUSY_POLL), NAPIF_STATE_THREADED = BIT(NAPI_STATE_THREADED), NAPIF_STATE_SCHED_THREADED = BIT(NAPI_STATE_SCHED_THREADED), NAPIF_STATE_HAS_NOTIFIER = BIT(NAPI_STATE_HAS_NOTIFIER), }; enum gro_result { GRO_MERGED, GRO_MERGED_FREE, GRO_HELD, GRO_NORMAL, GRO_CONSUMED, }; typedef enum gro_result gro_result_t; /* * enum rx_handler_result - Possible return values for rx_handlers. * @RX_HANDLER_CONSUMED: skb was consumed by rx_handler, do not process it * further. * @RX_HANDLER_ANOTHER: Do another round in receive path. This is indicated in * case skb->dev was changed by rx_handler. * @RX_HANDLER_EXACT: Force exact delivery, no wildcard. * @RX_HANDLER_PASS: Do nothing, pass the skb as if no rx_handler was called. * * rx_handlers are functions called from inside __netif_receive_skb(), to do * special processing of the skb, prior to delivery to protocol handlers. * * Currently, a net_device can only have a single rx_handler registered. Trying * to register a second rx_handler will return -EBUSY. * * To register a rx_handler on a net_device, use netdev_rx_handler_register(). * To unregister a rx_handler on a net_device, use * netdev_rx_handler_unregister(). * * Upon return, rx_handler is expected to tell __netif_receive_skb() what to * do with the skb. * * If the rx_handler consumed the skb in some way, it should return * RX_HANDLER_CONSUMED. This is appropriate when the rx_handler arranged for * the skb to be delivered in some other way. * * If the rx_handler changed skb->dev, to divert the skb to another * net_device, it should return RX_HANDLER_ANOTHER. The rx_handler for the * new device will be called if it exists. * * If the rx_handler decides the skb should be ignored, it should return * RX_HANDLER_EXACT. The skb will only be delivered to protocol handlers that * are registered on exact device (ptype->dev == skb->dev). * * If the rx_handler didn't change skb->dev, but wants the skb to be normally * delivered, it should return RX_HANDLER_PASS. * * A device without a registered rx_handler will behave as if rx_handler * returned RX_HANDLER_PASS. */ enum rx_handler_result { RX_HANDLER_CONSUMED, RX_HANDLER_ANOTHER, RX_HANDLER_EXACT, RX_HANDLER_PASS, }; typedef enum rx_handler_result rx_handler_result_t; typedef rx_handler_result_t rx_handler_func_t(struct sk_buff **pskb); void __napi_schedule(struct napi_struct *n); void __napi_schedule_irqoff(struct napi_struct *n); static inline bool napi_disable_pending(struct napi_struct *n) { return test_bit(NAPI_STATE_DISABLE, &n->state); } static inline bool napi_prefer_busy_poll(struct napi_struct *n) { return test_bit(NAPI_STATE_PREFER_BUSY_POLL, &n->state); } /** * napi_is_scheduled - test if NAPI is scheduled * @n: NAPI context * * This check is "best-effort". With no locking implemented, * a NAPI can be scheduled or terminate right after this check * and produce not precise results. * * NAPI_STATE_SCHED is an internal state, napi_is_scheduled * should not be used normally and napi_schedule should be * used instead. * * Use only if the driver really needs to check if a NAPI * is scheduled for example in the context of delayed timer * that can be skipped if a NAPI is already scheduled. * * Return: True if NAPI is scheduled, False otherwise. */ static inline bool napi_is_scheduled(struct napi_struct *n) { return test_bit(NAPI_STATE_SCHED, &n->state); } bool napi_schedule_prep(struct napi_struct *n); /** * napi_schedule - schedule NAPI poll * @n: NAPI context * * Schedule NAPI poll routine to be called if it is not already * running. * Return: true if we schedule a NAPI or false if not. * Refer to napi_schedule_prep() for additional reason on why * a NAPI might not be scheduled. */ static inline bool napi_schedule(struct napi_struct *n) { if (napi_schedule_prep(n)) { __napi_schedule(n); return true; } return false; } /** * napi_schedule_irqoff - schedule NAPI poll * @n: NAPI context * * Variant of napi_schedule(), assuming hard irqs are masked. */ static inline void napi_schedule_irqoff(struct napi_struct *n) { if (napi_schedule_prep(n)) __napi_schedule_irqoff(n); } /** * napi_complete_done - NAPI processing complete * @n: NAPI context * @work_done: number of packets processed * * Mark NAPI processing as complete. Should only be called if poll budget * has not been completely consumed. * Prefer over napi_complete(). * Return: false if device should avoid rearming interrupts. */ bool napi_complete_done(struct napi_struct *n, int work_done); static inline bool napi_complete(struct napi_struct *n) { return napi_complete_done(n, 0); } void netif_threaded_enable(struct net_device *dev); int dev_set_threaded(struct net_device *dev, enum netdev_napi_threaded threaded); void napi_disable(struct napi_struct *n); void napi_disable_locked(struct napi_struct *n); void napi_enable(struct napi_struct *n); void napi_enable_locked(struct napi_struct *n); /** * napi_synchronize - wait until NAPI is not running * @n: NAPI context * * Wait until NAPI is done being scheduled on this context. * Waits till any outstanding processing completes but * does not disable future activations. */ static inline void napi_synchronize(const struct napi_struct *n) { if (IS_ENABLED(CONFIG_SMP)) while (test_bit(NAPI_STATE_SCHED, &n->state)) msleep(1); else barrier(); } /** * napi_if_scheduled_mark_missed - if napi is running, set the * NAPIF_STATE_MISSED * @n: NAPI context * * If napi is running, set the NAPIF_STATE_MISSED, and return true if * NAPI is scheduled. **/ static inline bool napi_if_scheduled_mark_missed(struct napi_struct *n) { unsigned long val, new; val = READ_ONCE(n->state); do { if (val & NAPIF_STATE_DISABLE) return true; if (!(val & NAPIF_STATE_SCHED)) return false; new = val | NAPIF_STATE_MISSED; } while (!try_cmpxchg(&n->state, &val, new)); return true; } enum netdev_queue_state_t { __QUEUE_STATE_DRV_XOFF, __QUEUE_STATE_STACK_XOFF, __QUEUE_STATE_FROZEN, }; #define QUEUE_STATE_DRV_XOFF (1 << __QUEUE_STATE_DRV_XOFF) #define QUEUE_STATE_STACK_XOFF (1 << __QUEUE_STATE_STACK_XOFF) #define QUEUE_STATE_FROZEN (1 << __QUEUE_STATE_FROZEN) #define QUEUE_STATE_ANY_XOFF (QUEUE_STATE_DRV_XOFF | QUEUE_STATE_STACK_XOFF) #define QUEUE_STATE_ANY_XOFF_OR_FROZEN (QUEUE_STATE_ANY_XOFF | \ QUEUE_STATE_FROZEN) #define QUEUE_STATE_DRV_XOFF_OR_FROZEN (QUEUE_STATE_DRV_XOFF | \ QUEUE_STATE_FROZEN) /* * __QUEUE_STATE_DRV_XOFF is used by drivers to stop the transmit queue. The * netif_tx_* functions below are used to manipulate this flag. The * __QUEUE_STATE_STACK_XOFF flag is used by the stack to stop the transmit * queue independently. The netif_xmit_*stopped functions below are called * to check if the queue has been stopped by the driver or stack (either * of the XOFF bits are set in the state). Drivers should not need to call * netif_xmit*stopped functions, they should only be using netif_tx_*. */ struct netdev_queue { /* * read-mostly part */ struct net_device *dev; netdevice_tracker dev_tracker; struct Qdisc __rcu *qdisc; struct Qdisc __rcu *qdisc_sleeping; #ifdef CONFIG_SYSFS struct kobject kobj; const struct attribute_group **groups; #endif unsigned long tx_maxrate; /* * Number of TX timeouts for this queue * (/sys/class/net/DEV/Q/trans_timeout) */ atomic_long_t trans_timeout; /* Subordinate device that the queue has been assigned to */ struct net_device *sb_dev; #ifdef CONFIG_XDP_SOCKETS /* "ops protected", see comment about net_device::lock */ struct xsk_buff_pool *pool; #endif /* * write-mostly part */ #ifdef CONFIG_BQL struct dql dql; #endif spinlock_t _xmit_lock ____cacheline_aligned_in_smp; int xmit_lock_owner; /* * Time (in jiffies) of last Tx */ unsigned long trans_start; unsigned long state; /* * slow- / control-path part */ /* NAPI instance for the queue * "ops protected", see comment about net_device::lock */ struct napi_struct *napi; #if defined(CONFIG_XPS) && defined(CONFIG_NUMA) int numa_node; #endif } ____cacheline_aligned_in_smp; extern int sysctl_fb_tunnels_only_for_init_net; extern int sysctl_devconf_inherit_init_net; /* * sysctl_fb_tunnels_only_for_init_net == 0 : For all netns * == 1 : For initns only * == 2 : For none. */ static inline bool net_has_fallback_tunnels(const struct net *net) { #if IS_ENABLED(CONFIG_SYSCTL) int fb_tunnels_only_for_init_net = READ_ONCE(sysctl_fb_tunnels_only_for_init_net); return !fb_tunnels_only_for_init_net || (net_eq(net, &init_net) && fb_tunnels_only_for_init_net == 1); #else return true; #endif } static inline int net_inherit_devconf(void) { #if IS_ENABLED(CONFIG_SYSCTL) return READ_ONCE(sysctl_devconf_inherit_init_net); #else return 0; #endif } static inline int netdev_queue_numa_node_read(const struct netdev_queue *q) { #if defined(CONFIG_XPS) && defined(CONFIG_NUMA) return q->numa_node; #else return NUMA_NO_NODE; #endif } static inline void netdev_queue_numa_node_write(struct netdev_queue *q, int node) { #if defined(CONFIG_XPS) && defined(CONFIG_NUMA) q->numa_node = node; #endif } #ifdef CONFIG_RFS_ACCEL bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index, u32 flow_id, u16 filter_id); #endif /* XPS map type and offset of the xps map within net_device->xps_maps[]. */ enum xps_map_type { XPS_CPUS = 0, XPS_RXQS, XPS_MAPS_MAX, }; #ifdef CONFIG_XPS /* * This structure holds an XPS map which can be of variable length. The * map is an array of queues. */ struct xps_map { unsigned int len; unsigned int alloc_len; struct rcu_head rcu; u16 queues[]; }; #define XPS_MAP_SIZE(_num) (sizeof(struct xps_map) + ((_num) * sizeof(u16))) #define XPS_MIN_MAP_ALLOC ((L1_CACHE_ALIGN(offsetof(struct xps_map, queues[1])) \ - sizeof(struct xps_map)) / sizeof(u16)) /* * This structure holds all XPS maps for device. Maps are indexed by CPU. * * We keep track of the number of cpus/rxqs used when the struct is allocated, * in nr_ids. This will help not accessing out-of-bound memory. * * We keep track of the number of traffic classes used when the struct is * allocated, in num_tc. This will be used to navigate the maps, to ensure we're * not crossing its upper bound, as the original dev->num_tc can be updated in * the meantime. */ struct xps_dev_maps { struct rcu_head rcu; unsigned int nr_ids; s16 num_tc; struct xps_map __rcu *attr_map[]; /* Either CPUs map or RXQs map */ }; #define XPS_CPU_DEV_MAPS_SIZE(_tcs) (sizeof(struct xps_dev_maps) + \ (nr_cpu_ids * (_tcs) * sizeof(struct xps_map *))) #define XPS_RXQ_DEV_MAPS_SIZE(_tcs, _rxqs) (sizeof(struct xps_dev_maps) +\ (_rxqs * (_tcs) * sizeof(struct xps_map *))) #endif /* CONFIG_XPS */ #define TC_MAX_QUEUE 16 #define TC_BITMASK 15 /* HW offloaded queuing disciplines txq count and offset maps */ struct netdev_tc_txq { u16 count; u16 offset; }; #if defined(CONFIG_FCOE) || defined(CONFIG_FCOE_MODULE) /* * This structure is to hold information about the device * configured to run FCoE protocol stack. */ struct netdev_fcoe_hbainfo { char manufacturer[64]; char serial_number[64]; char hardware_version[64]; char driver_version[64]; char optionrom_version[64]; char firmware_version[64]; char model[256]; char model_description[256]; }; #endif #define MAX_PHYS_ITEM_ID_LEN 32 /* This structure holds a unique identifier to identify some * physical item (port for example) used by a netdevice. */ struct netdev_phys_item_id { unsigned char id[MAX_PHYS_ITEM_ID_LEN]; unsigned char id_len; }; static inline bool netdev_phys_item_id_same(struct netdev_phys_item_id *a, struct netdev_phys_item_id *b) { return a->id_len == b->id_len && memcmp(a->id, b->id, a->id_len) == 0; } typedef u16 (*select_queue_fallback_t)(struct net_device *dev, struct sk_buff *skb, struct net_device *sb_dev); enum net_device_path_type { DEV_PATH_ETHERNET = 0, DEV_PATH_VLAN, DEV_PATH_BRIDGE, DEV_PATH_PPPOE, DEV_PATH_DSA, DEV_PATH_MTK_WDMA, }; struct net_device_path { enum net_device_path_type type; const struct net_device *dev; union { struct { u16 id; __be16 proto; u8 h_dest[ETH_ALEN]; } encap; struct { enum { DEV_PATH_BR_VLAN_KEEP, DEV_PATH_BR_VLAN_TAG, DEV_PATH_BR_VLAN_UNTAG, DEV_PATH_BR_VLAN_UNTAG_HW, } vlan_mode; u16 vlan_id; __be16 vlan_proto; } bridge; struct { int port; u16 proto; } dsa; struct { u8 wdma_idx; u8 queue; u16 wcid; u8 bss; u8 amsdu; } mtk_wdma; }; }; #define NET_DEVICE_PATH_STACK_MAX 5 #define NET_DEVICE_PATH_VLAN_MAX 2 struct net_device_path_stack { int num_paths; struct net_device_path path[NET_DEVICE_PATH_STACK_MAX]; }; struct net_device_path_ctx { const struct net_device *dev; u8 daddr[ETH_ALEN]; int num_vlans; struct { u16 id; __be16 proto; } vlan[NET_DEVICE_PATH_VLAN_MAX]; }; enum tc_setup_type { TC_QUERY_CAPS, TC_SETUP_QDISC_MQPRIO, TC_SETUP_CLSU32, TC_SETUP_CLSFLOWER, TC_SETUP_CLSMATCHALL, TC_SETUP_CLSBPF, TC_SETUP_BLOCK, TC_SETUP_QDISC_CBS, TC_SETUP_QDISC_RED, TC_SETUP_QDISC_PRIO, TC_SETUP_QDISC_MQ, TC_SETUP_QDISC_ETF, TC_SETUP_ROOT_QDISC, TC_SETUP_QDISC_GRED, TC_SETUP_QDISC_TAPRIO, TC_SETUP_FT, TC_SETUP_QDISC_ETS, TC_SETUP_QDISC_TBF, TC_SETUP_QDISC_FIFO, TC_SETUP_QDISC_HTB, TC_SETUP_ACT, }; /* These structures hold the attributes of bpf state that are being passed * to the netdevice through the bpf op. */ enum bpf_netdev_command { /* Set or clear a bpf program used in the earliest stages of packet * rx. The prog will have been loaded as BPF_PROG_TYPE_XDP. The callee * is responsible for calling bpf_prog_put on any old progs that are * stored. In case of error, the callee need not release the new prog * reference, but on success it takes ownership and must bpf_prog_put * when it is no longer used. */ XDP_SETUP_PROG, XDP_SETUP_PROG_HW, /* BPF program for offload callbacks, invoked at program load time. */ BPF_OFFLOAD_MAP_ALLOC, BPF_OFFLOAD_MAP_FREE, XDP_SETUP_XSK_POOL, }; struct bpf_prog_offload_ops; struct netlink_ext_ack; struct xdp_umem; struct xdp_dev_bulk_queue; struct bpf_xdp_link; enum bpf_xdp_mode { XDP_MODE_SKB = 0, XDP_MODE_DRV = 1, XDP_MODE_HW = 2, __MAX_XDP_MODE }; struct bpf_xdp_entity { struct bpf_prog *prog; struct bpf_xdp_link *link; }; struct netdev_bpf { enum bpf_netdev_command command; union { /* XDP_SETUP_PROG */ struct { u32 flags; struct bpf_prog *prog; struct netlink_ext_ack *extack; }; /* BPF_OFFLOAD_MAP_ALLOC, BPF_OFFLOAD_MAP_FREE */ struct { struct bpf_offloaded_map *offmap; }; /* XDP_SETUP_XSK_POOL */ struct { struct xsk_buff_pool *pool; u16 queue_id; } xsk; }; }; /* Flags for ndo_xsk_wakeup. */ #define XDP_WAKEUP_RX (1 << 0) #define XDP_WAKEUP_TX (1 << 1) #ifdef CONFIG_XFRM_OFFLOAD struct xfrmdev_ops { int (*xdo_dev_state_add)(struct net_device *dev, struct xfrm_state *x, struct netlink_ext_ack *extack); void (*xdo_dev_state_delete)(struct net_device *dev, struct xfrm_state *x); void (*xdo_dev_state_free)(struct net_device *dev, struct xfrm_state *x); bool (*xdo_dev_offload_ok) (struct sk_buff *skb, struct xfrm_state *x); void (*xdo_dev_state_advance_esn) (struct xfrm_state *x); void (*xdo_dev_state_update_stats) (struct xfrm_state *x); int (*xdo_dev_policy_add) (struct xfrm_policy *x, struct netlink_ext_ack *extack); void (*xdo_dev_policy_delete) (struct xfrm_policy *x); void (*xdo_dev_policy_free) (struct xfrm_policy *x); }; #endif struct dev_ifalias { struct rcu_head rcuhead; char ifalias[]; }; struct devlink; struct tlsdev_ops; struct netdev_net_notifier { struct list_head list; struct notifier_block *nb; }; /* * This structure defines the management hooks for network devices. * The following hooks can be defined; unless noted otherwise, they are * optional and can be filled with a null pointer. * * int (*ndo_init)(struct net_device *dev); * This function is called once when a network device is registered. * The network device can use this for any late stage initialization * or semantic validation. It can fail with an error code which will * be propagated back to register_netdev. * * void (*ndo_uninit)(struct net_device *dev); * This function is called when device is unregistered or when registration * fails. It is not called if init fails. * * int (*ndo_open)(struct net_device *dev); * This function is called when a network device transitions to the up * state. * * int (*ndo_stop)(struct net_device *dev); * This function is called when a network device transitions to the down * state. * * netdev_tx_t (*ndo_start_xmit)(struct sk_buff *skb, * struct net_device *dev); * Called when a packet needs to be transmitted. * Returns NETDEV_TX_OK. Can return NETDEV_TX_BUSY, but you should stop * the queue before that can happen; it's for obsolete devices and weird * corner cases, but the stack really does a non-trivial amount * of useless work if you return NETDEV_TX_BUSY. * Required; cannot be NULL. * * netdev_features_t (*ndo_features_check)(struct sk_buff *skb, * struct net_device *dev * netdev_features_t features); * Called by core transmit path to determine if device is capable of * performing offload operations on a given packet. This is to give * the device an opportunity to implement any restrictions that cannot * be otherwise expressed by feature flags. The check is called with * the set of features that the stack has calculated and it returns * those the driver believes to be appropriate. * * u16 (*ndo_select_queue)(struct net_device *dev, struct sk_buff *skb, * struct net_device *sb_dev); * Called to decide which queue to use when device supports multiple * transmit queues. * * void (*ndo_change_rx_flags)(struct net_device *dev, int flags); * This function is called to allow device receiver to make * changes to configuration when multicast or promiscuous is enabled. * * void (*ndo_set_rx_mode)(struct net_device *dev); * This function is called device changes address list filtering. * If driver handles unicast address filtering, it should set * IFF_UNICAST_FLT in its priv_flags. * * int (*ndo_set_mac_address)(struct net_device *dev, void *addr); * This function is called when the Media Access Control address * needs to be changed. If this interface is not defined, the * MAC address can not be changed. * * int (*ndo_validate_addr)(struct net_device *dev); * Test if Media Access Control address is valid for the device. * * int (*ndo_do_ioctl)(struct net_device *dev, struct ifreq *ifr, int cmd); * Old-style ioctl entry point. This is used internally by the * ieee802154 subsystem but is no longer called by the device * ioctl handler. * * int (*ndo_siocbond)(struct net_device *dev, struct ifreq *ifr, int cmd); * Used by the bonding driver for its device specific ioctls: * SIOCBONDENSLAVE, SIOCBONDRELEASE, SIOCBONDSETHWADDR, SIOCBONDCHANGEACTIVE, * SIOCBONDSLAVEINFOQUERY, and SIOCBONDINFOQUERY * * * int (*ndo_eth_ioctl)(struct net_device *dev, struct ifreq *ifr, int cmd); * Called for ethernet specific ioctls: SIOCGMIIPHY, SIOCGMIIREG, * SIOCSMIIREG, SIOCSHWTSTAMP and SIOCGHWTSTAMP. * * int (*ndo_set_config)(struct net_device *dev, struct ifmap *map); * Used to set network devices bus interface parameters. This interface * is retained for legacy reasons; new devices should use the bus * interface (PCI) for low level management. * * int (*ndo_change_mtu)(struct net_device *dev, int new_mtu); * Called when a user wants to change the Maximum Transfer Unit * of a device. * * void (*ndo_tx_timeout)(struct net_device *dev, unsigned int txqueue); * Callback used when the transmitter has not made any progress * for dev->watchdog ticks. * * void (*ndo_get_stats64)(struct net_device *dev, * struct rtnl_link_stats64 *storage); * struct net_device_stats* (*ndo_get_stats)(struct net_device *dev); * Called when a user wants to get the network device usage * statistics. Drivers must do one of the following: * 1. Define @ndo_get_stats64 to fill in a zero-initialised * rtnl_link_stats64 structure passed by the caller. * 2. Define @ndo_get_stats to update a net_device_stats structure * (which should normally be dev->stats) and return a pointer to * it. The structure may be changed asynchronously only if each * field is written atomically. * 3. Update dev->stats asynchronously and atomically, and define * neither operation. * * bool (*ndo_has_offload_stats)(const struct net_device *dev, int attr_id) * Return true if this device supports offload stats of this attr_id. * * int (*ndo_get_offload_stats)(int attr_id, const struct net_device *dev, * void *attr_data) * Get statistics for offload operations by attr_id. Write it into the * attr_data pointer. * * int (*ndo_vlan_rx_add_vid)(struct net_device *dev, __be16 proto, u16 vid); * If device supports VLAN filtering this function is called when a * VLAN id is registered. * * int (*ndo_vlan_rx_kill_vid)(struct net_device *dev, __be16 proto, u16 vid); * If device supports VLAN filtering this function is called when a * VLAN id is unregistered. * * void (*ndo_poll_controller)(struct net_device *dev); * * SR-IOV management functions. * int (*ndo_set_vf_mac)(struct net_device *dev, int vf, u8* mac); * int (*ndo_set_vf_vlan)(struct net_device *dev, int vf, u16 vlan, * u8 qos, __be16 proto); * int (*ndo_set_vf_rate)(struct net_device *dev, int vf, int min_tx_rate, * int max_tx_rate); * int (*ndo_set_vf_spoofchk)(struct net_device *dev, int vf, bool setting); * int (*ndo_set_vf_trust)(struct net_device *dev, int vf, bool setting); * int (*ndo_get_vf_config)(struct net_device *dev, * int vf, struct ifla_vf_info *ivf); * int (*ndo_set_vf_link_state)(struct net_device *dev, int vf, int link_state); * int (*ndo_set_vf_port)(struct net_device *dev, int vf, * struct nlattr *port[]); * * Enable or disable the VF ability to query its RSS Redirection Table and * Hash Key. This is needed since on some devices VF share this information * with PF and querying it may introduce a theoretical security risk. * int (*ndo_set_vf_rss_query_en)(struct net_device *dev, int vf, bool setting); * int (*ndo_get_vf_port)(struct net_device *dev, int vf, struct sk_buff *skb); * int (*ndo_setup_tc)(struct net_device *dev, enum tc_setup_type type, * void *type_data); * Called to setup any 'tc' scheduler, classifier or action on @dev. * This is always called from the stack with the rtnl lock held and netif * tx queues stopped. This allows the netdevice to perform queue * management safely. * * Fiber Channel over Ethernet (FCoE) offload functions. * int (*ndo_fcoe_enable)(struct net_device *dev); * Called when the FCoE protocol stack wants to start using LLD for FCoE * so the underlying device can perform whatever needed configuration or * initialization to support acceleration of FCoE traffic. * * int (*ndo_fcoe_disable)(struct net_device *dev); * Called when the FCoE protocol stack wants to stop using LLD for FCoE * so the underlying device can perform whatever needed clean-ups to * stop supporting acceleration of FCoE traffic. * * int (*ndo_fcoe_ddp_setup)(struct net_device *dev, u16 xid, * struct scatterlist *sgl, unsigned int sgc); * Called when the FCoE Initiator wants to initialize an I/O that * is a possible candidate for Direct Data Placement (DDP). The LLD can * perform necessary setup and returns 1 to indicate the device is set up * successfully to perform DDP on this I/O, otherwise this returns 0. * * int (*ndo_fcoe_ddp_done)(struct net_device *dev, u16 xid); * Called when the FCoE Initiator/Target is done with the DDPed I/O as * indicated by the FC exchange id 'xid', so the underlying device can * clean up and reuse resources for later DDP requests. * * int (*ndo_fcoe_ddp_target)(struct net_device *dev, u16 xid, * struct scatterlist *sgl, unsigned int sgc); * Called when the FCoE Target wants to initialize an I/O that * is a possible candidate for Direct Data Placement (DDP). The LLD can * perform necessary setup and returns 1 to indicate the device is set up * successfully to perform DDP on this I/O, otherwise this returns 0. * * int (*ndo_fcoe_get_hbainfo)(struct net_device *dev, * struct netdev_fcoe_hbainfo *hbainfo); * Called when the FCoE Protocol stack wants information on the underlying * device. This information is utilized by the FCoE protocol stack to * register attributes with Fiber Channel management service as per the * FC-GS Fabric Device Management Information(FDMI) specification. * * int (*ndo_fcoe_get_wwn)(struct net_device *dev, u64 *wwn, int type); * Called when the underlying device wants to override default World Wide * Name (WWN) generation mechanism in FCoE protocol stack to pass its own * World Wide Port Name (WWPN) or World Wide Node Name (WWNN) to the FCoE * protocol stack to use. * * RFS acceleration. * int (*ndo_rx_flow_steer)(struct net_device *dev, const struct sk_buff *skb, * u16 rxq_index, u32 flow_id); * Set hardware filter for RFS. rxq_index is the target queue index; * flow_id is a flow ID to be passed to rps_may_expire_flow() later. * Return the filter ID on success, or a negative error code. * * Slave management functions (for bridge, bonding, etc). * int (*ndo_add_slave)(struct net_device *dev, struct net_device *slave_dev); * Called to make another netdev an underling. * * int (*ndo_del_slave)(struct net_device *dev, struct net_device *slave_dev); * Called to release previously enslaved netdev. * * struct net_device *(*ndo_get_xmit_slave)(struct net_device *dev, * struct sk_buff *skb, * bool all_slaves); * Get the xmit slave of master device. If all_slaves is true, function * assume all the slaves can transmit. * * Feature/offload setting functions. * netdev_features_t (*ndo_fix_features)(struct net_device *dev, * netdev_features_t features); * Adjusts the requested feature flags according to device-specific * constraints, and returns the resulting flags. Must not modify * the device state. * * int (*ndo_set_features)(struct net_device *dev, netdev_features_t features); * Called to update device configuration to new features. Passed * feature set might be less than what was returned by ndo_fix_features()). * Must return >0 or -errno if it changed dev->features itself. * * int (*ndo_fdb_add)(struct ndmsg *ndm, struct nlattr *tb[], * struct net_device *dev, * const unsigned char *addr, u16 vid, u16 flags, * bool *notified, struct netlink_ext_ack *extack); * Adds an FDB entry to dev for addr. * Callee shall set *notified to true if it sent any appropriate * notification(s). Otherwise core will send a generic one. * int (*ndo_fdb_del)(struct ndmsg *ndm, struct nlattr *tb[], * struct net_device *dev, * const unsigned char *addr, u16 vid * bool *notified, struct netlink_ext_ack *extack); * Deletes the FDB entry from dev corresponding to addr. * Callee shall set *notified to true if it sent any appropriate * notification(s). Otherwise core will send a generic one. * int (*ndo_fdb_del_bulk)(struct nlmsghdr *nlh, struct net_device *dev, * struct netlink_ext_ack *extack); * int (*ndo_fdb_dump)(struct sk_buff *skb, struct netlink_callback *cb, * struct net_device *dev, struct net_device *filter_dev, * int *idx) * Used to add FDB entries to dump requests. Implementers should add * entries to skb and update idx with the number of entries. * * int (*ndo_mdb_add)(struct net_device *dev, struct nlattr *tb[], * u16 nlmsg_flags, struct netlink_ext_ack *extack); * Adds an MDB entry to dev. * int (*ndo_mdb_del)(struct net_device *dev, struct nlattr *tb[], * struct netlink_ext_ack *extack); * Deletes the MDB entry from dev. * int (*ndo_mdb_del_bulk)(struct net_device *dev, struct nlattr *tb[], * struct netlink_ext_ack *extack); * Bulk deletes MDB entries from dev. * int (*ndo_mdb_dump)(struct net_device *dev, struct sk_buff *skb, * struct netlink_callback *cb); * Dumps MDB entries from dev. The first argument (marker) in the netlink * callback is used by core rtnetlink code. * * int (*ndo_bridge_setlink)(struct net_device *dev, struct nlmsghdr *nlh, * u16 flags, struct netlink_ext_ack *extack) * int (*ndo_bridge_getlink)(struct sk_buff *skb, u32 pid, u32 seq, * struct net_device *dev, u32 filter_mask, * int nlflags) * int (*ndo_bridge_dellink)(struct net_device *dev, struct nlmsghdr *nlh, * u16 flags); * * int (*ndo_change_carrier)(struct net_device *dev, bool new_carrier); * Called to change device carrier. Soft-devices (like dummy, team, etc) * which do not represent real hardware may define this to allow their * userspace components to manage their virtual carrier state. Devices * that determine carrier state from physical hardware properties (eg * network cables) or protocol-dependent mechanisms (eg * USB_CDC_NOTIFY_NETWORK_CONNECTION) should NOT implement this function. * * int (*ndo_get_phys_port_id)(struct net_device *dev, * struct netdev_phys_item_id *ppid); * Called to get ID of physical port of this device. If driver does * not implement this, it is assumed that the hw is not able to have * multiple net devices on single physical port. * * int (*ndo_get_port_parent_id)(struct net_device *dev, * struct netdev_phys_item_id *ppid) * Called to get the parent ID of the physical port of this device. * * void* (*ndo_dfwd_add_station)(struct net_device *pdev, * struct net_device *dev) * Called by upper layer devices to accelerate switching or other * station functionality into hardware. 'pdev is the lowerdev * to use for the offload and 'dev' is the net device that will * back the offload. Returns a pointer to the private structure * the upper layer will maintain. * void (*ndo_dfwd_del_station)(struct net_device *pdev, void *priv) * Called by upper layer device to delete the station created * by 'ndo_dfwd_add_station'. 'pdev' is the net device backing * the station and priv is the structure returned by the add * operation. * int (*ndo_set_tx_maxrate)(struct net_device *dev, * int queue_index, u32 maxrate); * Called when a user wants to set a max-rate limitation of specific * TX queue. * int (*ndo_get_iflink)(const struct net_device *dev); * Called to get the iflink value of this device. * int (*ndo_fill_metadata_dst)(struct net_device *dev, struct sk_buff *skb); * This function is used to get egress tunnel information for given skb. * This is useful for retrieving outer tunnel header parameters while * sampling packet. * void (*ndo_set_rx_headroom)(struct net_device *dev, int needed_headroom); * This function is used to specify the headroom that the skb must * consider when allocation skb during packet reception. Setting * appropriate rx headroom value allows avoiding skb head copy on * forward. Setting a negative value resets the rx headroom to the * default value. * int (*ndo_bpf)(struct net_device *dev, struct netdev_bpf *bpf); * This function is used to set or query state related to XDP on the * netdevice and manage BPF offload. See definition of * enum bpf_netdev_command for details. * int (*ndo_xdp_xmit)(struct net_device *dev, int n, struct xdp_frame **xdp, * u32 flags); * This function is used to submit @n XDP packets for transmit on a * netdevice. Returns number of frames successfully transmitted, frames * that got dropped are freed/returned via xdp_return_frame(). * Returns negative number, means general error invoking ndo, meaning * no frames were xmit'ed and core-caller will free all frames. * struct net_device *(*ndo_xdp_get_xmit_slave)(struct net_device *dev, * struct xdp_buff *xdp); * Get the xmit slave of master device based on the xdp_buff. * int (*ndo_xsk_wakeup)(struct net_device *dev, u32 queue_id, u32 flags); * This function is used to wake up the softirq, ksoftirqd or kthread * responsible for sending and/or receiving packets on a specific * queue id bound to an AF_XDP socket. The flags field specifies if * only RX, only Tx, or both should be woken up using the flags * XDP_WAKEUP_RX and XDP_WAKEUP_TX. * int (*ndo_tunnel_ctl)(struct net_device *dev, struct ip_tunnel_parm_kern *p, * int cmd); * Add, change, delete or get information on an IPv4 tunnel. * struct net_device *(*ndo_get_peer_dev)(struct net_device *dev); * If a device is paired with a peer device, return the peer instance. * The caller must be under RCU read context. * int (*ndo_fill_forward_path)(struct net_device_path_ctx *ctx, struct net_device_path *path); * Get the forwarding path to reach the real device from the HW destination address * ktime_t (*ndo_get_tstamp)(struct net_device *dev, * const struct skb_shared_hwtstamps *hwtstamps, * bool cycles); * Get hardware timestamp based on normal/adjustable time or free running * cycle counter. This function is required if physical clock supports a * free running cycle counter. * * int (*ndo_hwtstamp_get)(struct net_device *dev, * struct kernel_hwtstamp_config *kernel_config); * Get the currently configured hardware timestamping parameters for the * NIC device. * * int (*ndo_hwtstamp_set)(struct net_device *dev, * struct kernel_hwtstamp_config *kernel_config, * struct netlink_ext_ack *extack); * Change the hardware timestamping parameters for NIC device. */ struct net_device_ops { int (*ndo_init)(struct net_device *dev); void (*ndo_uninit)(struct net_device *dev); int (*ndo_open)(struct net_device *dev); int (*ndo_stop)(struct net_device *dev); netdev_tx_t (*ndo_start_xmit)(struct sk_buff *skb, struct net_device *dev); netdev_features_t (*ndo_features_check)(struct sk_buff *skb, struct net_device *dev, netdev_features_t features); u16 (*ndo_select_queue)(struct net_device *dev, struct sk_buff *skb, struct net_device *sb_dev); void (*ndo_change_rx_flags)(struct net_device *dev, int flags); void (*ndo_set_rx_mode)(struct net_device *dev); int (*ndo_set_mac_address)(struct net_device *dev, void *addr); int (*ndo_validate_addr)(struct net_device *dev); int (*ndo_do_ioctl)(struct net_device *dev, struct ifreq *ifr, int cmd); int (*ndo_eth_ioctl)(struct net_device *dev, struct ifreq *ifr, int cmd); int (*ndo_siocbond)(struct net_device *dev, struct ifreq *ifr, int cmd); int (*ndo_siocwandev)(struct net_device *dev, struct if_settings *ifs); int (*ndo_siocdevprivate)(struct net_device *dev, struct ifreq *ifr, void __user *data, int cmd); int (*ndo_set_config)(struct net_device *dev, struct ifmap *map); int (*ndo_change_mtu)(struct net_device *dev, int new_mtu); int (*ndo_neigh_setup)(struct net_device *dev, struct neigh_parms *); void (*ndo_tx_timeout) (struct net_device *dev, unsigned int txqueue); void (*ndo_get_stats64)(struct net_device *dev, struct rtnl_link_stats64 *storage); bool (*ndo_has_offload_stats)(const struct net_device *dev, int attr_id); int (*ndo_get_offload_stats)(int attr_id, const struct net_device *dev, void *attr_data); struct net_device_stats* (*ndo_get_stats)(struct net_device *dev); int (*ndo_vlan_rx_add_vid)(struct net_device *dev, __be16 proto, u16 vid); int (*ndo_vlan_rx_kill_vid)(struct net_device *dev, __be16 proto, u16 vid); #ifdef CONFIG_NET_POLL_CONTROLLER void (*ndo_poll_controller)(struct net_device *dev); int (*ndo_netpoll_setup)(struct net_device *dev); void (*ndo_netpoll_cleanup)(struct net_device *dev); #endif int (*ndo_set_vf_mac)(struct net_device *dev, int queue, u8 *mac); int (*ndo_set_vf_vlan)(struct net_device *dev, int queue, u16 vlan, u8 qos, __be16 proto); int (*ndo_set_vf_rate)(struct net_device *dev, int vf, int min_tx_rate, int max_tx_rate); int (*ndo_set_vf_spoofchk)(struct net_device *dev, int vf, bool setting); int (*ndo_set_vf_trust)(struct net_device *dev, int vf, bool setting); int (*ndo_get_vf_config)(struct net_device *dev, int vf, struct ifla_vf_info *ivf); int (*ndo_set_vf_link_state)(struct net_device *dev, int vf, int link_state); int (*ndo_get_vf_stats)(struct net_device *dev, int vf, struct ifla_vf_stats *vf_stats); int (*ndo_set_vf_port)(struct net_device *dev, int vf, struct nlattr *port[]); int (*ndo_get_vf_port)(struct net_device *dev, int vf, struct sk_buff *skb); int (*ndo_get_vf_guid)(struct net_device *dev, int vf, struct ifla_vf_guid *node_guid, struct ifla_vf_guid *port_guid); int (*ndo_set_vf_guid)(struct net_device *dev, int vf, u64 guid, int guid_type); int (*ndo_set_vf_rss_query_en)( struct net_device *dev, int vf, bool setting); int (*ndo_setup_tc)(struct net_device *dev, enum tc_setup_type type, void *type_data); #if IS_ENABLED(CONFIG_FCOE) int (*ndo_fcoe_enable)(struct net_device *dev); int (*ndo_fcoe_disable)(struct net_device *dev); int (*ndo_fcoe_ddp_setup)(struct net_device *dev, u16 xid, struct scatterlist *sgl, unsigned int sgc); int (*ndo_fcoe_ddp_done)(struct net_device *dev, u16 xid); int (*ndo_fcoe_ddp_target)(struct net_device *dev, u16 xid, struct scatterlist *sgl, unsigned int sgc); int (*ndo_fcoe_get_hbainfo)(struct net_device *dev, struct netdev_fcoe_hbainfo *hbainfo); #endif #if IS_ENABLED(CONFIG_LIBFCOE) #define NETDEV_FCOE_WWNN 0 #define NETDEV_FCOE_WWPN 1 int (*ndo_fcoe_get_wwn)(struct net_device *dev, u64 *wwn, int type); #endif #ifdef CONFIG_RFS_ACCEL int (*ndo_rx_flow_steer)(struct net_device *dev, const struct sk_buff *skb, u16 rxq_index, u32 flow_id); #endif int (*ndo_add_slave)(struct net_device *dev, struct net_device *slave_dev, struct netlink_ext_ack *extack); int (*ndo_del_slave)(struct net_device *dev, struct net_device *slave_dev); struct net_device* (*ndo_get_xmit_slave)(struct net_device *dev, struct sk_buff *skb, bool all_slaves); struct net_device* (*ndo_sk_get_lower_dev)(struct net_device *dev, struct sock *sk); netdev_features_t (*ndo_fix_features)(struct net_device *dev, netdev_features_t features); int (*ndo_set_features)(struct net_device *dev, netdev_features_t features); int (*ndo_neigh_construct)(struct net_device *dev, struct neighbour *n); void (*ndo_neigh_destroy)(struct net_device *dev, struct neighbour *n); int (*ndo_fdb_add)(struct ndmsg *ndm, struct nlattr *tb[], struct net_device *dev, const unsigned char *addr, u16 vid, u16 flags, bool *notified, struct netlink_ext_ack *extack); int (*ndo_fdb_del)(struct ndmsg *ndm, struct nlattr *tb[], struct net_device *dev, const unsigned char *addr, u16 vid, bool *notified, struct netlink_ext_ack *extack); int (*ndo_fdb_del_bulk)(struct nlmsghdr *nlh, struct net_device *dev, struct netlink_ext_ack *extack); int (*ndo_fdb_dump)(struct sk_buff *skb, struct netlink_callback *cb, struct net_device *dev, struct net_device *filter_dev, int *idx); int (*ndo_fdb_get)(struct sk_buff *skb, struct nlattr *tb[], struct net_device *dev, const unsigned char *addr, u16 vid, u32 portid, u32 seq, struct netlink_ext_ack *extack); int (*ndo_mdb_add)(struct net_device *dev, struct nlattr *tb[], u16 nlmsg_flags, struct netlink_ext_ack *extack); int (*ndo_mdb_del)(struct net_device *dev, struct nlattr *tb[], struct netlink_ext_ack *extack); int (*ndo_mdb_del_bulk)(struct net_device *dev, struct nlattr *tb[], struct netlink_ext_ack *extack); int (*ndo_mdb_dump)(struct net_device *dev, struct sk_buff *skb, struct netlink_callback *cb); int (*ndo_mdb_get)(struct net_device *dev, struct nlattr *tb[], u32 portid, u32 seq, struct netlink_ext_ack *extack); int (*ndo_bridge_setlink)(struct net_device *dev, struct nlmsghdr *nlh, u16 flags, struct netlink_ext_ack *extack); int (*ndo_bridge_getlink)(struct sk_buff *skb, u32 pid, u32 seq, struct net_device *dev, u32 filter_mask, int nlflags); int (*ndo_bridge_dellink)(struct net_device *dev, struct nlmsghdr *nlh, u16 flags); int (*ndo_change_carrier)(struct net_device *dev, bool new_carrier); int (*ndo_get_phys_port_id)(struct net_device *dev, struct netdev_phys_item_id *ppid); int (*ndo_get_port_parent_id)(struct net_device *dev, struct netdev_phys_item_id *ppid); int (*ndo_get_phys_port_name)(struct net_device *dev, char *name, size_t len); void* (*ndo_dfwd_add_station)(struct net_device *pdev, struct net_device *dev); void (*ndo_dfwd_del_station)(struct net_device *pdev, void *priv); int (*ndo_set_tx_maxrate)(struct net_device *dev, int queue_index, u32 maxrate); int (*ndo_get_iflink)(const struct net_device *dev); int (*ndo_fill_metadata_dst)(struct net_device *dev, struct sk_buff *skb); void (*ndo_set_rx_headroom)(struct net_device *dev, int needed_headroom); int (*ndo_bpf)(struct net_device *dev, struct netdev_bpf *bpf); int (*ndo_xdp_xmit)(struct net_device *dev, int n, struct xdp_frame **xdp, u32 flags); struct net_device * (*ndo_xdp_get_xmit_slave)(struct net_device *dev, struct xdp_buff *xdp); int (*ndo_xsk_wakeup)(struct net_device *dev, u32 queue_id, u32 flags); int (*ndo_tunnel_ctl)(struct net_device *dev, struct ip_tunnel_parm_kern *p, int cmd); struct net_device * (*ndo_get_peer_dev)(struct net_device *dev); int (*ndo_fill_forward_path)(struct net_device_path_ctx *ctx, struct net_device_path *path); ktime_t (*ndo_get_tstamp)(struct net_device *dev, const struct skb_shared_hwtstamps *hwtstamps, bool cycles); int (*ndo_hwtstamp_get)(struct net_device *dev, struct kernel_hwtstamp_config *kernel_config); int (*ndo_hwtstamp_set)(struct net_device *dev, struct kernel_hwtstamp_config *kernel_config, struct netlink_ext_ack *extack); #if IS_ENABLED(CONFIG_NET_SHAPER) /** * @net_shaper_ops: Device shaping offload operations * see include/net/net_shapers.h */ const struct net_shaper_ops *net_shaper_ops; #endif }; /** * enum netdev_priv_flags - &struct net_device priv_flags * * These are the &struct net_device, they are only set internally * by drivers and used in the kernel. These flags are invisible to * userspace; this means that the order of these flags can change * during any kernel release. * * You should add bitfield booleans after either net_device::priv_flags * (hotpath) or ::threaded (slowpath) instead of extending these flags. * * @IFF_802_1Q_VLAN: 802.1Q VLAN device * @IFF_EBRIDGE: Ethernet bridging device * @IFF_BONDING: bonding master or slave * @IFF_ISATAP: ISATAP interface (RFC4214) * @IFF_WAN_HDLC: WAN HDLC device * @IFF_XMIT_DST_RELEASE: dev_hard_start_xmit() is allowed to * release skb->dst * @IFF_DONT_BRIDGE: disallow bridging this ether dev * @IFF_DISABLE_NETPOLL: disable netpoll at run-time * @IFF_MACVLAN_PORT: device used as macvlan port * @IFF_BRIDGE_PORT: device used as bridge port * @IFF_OVS_DATAPATH: device used as Open vSwitch datapath port * @IFF_TX_SKB_SHARING: The interface supports sharing skbs on transmit * @IFF_UNICAST_FLT: Supports unicast filtering * @IFF_TEAM_PORT: device used as team port * @IFF_SUPP_NOFCS: device supports sending custom FCS * @IFF_LIVE_ADDR_CHANGE: device supports hardware address * change when it's running * @IFF_MACVLAN: Macvlan device * @IFF_XMIT_DST_RELEASE_PERM: IFF_XMIT_DST_RELEASE not taking into account * underlying stacked devices * @IFF_L3MDEV_MASTER: device is an L3 master device * @IFF_NO_QUEUE: device can run without qdisc attached * @IFF_OPENVSWITCH: device is a Open vSwitch master * @IFF_L3MDEV_SLAVE: device is enslaved to an L3 master device * @IFF_TEAM: device is a team device * @IFF_RXFH_CONFIGURED: device has had Rx Flow indirection table configured * @IFF_PHONY_HEADROOM: the headroom value is controlled by an external * entity (i.e. the master device for bridged veth) * @IFF_MACSEC: device is a MACsec device * @IFF_NO_RX_HANDLER: device doesn't support the rx_handler hook * @IFF_FAILOVER: device is a failover master device * @IFF_FAILOVER_SLAVE: device is lower dev of a failover master device * @IFF_L3MDEV_RX_HANDLER: only invoke the rx handler of L3 master device * @IFF_NO_ADDRCONF: prevent ipv6 addrconf * @IFF_TX_SKB_NO_LINEAR: device/driver is capable of xmitting frames with * skb_headlen(skb) == 0 (data starts from frag0) */ enum netdev_priv_flags { IFF_802_1Q_VLAN = 1<<0, IFF_EBRIDGE = 1<<1, IFF_BONDING = 1<<2, IFF_ISATAP = 1<<3, IFF_WAN_HDLC = 1<<4, IFF_XMIT_DST_RELEASE = 1<<5, IFF_DONT_BRIDGE = 1<<6, IFF_DISABLE_NETPOLL = 1<<7, IFF_MACVLAN_PORT = 1<<8, IFF_BRIDGE_PORT = 1<<9, IFF_OVS_DATAPATH = 1<<10, IFF_TX_SKB_SHARING = 1<<11, IFF_UNICAST_FLT = 1<<12, IFF_TEAM_PORT = 1<<13, IFF_SUPP_NOFCS = 1<<14, IFF_LIVE_ADDR_CHANGE = 1<<15, IFF_MACVLAN = 1<<16, IFF_XMIT_DST_RELEASE_PERM = 1<<17, IFF_L3MDEV_MASTER = 1<<18, IFF_NO_QUEUE = 1<<19, IFF_OPENVSWITCH = 1<<20, IFF_L3MDEV_SLAVE = 1<<21, IFF_TEAM = 1<<22, IFF_RXFH_CONFIGURED = 1<<23, IFF_PHONY_HEADROOM = 1<<24, IFF_MACSEC = 1<<25, IFF_NO_RX_HANDLER = 1<<26, IFF_FAILOVER = 1<<27, IFF_FAILOVER_SLAVE = 1<<28, IFF_L3MDEV_RX_HANDLER = 1<<29, IFF_NO_ADDRCONF = BIT_ULL(30), IFF_TX_SKB_NO_LINEAR = BIT_ULL(31), }; /* Specifies the type of the struct net_device::ml_priv pointer */ enum netdev_ml_priv_type { ML_PRIV_NONE, ML_PRIV_CAN, }; enum netdev_stat_type { NETDEV_PCPU_STAT_NONE, NETDEV_PCPU_STAT_LSTATS, /* struct pcpu_lstats */ NETDEV_PCPU_STAT_TSTATS, /* struct pcpu_sw_netstats */ NETDEV_PCPU_STAT_DSTATS, /* struct pcpu_dstats */ }; enum netdev_reg_state { NETREG_UNINITIALIZED = 0, NETREG_REGISTERED, /* completed register_netdevice */ NETREG_UNREGISTERING, /* called unregister_netdevice */ NETREG_UNREGISTERED, /* completed unregister todo */ NETREG_RELEASED, /* called free_netdev */ NETREG_DUMMY, /* dummy device for NAPI poll */ }; /** * struct net_device - The DEVICE structure. * * Actually, this whole structure is a big mistake. It mixes I/O * data with strictly "high-level" data, and it has to know about * almost every data structure used in the INET module. * * @priv_flags: flags invisible to userspace defined as bits, see * enum netdev_priv_flags for the definitions * @lltx: device supports lockless Tx. Deprecated for real HW * drivers. Mainly used by logical interfaces, such as * bonding and tunnels * @netmem_tx: device support netmem_tx. * * @name: This is the first field of the "visible" part of this structure * (i.e. as seen by users in the "Space.c" file). It is the name * of the interface. * * @name_node: Name hashlist node * @ifalias: SNMP alias * @mem_end: Shared memory end * @mem_start: Shared memory start * @base_addr: Device I/O address * @irq: Device IRQ number * * @state: Generic network queuing layer state, see netdev_state_t * @dev_list: The global list of network devices * @napi_list: List entry used for polling NAPI devices * @unreg_list: List entry when we are unregistering the * device; see the function unregister_netdev * @close_list: List entry used when we are closing the device * @ptype_all: Device-specific packet handlers for all protocols * @ptype_specific: Device-specific, protocol-specific packet handlers * * @adj_list: Directly linked devices, like slaves for bonding * @features: Currently active device features * @hw_features: User-changeable features * * @wanted_features: User-requested features * @vlan_features: Mask of features inheritable by VLAN devices * * @hw_enc_features: Mask of features inherited by encapsulating devices * This field indicates what encapsulation * offloads the hardware is capable of doing, * and drivers will need to set them appropriately. * * @mpls_features: Mask of features inheritable by MPLS * @gso_partial_features: value(s) from NETIF_F_GSO\* * * @ifindex: interface index * @group: The group the device belongs to * * @stats: Statistics struct, which was left as a legacy, use * rtnl_link_stats64 instead * * @core_stats: core networking counters, * do not use this in drivers * @carrier_up_count: Number of times the carrier has been up * @carrier_down_count: Number of times the carrier has been down * * @wireless_handlers: List of functions to handle Wireless Extensions, * instead of ioctl, * see <net/iw_handler.h> for details. * * @netdev_ops: Includes several pointers to callbacks, * if one wants to override the ndo_*() functions * @xdp_metadata_ops: Includes pointers to XDP metadata callbacks. * @xsk_tx_metadata_ops: Includes pointers to AF_XDP TX metadata callbacks. * @ethtool_ops: Management operations * @l3mdev_ops: Layer 3 master device operations * @ndisc_ops: Includes callbacks for different IPv6 neighbour * discovery handling. Necessary for e.g. 6LoWPAN. * @xfrmdev_ops: Transformation offload operations * @tlsdev_ops: Transport Layer Security offload operations * @header_ops: Includes callbacks for creating,parsing,caching,etc * of Layer 2 headers. * * @flags: Interface flags (a la BSD) * @xdp_features: XDP capability supported by the device * @gflags: Global flags ( kept as legacy ) * @priv_len: Size of the ->priv flexible array * @priv: Flexible array containing private data * @operstate: RFC2863 operstate * @link_mode: Mapping policy to operstate * @if_port: Selectable AUI, TP, ... * @dma: DMA channel * @mtu: Interface MTU value * @min_mtu: Interface Minimum MTU value * @max_mtu: Interface Maximum MTU value * @type: Interface hardware type * @hard_header_len: Maximum hardware header length. * @min_header_len: Minimum hardware header length * * @needed_headroom: Extra headroom the hardware may need, but not in all * cases can this be guaranteed * @needed_tailroom: Extra tailroom the hardware may need, but not in all * cases can this be guaranteed. Some cases also use * LL_MAX_HEADER instead to allocate the skb * * interface address info: * * @perm_addr: Permanent hw address * @addr_assign_type: Hw address assignment type * @addr_len: Hardware address length * @upper_level: Maximum depth level of upper devices. * @lower_level: Maximum depth level of lower devices. * @threaded: napi threaded state. * @neigh_priv_len: Used in neigh_alloc() * @dev_id: Used to differentiate devices that share * the same link layer address * @dev_port: Used to differentiate devices that share * the same function * @addr_list_lock: XXX: need comments on this one * @name_assign_type: network interface name assignment type * @uc_promisc: Counter that indicates promiscuous mode * has been enabled due to the need to listen to * additional unicast addresses in a device that * does not implement ndo_set_rx_mode() * @uc: unicast mac addresses * @mc: multicast mac addresses * @dev_addrs: list of device hw addresses * @queues_kset: Group of all Kobjects in the Tx and RX queues * @promiscuity: Number of times the NIC is told to work in * promiscuous mode; if it becomes 0 the NIC will * exit promiscuous mode * @allmulti: Counter, enables or disables allmulticast mode * * @vlan_info: VLAN info * @dsa_ptr: dsa specific data * @tipc_ptr: TIPC specific data * @atalk_ptr: AppleTalk link * @ip_ptr: IPv4 specific data * @ip6_ptr: IPv6 specific data * @ax25_ptr: AX.25 specific data * @ieee80211_ptr: IEEE 802.11 specific data, assign before registering * @ieee802154_ptr: IEEE 802.15.4 low-rate Wireless Personal Area Network * device struct * @mpls_ptr: mpls_dev struct pointer * @mctp_ptr: MCTP specific data * @psp_dev: PSP crypto device registered for this netdev * * @dev_addr: Hw address (before bcast, * because most packets are unicast) * * @_rx: Array of RX queues * @num_rx_queues: Number of RX queues * allocated at register_netdev() time * @real_num_rx_queues: Number of RX queues currently active in device * @xdp_prog: XDP sockets filter program pointer * * @rx_handler: handler for received packets * @rx_handler_data: XXX: need comments on this one * @tcx_ingress: BPF & clsact qdisc specific data for ingress processing * @ingress_queue: XXX: need comments on this one * @nf_hooks_ingress: netfilter hooks executed for ingress packets * @broadcast: hw bcast address * * @rx_cpu_rmap: CPU reverse-mapping for RX completion interrupts, * indexed by RX queue number. Assigned by driver. * This must only be set if the ndo_rx_flow_steer * operation is defined * @index_hlist: Device index hash chain * * @_tx: Array of TX queues * @num_tx_queues: Number of TX queues allocated at alloc_netdev_mq() time * @real_num_tx_queues: Number of TX queues currently active in device * @qdisc: Root qdisc from userspace point of view * @tx_queue_len: Max frames per queue allowed * @tx_global_lock: XXX: need comments on this one * @xdp_bulkq: XDP device bulk queue * @xps_maps: all CPUs/RXQs maps for XPS device * * @xps_maps: XXX: need comments on this one * @tcx_egress: BPF & clsact qdisc specific data for egress processing * @nf_hooks_egress: netfilter hooks executed for egress packets * @qdisc_hash: qdisc hash table * @watchdog_timeo: Represents the timeout that is used by * the watchdog (see dev_watchdog()) * @watchdog_timer: List of timers * * @proto_down_reason: reason a netdev interface is held down * @pcpu_refcnt: Number of references to this device * @dev_refcnt: Number of references to this device * @refcnt_tracker: Tracker directory for tracked references to this device * @todo_list: Delayed register/unregister * @link_watch_list: XXX: need comments on this one * * @reg_state: Register/unregister state machine * @dismantle: Device is going to be freed * @needs_free_netdev: Should unregister perform free_netdev? * @priv_destructor: Called from unregister * @npinfo: XXX: need comments on this one * @nd_net: Network namespace this network device is inside * protected by @lock * * @ml_priv: Mid-layer private * @ml_priv_type: Mid-layer private type * * @pcpu_stat_type: Type of device statistics which the core should * allocate/free: none, lstats, tstats, dstats. none * means the driver is handling statistics allocation/ * freeing internally. * @lstats: Loopback statistics: packets, bytes * @tstats: Tunnel statistics: RX/TX packets, RX/TX bytes * @dstats: Dummy statistics: RX/TX/drop packets, RX/TX bytes * * @garp_port: GARP * @mrp_port: MRP * * @dm_private: Drop monitor private * * @dev: Class/net/name entry * @sysfs_groups: Space for optional device, statistics and wireless * sysfs groups * * @sysfs_rx_queue_group: Space for optional per-rx queue attributes * @rtnl_link_ops: Rtnl_link_ops * @stat_ops: Optional ops for queue-aware statistics * @queue_mgmt_ops: Optional ops for queue management * * @gso_max_size: Maximum size of generic segmentation offload * @tso_max_size: Device (as in HW) limit on the max TSO request size * @gso_max_segs: Maximum number of segments that can be passed to the * NIC for GSO * @tso_max_segs: Device (as in HW) limit on the max TSO segment count * @gso_ipv4_max_size: Maximum size of generic segmentation offload, * for IPv4. * * @dcbnl_ops: Data Center Bridging netlink ops * @num_tc: Number of traffic classes in the net device * @tc_to_txq: XXX: need comments on this one * @prio_tc_map: XXX: need comments on this one * * @fcoe_ddp_xid: Max exchange id for FCoE LRO by ddp * * @priomap: XXX: need comments on this one * @link_topo: Physical link topology tracking attached PHYs * @phydev: Physical device may attach itself * for hardware timestamping * @sfp_bus: attached &struct sfp_bus structure. * * @qdisc_tx_busylock: lockdep class annotating Qdisc->busylock spinlock * * @proto_down: protocol port state information can be sent to the * switch driver and used to set the phys state of the * switch port. * * @irq_affinity_auto: driver wants the core to store and re-assign the IRQ * affinity. Set by netif_enable_irq_affinity(), then * the driver must create a persistent napi by * netif_napi_add_config() and finally bind the napi to * IRQ (via netif_napi_set_irq()). * * @rx_cpu_rmap_auto: driver wants the core to manage the ARFS rmap. * Set by calling netif_enable_cpu_rmap(). * * @see_all_hwtstamp_requests: device wants to see calls to * ndo_hwtstamp_set() for all timestamp requests * regardless of source, even if those aren't * HWTSTAMP_SOURCE_NETDEV * @change_proto_down: device supports setting carrier via IFLA_PROTO_DOWN * @netns_immutable: interface can't change network namespaces * @fcoe_mtu: device supports maximum FCoE MTU, 2158 bytes * * @net_notifier_list: List of per-net netdev notifier block * that follow this device when it is moved * to another network namespace. * * @macsec_ops: MACsec offloading ops * * @udp_tunnel_nic_info: static structure describing the UDP tunnel * offload capabilities of the device * @udp_tunnel_nic: UDP tunnel offload state * @ethtool: ethtool related state * @xdp_state: stores info on attached XDP BPF programs * * @nested_level: Used as a parameter of spin_lock_nested() of * dev->addr_list_lock. * @unlink_list: As netif_addr_lock() can be called recursively, * keep a list of interfaces to be deleted. * @gro_max_size: Maximum size of aggregated packet in generic * receive offload (GRO) * @gro_ipv4_max_size: Maximum size of aggregated packet in generic * receive offload (GRO), for IPv4. * @xdp_zc_max_segs: Maximum number of segments supported by AF_XDP * zero copy driver * * @dev_addr_shadow: Copy of @dev_addr to catch direct writes. * @linkwatch_dev_tracker: refcount tracker used by linkwatch. * @watchdog_dev_tracker: refcount tracker used by watchdog. * @dev_registered_tracker: tracker for reference held while * registered * @offload_xstats_l3: L3 HW stats for this netdevice. * * @devlink_port: Pointer to related devlink port structure. * Assigned by a driver before netdev registration using * SET_NETDEV_DEVLINK_PORT macro. This pointer is static * during the time netdevice is registered. * * @dpll_pin: Pointer to the SyncE source pin of a DPLL subsystem, * where the clock is recovered. * * @max_pacing_offload_horizon: max EDT offload horizon in nsec. * @napi_config: An array of napi_config structures containing per-NAPI * settings. * @num_napi_configs: number of allocated NAPI config structs, * always >= max(num_rx_queues, num_tx_queues). * @gro_flush_timeout: timeout for GRO layer in NAPI * @napi_defer_hard_irqs: If not zero, provides a counter that would * allow to avoid NIC hard IRQ, on busy queues. * * @neighbours: List heads pointing to this device's neighbours' * dev_list, one per address-family. * @hwprov: Tracks which PTP performs hardware packet time stamping. * * FIXME: cleanup struct net_device such that network protocol info * moves out. */ struct net_device { /* Cacheline organization can be found documented in * Documentation/networking/net_cachelines/net_device.rst. * Please update the document when adding new fields. */ /* TX read-mostly hotpath */ __cacheline_group_begin(net_device_read_tx); struct_group(priv_flags_fast, unsigned long priv_flags:32; unsigned long lltx:1; unsigned long netmem_tx:1; ); const struct net_device_ops *netdev_ops; const struct header_ops *header_ops; struct netdev_queue *_tx; netdev_features_t gso_partial_features; unsigned int real_num_tx_queues; unsigned int gso_max_size; unsigned int gso_ipv4_max_size; u16 gso_max_segs; s16 num_tc; /* Note : dev->mtu is often read without holding a lock. * Writers usually hold RTNL. * It is recommended to use READ_ONCE() to annotate the reads, * and to use WRITE_ONCE() to annotate the writes. */ unsigned int mtu; unsigned short needed_headroom; struct netdev_tc_txq tc_to_txq[TC_MAX_QUEUE]; #ifdef CONFIG_XPS struct xps_dev_maps __rcu *xps_maps[XPS_MAPS_MAX]; #endif #ifdef CONFIG_NETFILTER_EGRESS struct nf_hook_entries __rcu *nf_hooks_egress; #endif #ifdef CONFIG_NET_XGRESS struct bpf_mprog_entry __rcu *tcx_egress; #endif __cacheline_group_end(net_device_read_tx); /* TXRX read-mostly hotpath */ __cacheline_group_begin(net_device_read_txrx); union { struct pcpu_lstats __percpu *lstats; struct pcpu_sw_netstats __percpu *tstats; struct pcpu_dstats __percpu *dstats; }; unsigned long state; unsigned int flags; unsigned short hard_header_len; netdev_features_t features; struct inet6_dev __rcu *ip6_ptr; __cacheline_group_end(net_device_read_txrx); /* RX read-mostly hotpath */ __cacheline_group_begin(net_device_read_rx); struct bpf_prog __rcu *xdp_prog; struct list_head ptype_specific; int ifindex; unsigned int real_num_rx_queues; struct netdev_rx_queue *_rx; unsigned int gro_max_size; unsigned int gro_ipv4_max_size; rx_handler_func_t __rcu *rx_handler; void __rcu *rx_handler_data; possible_net_t nd_net; #ifdef CONFIG_NETPOLL struct netpoll_info __rcu *npinfo; #endif #ifdef CONFIG_NET_XGRESS struct bpf_mprog_entry __rcu *tcx_ingress; #endif __cacheline_group_end(net_device_read_rx); char name[IFNAMSIZ]; struct netdev_name_node *name_node; struct dev_ifalias __rcu *ifalias; /* * I/O specific fields * FIXME: Merge these and struct ifmap into one */ unsigned long mem_end; unsigned long mem_start; unsigned long base_addr; /* * Some hardware also needs these fields (state,dev_list, * napi_list,unreg_list,close_list) but they are not * part of the usual set specified in Space.c. */ struct list_head dev_list; struct list_head napi_list; struct list_head unreg_list; struct list_head close_list; struct list_head ptype_all; struct { struct list_head upper; struct list_head lower; } adj_list; /* Read-mostly cache-line for fast-path access */ xdp_features_t xdp_features; const struct xdp_metadata_ops *xdp_metadata_ops; const struct xsk_tx_metadata_ops *xsk_tx_metadata_ops; unsigned short gflags; unsigned short needed_tailroom; netdev_features_t hw_features; netdev_features_t wanted_features; netdev_features_t vlan_features; netdev_features_t hw_enc_features; netdev_features_t mpls_features; unsigned int min_mtu; unsigned int max_mtu; unsigned short type; unsigned char min_header_len; unsigned char name_assign_type; int group; struct net_device_stats stats; /* not used by modern drivers */ struct net_device_core_stats __percpu *core_stats; /* Stats to monitor link on/off, flapping */ atomic_t carrier_up_count; atomic_t carrier_down_count; #ifdef CONFIG_WIRELESS_EXT const struct iw_handler_def *wireless_handlers; #endif const struct ethtool_ops *ethtool_ops; #ifdef CONFIG_NET_L3_MASTER_DEV const struct l3mdev_ops *l3mdev_ops; #endif #if IS_ENABLED(CONFIG_IPV6) const struct ndisc_ops *ndisc_ops; #endif #ifdef CONFIG_XFRM_OFFLOAD const struct xfrmdev_ops *xfrmdev_ops; #endif #if IS_ENABLED(CONFIG_TLS_DEVICE) const struct tlsdev_ops *tlsdev_ops; #endif #ifdef CONFIG_NET_DEV_REFCNT_TRACKER struct list_head netdev_trace_buffer_list; #endif unsigned int operstate; unsigned char link_mode; unsigned char if_port; unsigned char dma; /* Interface address info. */ unsigned char perm_addr[MAX_ADDR_LEN]; unsigned char addr_assign_type; unsigned char addr_len; unsigned char upper_level; unsigned char lower_level; u8 threaded; unsigned short neigh_priv_len; unsigned short dev_id; unsigned short dev_port; int irq; u32 priv_len; spinlock_t addr_list_lock; struct netdev_hw_addr_list uc; struct netdev_hw_addr_list mc; struct netdev_hw_addr_list dev_addrs; #ifdef CONFIG_SYSFS struct kset *queues_kset; #endif #ifdef CONFIG_LOCKDEP struct list_head unlink_list; #endif unsigned int promiscuity; unsigned int allmulti; bool uc_promisc; #ifdef CONFIG_LOCKDEP unsigned char nested_level; #endif /* Protocol-specific pointers */ struct in_device __rcu *ip_ptr; /** @fib_nh_head: nexthops associated with this netdev */ struct hlist_head fib_nh_head; #if IS_ENABLED(CONFIG_VLAN_8021Q) struct vlan_info __rcu *vlan_info; #endif #if IS_ENABLED(CONFIG_NET_DSA) struct dsa_port *dsa_ptr; #endif #if IS_ENABLED(CONFIG_TIPC) struct tipc_bearer __rcu *tipc_ptr; #endif #if IS_ENABLED(CONFIG_ATALK) void *atalk_ptr; #endif #if IS_ENABLED(CONFIG_AX25) struct ax25_dev __rcu *ax25_ptr; #endif #if IS_ENABLED(CONFIG_CFG80211) struct wireless_dev *ieee80211_ptr; #endif #if IS_ENABLED(CONFIG_IEEE802154) || IS_ENABLED(CONFIG_6LOWPAN) struct wpan_dev *ieee802154_ptr; #endif #if IS_ENABLED(CONFIG_MPLS_ROUTING) struct mpls_dev __rcu *mpls_ptr; #endif #if IS_ENABLED(CONFIG_MCTP) struct mctp_dev __rcu *mctp_ptr; #endif #if IS_ENABLED(CONFIG_INET_PSP) struct psp_dev __rcu *psp_dev; #endif /* * Cache lines mostly used on receive path (including eth_type_trans()) */ /* Interface address info used in eth_type_trans() */ const unsigned char *dev_addr; unsigned int num_rx_queues; #define GRO_LEGACY_MAX_SIZE 65536u /* TCP minimal MSS is 8 (TCP_MIN_GSO_SIZE), * and shinfo->gso_segs is a 16bit field. */ #define GRO_MAX_SIZE (8 * 65535u) unsigned int xdp_zc_max_segs; struct netdev_queue __rcu *ingress_queue; #ifdef CONFIG_NETFILTER_INGRESS struct nf_hook_entries __rcu *nf_hooks_ingress; #endif unsigned char broadcast[MAX_ADDR_LEN]; #ifdef CONFIG_RFS_ACCEL struct cpu_rmap *rx_cpu_rmap; #endif struct hlist_node index_hlist; /* * Cache lines mostly used on transmit path */ unsigned int num_tx_queues; struct Qdisc __rcu *qdisc; unsigned int tx_queue_len; spinlock_t tx_global_lock; struct xdp_dev_bulk_queue __percpu *xdp_bulkq; #ifdef CONFIG_NET_SCHED DECLARE_HASHTABLE (qdisc_hash, 4); #endif /* These may be needed for future network-power-down code. */ struct timer_list watchdog_timer; int watchdog_timeo; u32 proto_down_reason; struct list_head todo_list; #ifdef CONFIG_PCPU_DEV_REFCNT int __percpu *pcpu_refcnt; #else refcount_t dev_refcnt; #endif struct ref_tracker_dir refcnt_tracker; struct list_head link_watch_list; u8 reg_state; bool dismantle; /** @moving_ns: device is changing netns, protected by @lock */ bool moving_ns; /** @rtnl_link_initializing: Device being created, suppress events */ bool rtnl_link_initializing; bool needs_free_netdev; void (*priv_destructor)(struct net_device *dev); /* mid-layer private */ void *ml_priv; enum netdev_ml_priv_type ml_priv_type; enum netdev_stat_type pcpu_stat_type:8; #if IS_ENABLED(CONFIG_GARP) struct garp_port __rcu *garp_port; #endif #if IS_ENABLED(CONFIG_MRP) struct mrp_port __rcu *mrp_port; #endif #if IS_ENABLED(CONFIG_NET_DROP_MONITOR) struct dm_hw_stat_delta __rcu *dm_private; #endif struct device dev; const struct attribute_group *sysfs_groups[5]; const struct attribute_group *sysfs_rx_queue_group; const struct rtnl_link_ops *rtnl_link_ops; const struct netdev_stat_ops *stat_ops; const struct netdev_queue_mgmt_ops *queue_mgmt_ops; /* for setting kernel sock attribute on TCP connection setup */ #define GSO_MAX_SEGS 65535u #define GSO_LEGACY_MAX_SIZE 65536u /* TCP minimal MSS is 8 (TCP_MIN_GSO_SIZE), * and shinfo->gso_segs is a 16bit field. */ #define GSO_MAX_SIZE (8 * GSO_MAX_SEGS) #define TSO_LEGACY_MAX_SIZE 65536 #define TSO_MAX_SIZE UINT_MAX unsigned int tso_max_size; #define TSO_MAX_SEGS U16_MAX u16 tso_max_segs; #ifdef CONFIG_DCB const struct dcbnl_rtnl_ops *dcbnl_ops; #endif u8 prio_tc_map[TC_BITMASK + 1]; #if IS_ENABLED(CONFIG_FCOE) unsigned int fcoe_ddp_xid; #endif #if IS_ENABLED(CONFIG_CGROUP_NET_PRIO) struct netprio_map __rcu *priomap; #endif struct phy_link_topology *link_topo; struct phy_device *phydev; struct sfp_bus *sfp_bus; struct lock_class_key *qdisc_tx_busylock; bool proto_down; bool irq_affinity_auto; bool rx_cpu_rmap_auto; /* priv_flags_slow, ungrouped to save space */ unsigned long see_all_hwtstamp_requests:1; unsigned long change_proto_down:1; unsigned long netns_immutable:1; unsigned long fcoe_mtu:1; struct list_head net_notifier_list; #if IS_ENABLED(CONFIG_MACSEC) /* MACsec management functions */ const struct macsec_ops *macsec_ops; #endif const struct udp_tunnel_nic_info *udp_tunnel_nic_info; struct udp_tunnel_nic *udp_tunnel_nic; /** @cfg: net_device queue-related configuration */ struct netdev_config *cfg; /** * @cfg_pending: same as @cfg but when device is being actively * reconfigured includes any changes to the configuration * requested by the user, but which may or may not be rejected. */ struct netdev_config *cfg_pending; struct ethtool_netdev_state *ethtool; /* protected by rtnl_lock */ struct bpf_xdp_entity xdp_state[__MAX_XDP_MODE]; u8 dev_addr_shadow[MAX_ADDR_LEN]; netdevice_tracker linkwatch_dev_tracker; netdevice_tracker watchdog_dev_tracker; netdevice_tracker dev_registered_tracker; struct rtnl_hw_stats64 *offload_xstats_l3; struct devlink_port *devlink_port; #if IS_ENABLED(CONFIG_DPLL) struct dpll_pin __rcu *dpll_pin; #endif #if IS_ENABLED(CONFIG_PAGE_POOL) /** @page_pools: page pools created for this netdevice */ struct hlist_head page_pools; #endif /** @irq_moder: dim parameters used if IS_ENABLED(CONFIG_DIMLIB). */ struct dim_irq_moder *irq_moder; u64 max_pacing_offload_horizon; struct napi_config *napi_config; u32 num_napi_configs; u32 napi_defer_hard_irqs; unsigned long gro_flush_timeout; /** * @up: copy of @state's IFF_UP, but safe to read with just @lock. * May report false negatives while the device is being opened * or closed (@lock does not protect .ndo_open, or .ndo_close). */ bool up; /** * @request_ops_lock: request the core to run all @netdev_ops and * @ethtool_ops under the @lock. */ bool request_ops_lock; /** * @lock: netdev-scope lock, protects a small selection of fields. * Should always be taken using netdev_lock() / netdev_unlock() helpers. * Drivers are free to use it for other protection. * * For the drivers that implement shaper or queue API, the scope * of this lock is expanded to cover most ndo/queue/ethtool/sysfs * operations. Drivers may opt-in to this behavior by setting * @request_ops_lock. * * @lock protection mixes with rtnl_lock in multiple ways, fields are * either: * * - simply protected by the instance @lock; * * - double protected - writers hold both locks, readers hold either; * * - ops protected - protected by the lock held around the NDOs * and other callbacks, that is the instance lock on devices for * which netdev_need_ops_lock() returns true, otherwise by rtnl_lock; * * - double ops protected - always protected by rtnl_lock but for * devices for which netdev_need_ops_lock() returns true - also * the instance lock. * * Simply protects: * @gro_flush_timeout, @napi_defer_hard_irqs, @napi_list, * @net_shaper_hierarchy, @reg_state, @threaded * * Double protects: * @up, @moving_ns, @nd_net, @xdp_features * * Double ops protects: * @real_num_rx_queues, @real_num_tx_queues * * Also protects some fields in: * struct napi_struct, struct netdev_queue, struct netdev_rx_queue * * Ordering: take after rtnl_lock. */ struct mutex lock; #if IS_ENABLED(CONFIG_NET_SHAPER) /** * @net_shaper_hierarchy: data tracking the current shaper status * see include/net/net_shapers.h */ struct net_shaper_hierarchy *net_shaper_hierarchy; #endif struct hlist_head neighbours[NEIGH_NR_TABLES]; struct hwtstamp_provider __rcu *hwprov; u8 priv[] ____cacheline_aligned __counted_by(priv_len); } ____cacheline_aligned; #define to_net_dev(d) container_of(d, struct net_device, dev) /* * Driver should use this to assign devlink port instance to a netdevice * before it registers the netdevice. Therefore devlink_port is static * during the netdev lifetime after it is registered. */ #define SET_NETDEV_DEVLINK_PORT(dev, port) \ ({ \ WARN_ON((dev)->reg_state != NETREG_UNINITIALIZED); \ ((dev)->devlink_port = (port)); \ }) static inline bool netif_elide_gro(const struct net_device *dev) { if (!(dev->features & NETIF_F_GRO) || dev->xdp_prog) return true; return false; } #define NETDEV_ALIGN 32 static inline int netdev_get_prio_tc_map(const struct net_device *dev, u32 prio) { return dev->prio_tc_map[prio & TC_BITMASK]; } static inline int netdev_set_prio_tc_map(struct net_device *dev, u8 prio, u8 tc) { if (tc >= dev->num_tc) return -EINVAL; dev->prio_tc_map[prio & TC_BITMASK] = tc & TC_BITMASK; return 0; } int netdev_txq_to_tc(struct net_device *dev, unsigned int txq); void netdev_reset_tc(struct net_device *dev); int netdev_set_tc_queue(struct net_device *dev, u8 tc, u16 count, u16 offset); int netdev_set_num_tc(struct net_device *dev, u8 num_tc); static inline int netdev_get_num_tc(struct net_device *dev) { return dev->num_tc; } static inline void net_prefetch(void *p) { prefetch(p); #if L1_CACHE_BYTES < 128 prefetch((u8 *)p + L1_CACHE_BYTES); #endif } static inline void net_prefetchw(void *p) { prefetchw(p); #if L1_CACHE_BYTES < 128 prefetchw((u8 *)p + L1_CACHE_BYTES); #endif } void netdev_unbind_sb_channel(struct net_device *dev, struct net_device *sb_dev); int netdev_bind_sb_channel_queue(struct net_device *dev, struct net_device *sb_dev, u8 tc, u16 count, u16 offset); int netdev_set_sb_channel(struct net_device *dev, u16 channel); static inline int netdev_get_sb_channel(struct net_device *dev) { return max_t(int, -dev->num_tc, 0); } static inline struct netdev_queue *netdev_get_tx_queue(const struct net_device *dev, unsigned int index) { DEBUG_NET_WARN_ON_ONCE(index >= dev->num_tx_queues); return &dev->_tx[index]; } static inline struct netdev_queue *skb_get_tx_queue(const struct net_device *dev, const struct sk_buff *skb) { return netdev_get_tx_queue(dev, skb_get_queue_mapping(skb)); } static inline void netdev_for_each_tx_queue(struct net_device *dev, void (*f)(struct net_device *, struct netdev_queue *, void *), void *arg) { unsigned int i; for (i = 0; i < dev->num_tx_queues; i++) f(dev, &dev->_tx[i], arg); } u16 netdev_pick_tx(struct net_device *dev, struct sk_buff *skb, struct net_device *sb_dev); struct netdev_queue *netdev_core_pick_tx(struct net_device *dev, struct sk_buff *skb, struct net_device *sb_dev); /* returns the headroom that the master device needs to take in account * when forwarding to this dev */ static inline unsigned netdev_get_fwd_headroom(struct net_device *dev) { return dev->priv_flags & IFF_PHONY_HEADROOM ? 0 : dev->needed_headroom; } static inline void netdev_set_rx_headroom(struct net_device *dev, int new_hr) { if (dev->netdev_ops->ndo_set_rx_headroom) dev->netdev_ops->ndo_set_rx_headroom(dev, new_hr); } /* set the device rx headroom to the dev's default */ static inline void netdev_reset_rx_headroom(struct net_device *dev) { netdev_set_rx_headroom(dev, -1); } static inline void *netdev_get_ml_priv(struct net_device *dev, enum netdev_ml_priv_type type) { if (dev->ml_priv_type != type) return NULL; return dev->ml_priv; } static inline void netdev_set_ml_priv(struct net_device *dev, void *ml_priv, enum netdev_ml_priv_type type) { WARN(dev->ml_priv_type && dev->ml_priv_type != type, "Overwriting already set ml_priv_type (%u) with different ml_priv_type (%u)!\n", dev->ml_priv_type, type); WARN(!dev->ml_priv_type && dev->ml_priv, "Overwriting already set ml_priv and ml_priv_type is ML_PRIV_NONE!\n"); dev->ml_priv = ml_priv; dev->ml_priv_type = type; } /* * Net namespace inlines */ static inline struct net *dev_net(const struct net_device *dev) { return read_pnet(&dev->nd_net); } static inline struct net *dev_net_rcu(const struct net_device *dev) { return read_pnet_rcu(&dev->nd_net); } static inline void dev_net_set(struct net_device *dev, struct net *net) { write_pnet(&dev->nd_net, net); } /** * netdev_priv - access network device private data * @dev: network device * * Get network device private data */ static inline void *netdev_priv(const struct net_device *dev) { return (void *)dev->priv; } /* Set the sysfs physical device reference for the network logical device * if set prior to registration will cause a symlink during initialization. */ #define SET_NETDEV_DEV(net, pdev) ((net)->dev.parent = (pdev)) /* Set the sysfs device type for the network logical device to allow * fine-grained identification of different network device types. For * example Ethernet, Wireless LAN, Bluetooth, WiMAX etc. */ #define SET_NETDEV_DEVTYPE(net, devtype) ((net)->dev.type = (devtype)) void netif_queue_set_napi(struct net_device *dev, unsigned int queue_index, enum netdev_queue_type type, struct napi_struct *napi); static inline void netdev_lock(struct net_device *dev) { mutex_lock(&dev->lock); } static inline void netdev_unlock(struct net_device *dev) { mutex_unlock(&dev->lock); } /* Additional netdev_lock()-related helpers are in net/netdev_lock.h */ void netif_napi_set_irq_locked(struct napi_struct *napi, int irq); static inline void netif_napi_set_irq(struct napi_struct *napi, int irq) { netdev_lock(napi->dev); netif_napi_set_irq_locked(napi, irq); netdev_unlock(napi->dev); } /* Default NAPI poll() weight * Device drivers are strongly advised to not use bigger value */ #define NAPI_POLL_WEIGHT 64 void netif_napi_add_weight_locked(struct net_device *dev, struct napi_struct *napi, int (*poll)(struct napi_struct *, int), int weight); static inline void netif_napi_add_weight(struct net_device *dev, struct napi_struct *napi, int (*poll)(struct napi_struct *, int), int weight) { netdev_lock(dev); netif_napi_add_weight_locked(dev, napi, poll, weight); netdev_unlock(dev); } /** * netif_napi_add() - initialize a NAPI context * @dev: network device * @napi: NAPI context * @poll: polling function * * netif_napi_add() must be used to initialize a NAPI context prior to calling * *any* of the other NAPI-related functions. */ static inline void netif_napi_add(struct net_device *dev, struct napi_struct *napi, int (*poll)(struct napi_struct *, int)) { netif_napi_add_weight(dev, napi, poll, NAPI_POLL_WEIGHT); } static inline void netif_napi_add_locked(struct net_device *dev, struct napi_struct *napi, int (*poll)(struct napi_struct *, int)) { netif_napi_add_weight_locked(dev, napi, poll, NAPI_POLL_WEIGHT); } static inline void netif_napi_add_tx_weight(struct net_device *dev, struct napi_struct *napi, int (*poll)(struct napi_struct *, int), int weight) { set_bit(NAPI_STATE_NO_BUSY_POLL, &napi->state); netif_napi_add_weight(dev, napi, poll, weight); } static inline void netif_napi_add_config_locked(struct net_device *dev, struct napi_struct *napi, int (*poll)(struct napi_struct *, int), int index) { napi->index = index; napi->config = &dev->napi_config[index]; netif_napi_add_weight_locked(dev, napi, poll, NAPI_POLL_WEIGHT); } /** * netif_napi_add_config - initialize a NAPI context with persistent config * @dev: network device * @napi: NAPI context * @poll: polling function * @index: the NAPI index */ static inline void netif_napi_add_config(struct net_device *dev, struct napi_struct *napi, int (*poll)(struct napi_struct *, int), int index) { netdev_lock(dev); netif_napi_add_config_locked(dev, napi, poll, index); netdev_unlock(dev); } /** * netif_napi_add_tx() - initialize a NAPI context to be used for Tx only * @dev: network device * @napi: NAPI context * @poll: polling function * * This variant of netif_napi_add() should be used from drivers using NAPI * to exclusively poll a TX queue. * This will avoid we add it into napi_hash[], thus polluting this hash table. */ static inline void netif_napi_add_tx(struct net_device *dev, struct napi_struct *napi, int (*poll)(struct napi_struct *, int)) { netif_napi_add_tx_weight(dev, napi, poll, NAPI_POLL_WEIGHT); } void __netif_napi_del_locked(struct napi_struct *napi); /** * __netif_napi_del - remove a NAPI context * @napi: NAPI context * * Warning: caller must observe RCU grace period before freeing memory * containing @napi. Drivers might want to call this helper to combine * all the needed RCU grace periods into a single one. */ static inline void __netif_napi_del(struct napi_struct *napi) { netdev_lock(napi->dev); __netif_napi_del_locked(napi); netdev_unlock(napi->dev); } static inline void netif_napi_del_locked(struct napi_struct *napi) { __netif_napi_del_locked(napi); synchronize_net(); } /** * netif_napi_del - remove a NAPI context * @napi: NAPI context * * netif_napi_del() removes a NAPI context from the network device NAPI list */ static inline void netif_napi_del(struct napi_struct *napi) { __netif_napi_del(napi); synchronize_net(); } int netif_enable_cpu_rmap(struct net_device *dev, unsigned int num_irqs); void netif_set_affinity_auto(struct net_device *dev); struct packet_type { __be16 type; /* This is really htons(ether_type). */ bool ignore_outgoing; struct net_device *dev; /* NULL is wildcarded here */ netdevice_tracker dev_tracker; int (*func) (struct sk_buff *, struct net_device *, struct packet_type *, struct net_device *); void (*list_func) (struct list_head *, struct packet_type *, struct net_device *); bool (*id_match)(struct packet_type *ptype, struct sock *sk); struct net *af_packet_net; void *af_packet_priv; struct list_head list; }; struct offload_callbacks { struct sk_buff *(*gso_segment)(struct sk_buff *skb, netdev_features_t features); struct sk_buff *(*gro_receive)(struct list_head *head, struct sk_buff *skb); int (*gro_complete)(struct sk_buff *skb, int nhoff); }; struct packet_offload { __be16 type; /* This is really htons(ether_type). */ u16 priority; struct offload_callbacks callbacks; struct list_head list; }; /* often modified stats are per-CPU, other are shared (netdev->stats) */ struct pcpu_sw_netstats { u64_stats_t rx_packets; u64_stats_t rx_bytes; u64_stats_t tx_packets; u64_stats_t tx_bytes; struct u64_stats_sync syncp; } __aligned(4 * sizeof(u64)); struct pcpu_dstats { u64_stats_t rx_packets; u64_stats_t rx_bytes; u64_stats_t tx_packets; u64_stats_t tx_bytes; u64_stats_t rx_drops; u64_stats_t tx_drops; struct u64_stats_sync syncp; } __aligned(8 * sizeof(u64)); struct pcpu_lstats { u64_stats_t packets; u64_stats_t bytes; struct u64_stats_sync syncp; } __aligned(2 * sizeof(u64)); void dev_lstats_read(struct net_device *dev, u64 *packets, u64 *bytes); static inline void dev_sw_netstats_rx_add(struct net_device *dev, unsigned int len) { struct pcpu_sw_netstats *tstats = this_cpu_ptr(dev->tstats); u64_stats_update_begin(&tstats->syncp); u64_stats_add(&tstats->rx_bytes, len); u64_stats_inc(&tstats->rx_packets); u64_stats_update_end(&tstats->syncp); } static inline void dev_sw_netstats_tx_add(struct net_device *dev, unsigned int packets, unsigned int len) { struct pcpu_sw_netstats *tstats = this_cpu_ptr(dev->tstats); u64_stats_update_begin(&tstats->syncp); u64_stats_add(&tstats->tx_bytes, len); u64_stats_add(&tstats->tx_packets, packets); u64_stats_update_end(&tstats->syncp); } static inline void dev_lstats_add(struct net_device *dev, unsigned int len) { struct pcpu_lstats *lstats = this_cpu_ptr(dev->lstats); u64_stats_update_begin(&lstats->syncp); u64_stats_add(&lstats->bytes, len); u64_stats_inc(&lstats->packets); u64_stats_update_end(&lstats->syncp); } static inline void dev_dstats_rx_add(struct net_device *dev, unsigned int len) { struct pcpu_dstats *dstats = this_cpu_ptr(dev->dstats); u64_stats_update_begin(&dstats->syncp); u64_stats_inc(&dstats->rx_packets); u64_stats_add(&dstats->rx_bytes, len); u64_stats_update_end(&dstats->syncp); } static inline void dev_dstats_rx_dropped(struct net_device *dev) { struct pcpu_dstats *dstats = this_cpu_ptr(dev->dstats); u64_stats_update_begin(&dstats->syncp); u64_stats_inc(&dstats->rx_drops); u64_stats_update_end(&dstats->syncp); } static inline void dev_dstats_rx_dropped_add(struct net_device *dev, unsigned int packets) { struct pcpu_dstats *dstats = this_cpu_ptr(dev->dstats); u64_stats_update_begin(&dstats->syncp); u64_stats_add(&dstats->rx_drops, packets); u64_stats_update_end(&dstats->syncp); } static inline void dev_dstats_tx_add(struct net_device *dev, unsigned int len) { struct pcpu_dstats *dstats = this_cpu_ptr(dev->dstats); u64_stats_update_begin(&dstats->syncp); u64_stats_inc(&dstats->tx_packets); u64_stats_add(&dstats->tx_bytes, len); u64_stats_update_end(&dstats->syncp); } static inline void dev_dstats_tx_dropped(struct net_device *dev) { struct pcpu_dstats *dstats = this_cpu_ptr(dev->dstats); u64_stats_update_begin(&dstats->syncp); u64_stats_inc(&dstats->tx_drops); u64_stats_update_end(&dstats->syncp); } #define __netdev_alloc_pcpu_stats(type, gfp) \ ({ \ typeof(type) __percpu *pcpu_stats = alloc_percpu_gfp(type, gfp);\ if (pcpu_stats) { \ int __cpu; \ for_each_possible_cpu(__cpu) { \ typeof(type) *stat; \ stat = per_cpu_ptr(pcpu_stats, __cpu); \ u64_stats_init(&stat->syncp); \ } \ } \ pcpu_stats; \ }) #define netdev_alloc_pcpu_stats(type) \ __netdev_alloc_pcpu_stats(type, GFP_KERNEL) #define devm_netdev_alloc_pcpu_stats(dev, type) \ ({ \ typeof(type) __percpu *pcpu_stats = devm_alloc_percpu(dev, type);\ if (pcpu_stats) { \ int __cpu; \ for_each_possible_cpu(__cpu) { \ typeof(type) *stat; \ stat = per_cpu_ptr(pcpu_stats, __cpu); \ u64_stats_init(&stat->syncp); \ } \ } \ pcpu_stats; \ }) enum netdev_lag_tx_type { NETDEV_LAG_TX_TYPE_UNKNOWN, NETDEV_LAG_TX_TYPE_RANDOM, NETDEV_LAG_TX_TYPE_BROADCAST, NETDEV_LAG_TX_TYPE_ROUNDROBIN, NETDEV_LAG_TX_TYPE_ACTIVEBACKUP, NETDEV_LAG_TX_TYPE_HASH, }; enum netdev_lag_hash { NETDEV_LAG_HASH_NONE, NETDEV_LAG_HASH_L2, NETDEV_LAG_HASH_L34, NETDEV_LAG_HASH_L23, NETDEV_LAG_HASH_E23, NETDEV_LAG_HASH_E34, NETDEV_LAG_HASH_VLAN_SRCMAC, NETDEV_LAG_HASH_UNKNOWN, }; struct netdev_lag_upper_info { enum netdev_lag_tx_type tx_type; enum netdev_lag_hash hash_type; }; struct netdev_lag_lower_state_info { u8 link_up : 1, tx_enabled : 1; }; #include <linux/notifier.h> /* netdevice notifier chain. Please remember to update netdev_cmd_to_name() * and the rtnetlink notification exclusion list in rtnetlink_event() when * adding new types. */ enum netdev_cmd { NETDEV_UP = 1, /* For now you can't veto a device up/down */ NETDEV_DOWN, NETDEV_REBOOT, /* Tell a protocol stack a network interface detected a hardware crash and restarted - we can use this eg to kick tcp sessions once done */ NETDEV_CHANGE, /* Notify device state change */ NETDEV_REGISTER, NETDEV_UNREGISTER, NETDEV_CHANGEMTU, /* notify after mtu change happened */ NETDEV_CHANGEADDR, /* notify after the address change */ NETDEV_PRE_CHANGEADDR, /* notify before the address change */ NETDEV_GOING_DOWN, NETDEV_CHANGENAME, NETDEV_FEAT_CHANGE, NETDEV_BONDING_FAILOVER, NETDEV_PRE_UP, NETDEV_PRE_TYPE_CHANGE, NETDEV_POST_TYPE_CHANGE, NETDEV_POST_INIT, NETDEV_PRE_UNINIT, NETDEV_RELEASE, NETDEV_NOTIFY_PEERS, NETDEV_JOIN, NETDEV_CHANGEUPPER, NETDEV_RESEND_IGMP, NETDEV_PRECHANGEMTU, /* notify before mtu change happened */ NETDEV_CHANGEINFODATA, NETDEV_BONDING_INFO, NETDEV_PRECHANGEUPPER, NETDEV_CHANGELOWERSTATE, NETDEV_UDP_TUNNEL_PUSH_INFO, NETDEV_UDP_TUNNEL_DROP_INFO, NETDEV_CHANGE_TX_QUEUE_LEN, NETDEV_CVLAN_FILTER_PUSH_INFO, NETDEV_CVLAN_FILTER_DROP_INFO, NETDEV_SVLAN_FILTER_PUSH_INFO, NETDEV_SVLAN_FILTER_DROP_INFO, NETDEV_OFFLOAD_XSTATS_ENABLE, NETDEV_OFFLOAD_XSTATS_DISABLE, NETDEV_OFFLOAD_XSTATS_REPORT_USED, NETDEV_OFFLOAD_XSTATS_REPORT_DELTA, NETDEV_XDP_FEAT_CHANGE, }; const char *netdev_cmd_to_name(enum netdev_cmd cmd); int register_netdevice_notifier(struct notifier_block *nb); int unregister_netdevice_notifier(struct notifier_block *nb); int register_netdevice_notifier_net(struct net *net, struct notifier_block *nb); int unregister_netdevice_notifier_net(struct net *net, struct notifier_block *nb); int register_netdevice_notifier_dev_net(struct net_device *dev, struct notifier_block *nb, struct netdev_net_notifier *nn); int unregister_netdevice_notifier_dev_net(struct net_device *dev, struct notifier_block *nb, struct netdev_net_notifier *nn); struct netdev_notifier_info { struct net_device *dev; struct netlink_ext_ack *extack; }; struct netdev_notifier_info_ext { struct netdev_notifier_info info; /* must be first */ union { u32 mtu; } ext; }; struct netdev_notifier_change_info { struct netdev_notifier_info info; /* must be first */ unsigned int flags_changed; }; struct netdev_notifier_changeupper_info { struct netdev_notifier_info info; /* must be first */ struct net_device *upper_dev; /* new upper dev */ bool master; /* is upper dev master */ bool linking; /* is the notification for link or unlink */ void *upper_info; /* upper dev info */ }; struct netdev_notifier_changelowerstate_info { struct netdev_notifier_info info; /* must be first */ void *lower_state_info; /* is lower dev state */ }; struct netdev_notifier_pre_changeaddr_info { struct netdev_notifier_info info; /* must be first */ const unsigned char *dev_addr; }; enum netdev_offload_xstats_type { NETDEV_OFFLOAD_XSTATS_TYPE_L3 = 1, }; struct netdev_notifier_offload_xstats_info { struct netdev_notifier_info info; /* must be first */ enum netdev_offload_xstats_type type; union { /* NETDEV_OFFLOAD_XSTATS_REPORT_DELTA */ struct netdev_notifier_offload_xstats_rd *report_delta; /* NETDEV_OFFLOAD_XSTATS_REPORT_USED */ struct netdev_notifier_offload_xstats_ru *report_used; }; }; int netdev_offload_xstats_enable(struct net_device *dev, enum netdev_offload_xstats_type type, struct netlink_ext_ack *extack); int netdev_offload_xstats_disable(struct net_device *dev, enum netdev_offload_xstats_type type); bool netdev_offload_xstats_enabled(const struct net_device *dev, enum netdev_offload_xstats_type type); int netdev_offload_xstats_get(struct net_device *dev, enum netdev_offload_xstats_type type, struct rtnl_hw_stats64 *stats, bool *used, struct netlink_ext_ack *extack); void netdev_offload_xstats_report_delta(struct netdev_notifier_offload_xstats_rd *rd, const struct rtnl_hw_stats64 *stats); void netdev_offload_xstats_report_used(struct netdev_notifier_offload_xstats_ru *ru); void netdev_offload_xstats_push_delta(struct net_device *dev, enum netdev_offload_xstats_type type, const struct rtnl_hw_stats64 *stats); static inline void netdev_notifier_info_init(struct netdev_notifier_info *info, struct net_device *dev) { info->dev = dev; info->extack = NULL; } static inline struct net_device * netdev_notifier_info_to_dev(const struct netdev_notifier_info *info) { return info->dev; } static inline struct netlink_ext_ack * netdev_notifier_info_to_extack(const struct netdev_notifier_info *info) { return info->extack; } int call_netdevice_notifiers(unsigned long val, struct net_device *dev); int call_netdevice_notifiers_info(unsigned long val, struct netdev_notifier_info *info); #define for_each_netdev(net, d) \ list_for_each_entry(d, &(net)->dev_base_head, dev_list) #define for_each_netdev_reverse(net, d) \ list_for_each_entry_reverse(d, &(net)->dev_base_head, dev_list) #define for_each_netdev_rcu(net, d) \ list_for_each_entry_rcu(d, &(net)->dev_base_head, dev_list) #define for_each_netdev_safe(net, d, n) \ list_for_each_entry_safe(d, n, &(net)->dev_base_head, dev_list) #define for_each_netdev_continue(net, d) \ list_for_each_entry_continue(d, &(net)->dev_base_head, dev_list) #define for_each_netdev_continue_reverse(net, d) \ list_for_each_entry_continue_reverse(d, &(net)->dev_base_head, \ dev_list) #define for_each_netdev_continue_rcu(net, d) \ list_for_each_entry_continue_rcu(d, &(net)->dev_base_head, dev_list) #define for_each_netdev_in_bond_rcu(bond, slave) \ for_each_netdev_rcu(dev_net_rcu(bond), slave) \ if (netdev_master_upper_dev_get_rcu(slave) == (bond)) #define net_device_entry(lh) list_entry(lh, struct net_device, dev_list) #define for_each_netdev_dump(net, d, ifindex) \ for (; (d = xa_find(&(net)->dev_by_index, &ifindex, \ ULONG_MAX, XA_PRESENT)); ifindex++) static inline struct net_device *next_net_device(struct net_device *dev) { struct list_head *lh; struct net *net; net = dev_net(dev); lh = dev->dev_list.next; return lh == &net->dev_base_head ? NULL : net_device_entry(lh); } static inline struct net_device *next_net_device_rcu(struct net_device *dev) { struct list_head *lh; struct net *net; net = dev_net(dev); lh = rcu_dereference(list_next_rcu(&dev->dev_list)); return lh == &net->dev_base_head ? NULL : net_device_entry(lh); } static inline struct net_device *first_net_device(struct net *net) { return list_empty(&net->dev_base_head) ? NULL : net_device_entry(net->dev_base_head.next); } int netdev_boot_setup_check(struct net_device *dev); struct net_device *dev_getbyhwaddr(struct net *net, unsigned short type, const char *hwaddr); struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type, const char *hwaddr); struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type); void dev_add_pack(struct packet_type *pt); void dev_remove_pack(struct packet_type *pt); void __dev_remove_pack(struct packet_type *pt); void dev_add_offload(struct packet_offload *po); void dev_remove_offload(struct packet_offload *po); int dev_get_iflink(const struct net_device *dev); int dev_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb); int dev_fill_forward_path(const struct net_device *dev, const u8 *daddr, struct net_device_path_stack *stack); struct net_device *dev_get_by_name(struct net *net, const char *name); struct net_device *dev_get_by_name_rcu(struct net *net, const char *name); struct net_device *__dev_get_by_name(struct net *net, const char *name); bool netdev_name_in_use(struct net *net, const char *name); int dev_alloc_name(struct net_device *dev, const char *name); int netif_open(struct net_device *dev, struct netlink_ext_ack *extack); int dev_open(struct net_device *dev, struct netlink_ext_ack *extack); void netif_close(struct net_device *dev); void dev_close(struct net_device *dev); void netif_close_many(struct list_head *head, bool unlink); void netif_disable_lro(struct net_device *dev); void dev_disable_lro(struct net_device *dev); int dev_loopback_xmit(struct net *net, struct sock *sk, struct sk_buff *newskb); u16 dev_pick_tx_zero(struct net_device *dev, struct sk_buff *skb, struct net_device *sb_dev); int __dev_queue_xmit(struct sk_buff *skb, struct net_device *sb_dev); int __dev_direct_xmit(struct sk_buff *skb, u16 queue_id); static inline int dev_queue_xmit(struct sk_buff *skb) { return __dev_queue_xmit(skb, NULL); } static inline int dev_queue_xmit_accel(struct sk_buff *skb, struct net_device *sb_dev) { return __dev_queue_xmit(skb, sb_dev); } static inline int dev_direct_xmit(struct sk_buff *skb, u16 queue_id) { int ret; ret = __dev_direct_xmit(skb, queue_id); if (!dev_xmit_complete(ret)) kfree_skb(skb); return ret; } int register_netdevice(struct net_device *dev); void unregister_netdevice_queue(struct net_device *dev, struct list_head *head); void unregister_netdevice_many(struct list_head *head); static inline void unregister_netdevice(struct net_device *dev) { unregister_netdevice_queue(dev, NULL); } int netdev_refcnt_read(const struct net_device *dev); void free_netdev(struct net_device *dev); struct net_device *netdev_get_xmit_slave(struct net_device *dev, struct sk_buff *skb, bool all_slaves); struct net_device *netdev_sk_get_lowest_dev(struct net_device *dev, struct sock *sk); struct net_device *dev_get_by_index(struct net *net, int ifindex); struct net_device *__dev_get_by_index(struct net *net, int ifindex); struct net_device *netdev_get_by_index(struct net *net, int ifindex, netdevice_tracker *tracker, gfp_t gfp); struct net_device *netdev_get_by_name(struct net *net, const char *name, netdevice_tracker *tracker, gfp_t gfp); struct net_device *netdev_get_by_flags_rcu(struct net *net, netdevice_tracker *tracker, unsigned short flags, unsigned short mask); struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex); void netdev_copy_name(struct net_device *dev, char *name); static inline int dev_hard_header(struct sk_buff *skb, struct net_device *dev, unsigned short type, const void *daddr, const void *saddr, unsigned int len) { if (!dev->header_ops || !dev->header_ops->create) return 0; return dev->header_ops->create(skb, dev, type, daddr, saddr, len); } static inline int dev_parse_header(const struct sk_buff *skb, unsigned char *haddr) { const struct net_device *dev = skb->dev; if (!dev->header_ops || !dev->header_ops->parse) return 0; return dev->header_ops->parse(skb, haddr); } static inline __be16 dev_parse_header_protocol(const struct sk_buff *skb) { const struct net_device *dev = skb->dev; if (!dev->header_ops || !dev->header_ops->parse_protocol) return 0; return dev->header_ops->parse_protocol(skb); } /* ll_header must have at least hard_header_len allocated */ static inline bool dev_validate_header(const struct net_device *dev, char *ll_header, int len) { if (likely(len >= dev->hard_header_len)) return true; if (len < dev->min_header_len) return false; if (capable(CAP_SYS_RAWIO)) { memset(ll_header + len, 0, dev->hard_header_len - len); return true; } if (dev->header_ops && dev->header_ops->validate) return dev->header_ops->validate(ll_header, len); return false; } static inline bool dev_has_header(const struct net_device *dev) { return dev->header_ops && dev->header_ops->create; } struct numa_drop_counters { atomic_t drops0 ____cacheline_aligned_in_smp; atomic_t drops1 ____cacheline_aligned_in_smp; }; static inline int numa_drop_read(const struct numa_drop_counters *ndc) { return atomic_read(&ndc->drops0) + atomic_read(&ndc->drops1); } static inline void numa_drop_add(struct numa_drop_counters *ndc, int val) { int n = numa_node_id() % 2; if (n) atomic_add(val, &ndc->drops1); else atomic_add(val, &ndc->drops0); } static inline void numa_drop_reset(struct numa_drop_counters *ndc) { atomic_set(&ndc->drops0, 0); atomic_set(&ndc->drops1, 0); } /* * Incoming packets are placed on per-CPU queues */ struct softnet_data { struct list_head poll_list; struct sk_buff_head process_queue; local_lock_t process_queue_bh_lock; /* stats */ unsigned int processed; unsigned int time_squeeze; #ifdef CONFIG_RPS struct softnet_data *rps_ipi_list; #endif unsigned int received_rps; bool in_net_rx_action; bool in_napi_threaded_poll; #ifdef CONFIG_NET_FLOW_LIMIT struct sd_flow_limit __rcu *flow_limit; #endif struct Qdisc *output_queue; struct Qdisc **output_queue_tailp; struct sk_buff *completion_queue; #ifdef CONFIG_XFRM_OFFLOAD struct sk_buff_head xfrm_backlog; #endif /* written and read only by owning cpu: */ struct netdev_xmit xmit; #ifdef CONFIG_RPS /* input_queue_head should be written by cpu owning this struct, * and only read by other cpus. Worth using a cache line. */ unsigned int input_queue_head ____cacheline_aligned_in_smp; /* Elements below can be accessed between CPUs for RPS/RFS */ call_single_data_t csd ____cacheline_aligned_in_smp; struct softnet_data *rps_ipi_next; unsigned int cpu; unsigned int input_queue_tail; #endif struct sk_buff_head input_pkt_queue; struct napi_struct backlog; struct numa_drop_counters drop_counters; int defer_ipi_scheduled ____cacheline_aligned_in_smp; call_single_data_t defer_csd; }; DECLARE_PER_CPU_ALIGNED(struct softnet_data, softnet_data); struct page_pool_bh { struct page_pool *pool; local_lock_t bh_lock; }; DECLARE_PER_CPU(struct page_pool_bh, system_page_pool); #ifndef CONFIG_PREEMPT_RT static inline int dev_recursion_level(void) { return this_cpu_read(softnet_data.xmit.recursion); } #else static inline int dev_recursion_level(void) { return current->net_xmit.recursion; } #endif void __netif_schedule(struct Qdisc *q); void netif_schedule_queue(struct netdev_queue *txq); static inline void netif_tx_schedule_all(struct net_device *dev) { unsigned int i; for (i = 0; i < dev->num_tx_queues; i++) netif_schedule_queue(netdev_get_tx_queue(dev, i)); } static __always_inline void netif_tx_start_queue(struct netdev_queue *dev_queue) { clear_bit(__QUEUE_STATE_DRV_XOFF, &dev_queue->state); } /** * netif_start_queue - allow transmit * @dev: network device * * Allow upper layers to call the device hard_start_xmit routine. */ static inline void netif_start_queue(struct net_device *dev) { netif_tx_start_queue(netdev_get_tx_queue(dev, 0)); } static inline void netif_tx_start_all_queues(struct net_device *dev) { unsigned int i; for (i = 0; i < dev->num_tx_queues; i++) { struct netdev_queue *txq = netdev_get_tx_queue(dev, i); netif_tx_start_queue(txq); } } void netif_tx_wake_queue(struct netdev_queue *dev_queue); /** * netif_wake_queue - restart transmit * @dev: network device * * Allow upper layers to call the device hard_start_xmit routine. * Used for flow control when transmit resources are available. */ static inline void netif_wake_queue(struct net_device *dev) { netif_tx_wake_queue(netdev_get_tx_queue(dev, 0)); } static inline void netif_tx_wake_all_queues(struct net_device *dev) { unsigned int i; for (i = 0; i < dev->num_tx_queues; i++) { struct netdev_queue *txq = netdev_get_tx_queue(dev, i); netif_tx_wake_queue(txq); } } static __always_inline void netif_tx_stop_queue(struct netdev_queue *dev_queue) { /* Paired with READ_ONCE() from dev_watchdog() */ WRITE_ONCE(dev_queue->trans_start, jiffies); /* This barrier is paired with smp_mb() from dev_watchdog() */ smp_mb__before_atomic(); /* Must be an atomic op see netif_txq_try_stop() */ set_bit(__QUEUE_STATE_DRV_XOFF, &dev_queue->state); } /** * netif_stop_queue - stop transmitted packets * @dev: network device * * Stop upper layers calling the device hard_start_xmit routine. * Used for flow control when transmit resources are unavailable. */ static inline void netif_stop_queue(struct net_device *dev) { netif_tx_stop_queue(netdev_get_tx_queue(dev, 0)); } void netif_tx_stop_all_queues(struct net_device *dev); static inline bool netif_tx_queue_stopped(const struct netdev_queue *dev_queue) { return test_bit(__QUEUE_STATE_DRV_XOFF, &dev_queue->state); } /** * netif_queue_stopped - test if transmit queue is flowblocked * @dev: network device * * Test if transmit queue on device is currently unable to send. */ static inline bool netif_queue_stopped(const struct net_device *dev) { return netif_tx_queue_stopped(netdev_get_tx_queue(dev, 0)); } static inline bool netif_xmit_stopped(const struct netdev_queue *dev_queue) { return dev_queue->state & QUEUE_STATE_ANY_XOFF; } static inline bool netif_xmit_frozen_or_stopped(const struct netdev_queue *dev_queue) { return dev_queue->state & QUEUE_STATE_ANY_XOFF_OR_FROZEN; } static inline bool netif_xmit_frozen_or_drv_stopped(const struct netdev_queue *dev_queue) { return dev_queue->state & QUEUE_STATE_DRV_XOFF_OR_FROZEN; } /** * netdev_queue_set_dql_min_limit - set dql minimum limit * @dev_queue: pointer to transmit queue * @min_limit: dql minimum limit * * Forces xmit_more() to return true until the minimum threshold * defined by @min_limit is reached (or until the tx queue is * empty). Warning: to be use with care, misuse will impact the * latency. */ static inline void netdev_queue_set_dql_min_limit(struct netdev_queue *dev_queue, unsigned int min_limit) { #ifdef CONFIG_BQL dev_queue->dql.min_limit = min_limit; #endif } static inline int netdev_queue_dql_avail(const struct netdev_queue *txq) { #ifdef CONFIG_BQL /* Non-BQL migrated drivers will return 0, too. */ return dql_avail(&txq->dql); #else return 0; #endif } /** * netdev_txq_bql_enqueue_prefetchw - prefetch bql data for write * @dev_queue: pointer to transmit queue * * BQL enabled drivers might use this helper in their ndo_start_xmit(), * to give appropriate hint to the CPU. */ static inline void netdev_txq_bql_enqueue_prefetchw(struct netdev_queue *dev_queue) { #ifdef CONFIG_BQL prefetchw(&dev_queue->dql.num_queued); #endif } /** * netdev_txq_bql_complete_prefetchw - prefetch bql data for write * @dev_queue: pointer to transmit queue * * BQL enabled drivers might use this helper in their TX completion path, * to give appropriate hint to the CPU. */ static inline void netdev_txq_bql_complete_prefetchw(struct netdev_queue *dev_queue) { #ifdef CONFIG_BQL prefetchw(&dev_queue->dql.limit); #endif } /** * netdev_tx_sent_queue - report the number of bytes queued to a given tx queue * @dev_queue: network device queue * @bytes: number of bytes queued to the device queue * * Report the number of bytes queued for sending/completion to the network * device hardware queue. @bytes should be a good approximation and should * exactly match netdev_completed_queue() @bytes. * This is typically called once per packet, from ndo_start_xmit(). */ static inline void netdev_tx_sent_queue(struct netdev_queue *dev_queue, unsigned int bytes) { #ifdef CONFIG_BQL dql_queued(&dev_queue->dql, bytes); if (likely(dql_avail(&dev_queue->dql) >= 0)) return; /* Paired with READ_ONCE() from dev_watchdog() */ WRITE_ONCE(dev_queue->trans_start, jiffies); /* This barrier is paired with smp_mb() from dev_watchdog() */ smp_mb__before_atomic(); set_bit(__QUEUE_STATE_STACK_XOFF, &dev_queue->state); /* * The XOFF flag must be set before checking the dql_avail below, * because in netdev_tx_completed_queue we update the dql_completed * before checking the XOFF flag. */ smp_mb__after_atomic(); /* check again in case another CPU has just made room avail */ if (unlikely(dql_avail(&dev_queue->dql) >= 0)) clear_bit(__QUEUE_STATE_STACK_XOFF, &dev_queue->state); #endif } /* Variant of netdev_tx_sent_queue() for drivers that are aware * that they should not test BQL status themselves. * We do want to change __QUEUE_STATE_STACK_XOFF only for the last * skb of a batch. * Returns true if the doorbell must be used to kick the NIC. */ static inline bool __netdev_tx_sent_queue(struct netdev_queue *dev_queue, unsigned int bytes, bool xmit_more) { if (xmit_more) { #ifdef CONFIG_BQL dql_queued(&dev_queue->dql, bytes); #endif return netif_tx_queue_stopped(dev_queue); } netdev_tx_sent_queue(dev_queue, bytes); return true; } /** * netdev_sent_queue - report the number of bytes queued to hardware * @dev: network device * @bytes: number of bytes queued to the hardware device queue * * Report the number of bytes queued for sending/completion to the network * device hardware queue#0. @bytes should be a good approximation and should * exactly match netdev_completed_queue() @bytes. * This is typically called once per packet, from ndo_start_xmit(). */ static inline void netdev_sent_queue(struct net_device *dev, unsigned int bytes) { netdev_tx_sent_queue(netdev_get_tx_queue(dev, 0), bytes); } static inline bool __netdev_sent_queue(struct net_device *dev, unsigned int bytes, bool xmit_more) { return __netdev_tx_sent_queue(netdev_get_tx_queue(dev, 0), bytes, xmit_more); } /** * netdev_tx_completed_queue - report number of packets/bytes at TX completion. * @dev_queue: network device queue * @pkts: number of packets (currently ignored) * @bytes: number of bytes dequeued from the device queue * * Must be called at most once per TX completion round (and not per * individual packet), so that BQL can adjust its limits appropriately. */ static inline void netdev_tx_completed_queue(struct netdev_queue *dev_queue, unsigned int pkts, unsigned int bytes) { #ifdef CONFIG_BQL if (unlikely(!bytes)) return; dql_completed(&dev_queue->dql, bytes); /* * Without the memory barrier there is a small possibility that * netdev_tx_sent_queue will miss the update and cause the queue to * be stopped forever */ smp_mb(); /* NOTE: netdev_txq_completed_mb() assumes this exists */ if (unlikely(dql_avail(&dev_queue->dql) < 0)) return; if (test_and_clear_bit(__QUEUE_STATE_STACK_XOFF, &dev_queue->state)) netif_schedule_queue(dev_queue); #endif } /** * netdev_completed_queue - report bytes and packets completed by device * @dev: network device * @pkts: actual number of packets sent over the medium * @bytes: actual number of bytes sent over the medium * * Report the number of bytes and packets transmitted by the network device * hardware queue over the physical medium, @bytes must exactly match the * @bytes amount passed to netdev_sent_queue() */ static inline void netdev_completed_queue(struct net_device *dev, unsigned int pkts, unsigned int bytes) { netdev_tx_completed_queue(netdev_get_tx_queue(dev, 0), pkts, bytes); } static inline void netdev_tx_reset_queue(struct netdev_queue *q) { #ifdef CONFIG_BQL clear_bit(__QUEUE_STATE_STACK_XOFF, &q->state); dql_reset(&q->dql); #endif } /** * netdev_tx_reset_subqueue - reset the BQL stats and state of a netdev queue * @dev: network device * @qid: stack index of the queue to reset */ static inline void netdev_tx_reset_subqueue(const struct net_device *dev, u32 qid) { netdev_tx_reset_queue(netdev_get_tx_queue(dev, qid)); } /** * netdev_reset_queue - reset the packets and bytes count of a network device * @dev_queue: network device * * Reset the bytes and packet count of a network device and clear the * software flow control OFF bit for this network device */ static inline void netdev_reset_queue(struct net_device *dev_queue) { netdev_tx_reset_subqueue(dev_queue, 0); } /** * netdev_cap_txqueue - check if selected tx queue exceeds device queues * @dev: network device * @queue_index: given tx queue index * * Returns 0 if given tx queue index >= number of device tx queues, * otherwise returns the originally passed tx queue index. */ static inline u16 netdev_cap_txqueue(struct net_device *dev, u16 queue_index) { if (unlikely(queue_index >= dev->real_num_tx_queues)) { net_warn_ratelimited("%s selects TX queue %d, but real number of TX queues is %d\n", dev->name, queue_index, dev->real_num_tx_queues); return 0; } return queue_index; } /** * netif_running - test if up * @dev: network device * * Test if the device has been brought up. */ static inline bool netif_running(const struct net_device *dev) { return test_bit(__LINK_STATE_START, &dev->state); } /* * Routines to manage the subqueues on a device. We only need start, * stop, and a check if it's stopped. All other device management is * done at the overall netdevice level. * Also test the device if we're multiqueue. */ /** * netif_start_subqueue - allow sending packets on subqueue * @dev: network device * @queue_index: sub queue index * * Start individual transmit queue of a device with multiple transmit queues. */ static inline void netif_start_subqueue(struct net_device *dev, u16 queue_index) { struct netdev_queue *txq = netdev_get_tx_queue(dev, queue_index); netif_tx_start_queue(txq); } /** * netif_stop_subqueue - stop sending packets on subqueue * @dev: network device * @queue_index: sub queue index * * Stop individual transmit queue of a device with multiple transmit queues. */ static inline void netif_stop_subqueue(struct net_device *dev, u16 queue_index) { struct netdev_queue *txq = netdev_get_tx_queue(dev, queue_index); netif_tx_stop_queue(txq); } /** * __netif_subqueue_stopped - test status of subqueue * @dev: network device * @queue_index: sub queue index * * Check individual transmit queue of a device with multiple transmit queues. */ static inline bool __netif_subqueue_stopped(const struct net_device *dev, u16 queue_index) { struct netdev_queue *txq = netdev_get_tx_queue(dev, queue_index); return netif_tx_queue_stopped(txq); } /** * netif_subqueue_stopped - test status of subqueue * @dev: network device * @skb: sub queue buffer pointer * * Check individual transmit queue of a device with multiple transmit queues. */ static inline bool netif_subqueue_stopped(const struct net_device *dev, struct sk_buff *skb) { return __netif_subqueue_stopped(dev, skb_get_queue_mapping(skb)); } /** * netif_wake_subqueue - allow sending packets on subqueue * @dev: network device * @queue_index: sub queue index * * Resume individual transmit queue of a device with multiple transmit queues. */ static inline void netif_wake_subqueue(struct net_device *dev, u16 queue_index) { struct netdev_queue *txq = netdev_get_tx_queue(dev, queue_index); netif_tx_wake_queue(txq); } #ifdef CONFIG_XPS int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask, u16 index); int __netif_set_xps_queue(struct net_device *dev, const unsigned long *mask, u16 index, enum xps_map_type type); /** * netif_attr_test_mask - Test a CPU or Rx queue set in a mask * @j: CPU/Rx queue index * @mask: bitmask of all cpus/rx queues * @nr_bits: number of bits in the bitmask * * Test if a CPU or Rx queue index is set in a mask of all CPU/Rx queues. */ static inline bool netif_attr_test_mask(unsigned long j, const unsigned long *mask, unsigned int nr_bits) { cpu_max_bits_warn(j, nr_bits); return test_bit(j, mask); } /** * netif_attr_test_online - Test for online CPU/Rx queue * @j: CPU/Rx queue index * @online_mask: bitmask for CPUs/Rx queues that are online * @nr_bits: number of bits in the bitmask * * Returns: true if a CPU/Rx queue is online. */ static inline bool netif_attr_test_online(unsigned long j, const unsigned long *online_mask, unsigned int nr_bits) { cpu_max_bits_warn(j, nr_bits); if (online_mask) return test_bit(j, online_mask); return (j < nr_bits); } /** * netif_attrmask_next - get the next CPU/Rx queue in a cpu/Rx queues mask * @n: CPU/Rx queue index * @srcp: the cpumask/Rx queue mask pointer * @nr_bits: number of bits in the bitmask * * Returns: next (after n) CPU/Rx queue index in the mask; * >= nr_bits if no further CPUs/Rx queues set. */ static inline unsigned int netif_attrmask_next(int n, const unsigned long *srcp, unsigned int nr_bits) { /* -1 is a legal arg here. */ if (n != -1) cpu_max_bits_warn(n, nr_bits); if (srcp) return find_next_bit(srcp, nr_bits, n + 1); return n + 1; } /** * netif_attrmask_next_and - get the next CPU/Rx queue in \*src1p & \*src2p * @n: CPU/Rx queue index * @src1p: the first CPUs/Rx queues mask pointer * @src2p: the second CPUs/Rx queues mask pointer * @nr_bits: number of bits in the bitmask * * Returns: next (after n) CPU/Rx queue index set in both masks; * >= nr_bits if no further CPUs/Rx queues set in both. */ static inline int netif_attrmask_next_and(int n, const unsigned long *src1p, const unsigned long *src2p, unsigned int nr_bits) { /* -1 is a legal arg here. */ if (n != -1) cpu_max_bits_warn(n, nr_bits); if (src1p && src2p) return find_next_and_bit(src1p, src2p, nr_bits, n + 1); else if (src1p) return find_next_bit(src1p, nr_bits, n + 1); else if (src2p) return find_next_bit(src2p, nr_bits, n + 1); return n + 1; } #else static inline int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask, u16 index) { return 0; } static inline int __netif_set_xps_queue(struct net_device *dev, const unsigned long *mask, u16 index, enum xps_map_type type) { return 0; } #endif /** * netif_is_multiqueue - test if device has multiple transmit queues * @dev: network device * * Check if device has multiple transmit queues */ static inline bool netif_is_multiqueue(const struct net_device *dev) { return dev->num_tx_queues > 1; } int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq); int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq); int netif_set_real_num_queues(struct net_device *dev, unsigned int txq, unsigned int rxq); int netif_get_num_default_rss_queues(void); void dev_kfree_skb_irq_reason(struct sk_buff *skb, enum skb_drop_reason reason); void dev_kfree_skb_any_reason(struct sk_buff *skb, enum skb_drop_reason reason); /* * It is not allowed to call kfree_skb() or consume_skb() from hardware * interrupt context or with hardware interrupts being disabled. * (in_hardirq() || irqs_disabled()) * * We provide four helpers that can be used in following contexts : * * dev_kfree_skb_irq(skb) when caller drops a packet from irq context, * replacing kfree_skb(skb) * * dev_consume_skb_irq(skb) when caller consumes a packet from irq context. * Typically used in place of consume_skb(skb) in TX completion path * * dev_kfree_skb_any(skb) when caller doesn't know its current irq context, * replacing kfree_skb(skb) * * dev_consume_skb_any(skb) when caller doesn't know its current irq context, * and consumed a packet. Used in place of consume_skb(skb) */ static inline void dev_kfree_skb_irq(struct sk_buff *skb) { dev_kfree_skb_irq_reason(skb, SKB_DROP_REASON_NOT_SPECIFIED); } static inline void dev_consume_skb_irq(struct sk_buff *skb) { dev_kfree_skb_irq_reason(skb, SKB_CONSUMED); } static inline void dev_kfree_skb_any(struct sk_buff *skb) { dev_kfree_skb_any_reason(skb, SKB_DROP_REASON_NOT_SPECIFIED); } static inline void dev_consume_skb_any(struct sk_buff *skb) { dev_kfree_skb_any_reason(skb, SKB_CONSUMED); } u32 bpf_prog_run_generic_xdp(struct sk_buff *skb, struct xdp_buff *xdp, const struct bpf_prog *xdp_prog); void generic_xdp_tx(struct sk_buff *skb, const struct bpf_prog *xdp_prog); int do_xdp_generic(const struct bpf_prog *xdp_prog, struct sk_buff **pskb); int netif_rx(struct sk_buff *skb); int __netif_rx(struct sk_buff *skb); int netif_receive_skb(struct sk_buff *skb); int netif_receive_skb_core(struct sk_buff *skb); void netif_receive_skb_list_internal(struct list_head *head); void netif_receive_skb_list(struct list_head *head); gro_result_t gro_receive_skb(struct gro_node *gro, struct sk_buff *skb); static inline gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb) { return gro_receive_skb(&napi->gro, skb); } struct sk_buff *napi_get_frags(struct napi_struct *napi); gro_result_t napi_gro_frags(struct napi_struct *napi); static inline void napi_free_frags(struct napi_struct *napi) { kfree_skb(napi->skb); napi->skb = NULL; } bool netdev_is_rx_handler_busy(struct net_device *dev); int netdev_rx_handler_register(struct net_device *dev, rx_handler_func_t *rx_handler, void *rx_handler_data); void netdev_rx_handler_unregister(struct net_device *dev); bool dev_valid_name(const char *name); static inline bool is_socket_ioctl_cmd(unsigned int cmd) { return _IOC_TYPE(cmd) == SOCK_IOC_TYPE; } int get_user_ifreq(struct ifreq *ifr, void __user **ifrdata, void __user *arg); int put_user_ifreq(struct ifreq *ifr, void __user *arg); int dev_ioctl(struct net *net, unsigned int cmd, struct ifreq *ifr, void __user *data, bool *need_copyout); int dev_ifconf(struct net *net, struct ifconf __user *ifc); int dev_eth_ioctl(struct net_device *dev, struct ifreq *ifr, unsigned int cmd); int generic_hwtstamp_get_lower(struct net_device *dev, struct kernel_hwtstamp_config *kernel_cfg); int generic_hwtstamp_set_lower(struct net_device *dev, struct kernel_hwtstamp_config *kernel_cfg, struct netlink_ext_ack *extack); int dev_ethtool(struct net *net, struct ifreq *ifr, void __user *userdata); unsigned int netif_get_flags(const struct net_device *dev); int __dev_change_flags(struct net_device *dev, unsigned int flags, struct netlink_ext_ack *extack); int netif_change_flags(struct net_device *dev, unsigned int flags, struct netlink_ext_ack *extack); int dev_change_flags(struct net_device *dev, unsigned int flags, struct netlink_ext_ack *extack); int netif_set_alias(struct net_device *dev, const char *alias, size_t len); int dev_set_alias(struct net_device *, const char *, size_t); int dev_get_alias(const struct net_device *, char *, size_t); int __dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat, int new_ifindex, struct netlink_ext_ack *extack); int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat); int __netif_set_mtu(struct net_device *dev, int new_mtu); int netif_set_mtu(struct net_device *dev, int new_mtu); int dev_set_mtu(struct net_device *, int); int netif_pre_changeaddr_notify(struct net_device *dev, const char *addr, struct netlink_ext_ack *extack); int netif_set_mac_address(struct net_device *dev, struct sockaddr_storage *ss, struct netlink_ext_ack *extack); int dev_set_mac_address(struct net_device *dev, struct sockaddr_storage *ss, struct netlink_ext_ack *extack); int dev_set_mac_address_user(struct net_device *dev, struct sockaddr_storage *ss, struct netlink_ext_ack *extack); int netif_get_mac_address(struct sockaddr *sa, struct net *net, char *dev_name); int netif_get_port_parent_id(struct net_device *dev, struct netdev_phys_item_id *ppid, bool recurse); bool netdev_port_same_parent_id(struct net_device *a, struct net_device *b); struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *dev, bool *again); struct sk_buff *dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev, struct netdev_queue *txq, int *ret); int bpf_xdp_link_attach(const union bpf_attr *attr, struct bpf_prog *prog); u8 dev_xdp_prog_count(struct net_device *dev); int netif_xdp_propagate(struct net_device *dev, struct netdev_bpf *bpf); int dev_xdp_propagate(struct net_device *dev, struct netdev_bpf *bpf); u8 dev_xdp_sb_prog_count(struct net_device *dev); u32 dev_xdp_prog_id(struct net_device *dev, enum bpf_xdp_mode mode); u32 dev_get_min_mp_channel_count(const struct net_device *dev); int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb); int dev_forward_skb(struct net_device *dev, struct sk_buff *skb); int dev_forward_skb_nomtu(struct net_device *dev, struct sk_buff *skb); bool is_skb_forwardable(const struct net_device *dev, const struct sk_buff *skb); static __always_inline bool __is_skb_forwardable(const struct net_device *dev, const struct sk_buff *skb, const bool check_mtu) { const u32 vlan_hdr_len = 4; /* VLAN_HLEN */ unsigned int len; if (!(dev->flags & IFF_UP)) return false; if (!check_mtu) return true; len = dev->mtu + dev->hard_header_len + vlan_hdr_len; if (skb->len <= len) return true; /* if TSO is enabled, we don't care about the length as the packet * could be forwarded without being segmented before */ if (skb_is_gso(skb)) return true; return false; } void netdev_core_stats_inc(struct net_device *dev, u32 offset); #define DEV_CORE_STATS_INC(FIELD) \ static inline void dev_core_stats_##FIELD##_inc(struct net_device *dev) \ { \ netdev_core_stats_inc(dev, \ offsetof(struct net_device_core_stats, FIELD)); \ } DEV_CORE_STATS_INC(rx_dropped) DEV_CORE_STATS_INC(tx_dropped) DEV_CORE_STATS_INC(rx_nohandler) DEV_CORE_STATS_INC(rx_otherhost_dropped) #undef DEV_CORE_STATS_INC static __always_inline int ____dev_forward_skb(struct net_device *dev, struct sk_buff *skb, const bool check_mtu) { if (skb_orphan_frags(skb, GFP_ATOMIC) || unlikely(!__is_skb_forwardable(dev, skb, check_mtu))) { dev_core_stats_rx_dropped_inc(dev); kfree_skb(skb); return NET_RX_DROP; } skb_scrub_packet(skb, !net_eq(dev_net(dev), dev_net(skb->dev))); skb->priority = 0; return 0; } bool dev_nit_active_rcu(const struct net_device *dev); static inline bool dev_nit_active(const struct net_device *dev) { bool ret; rcu_read_lock(); ret = dev_nit_active_rcu(dev); rcu_read_unlock(); return ret; } void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev); void save_netdev_trace_buffer(struct net_device *dev, int delta); static inline void __dev_put(struct net_device *dev) { if (dev) { #ifdef CONFIG_NET_DEV_REFCNT_TRACKER save_netdev_trace_buffer(dev, -1); #endif #ifdef CONFIG_PCPU_DEV_REFCNT this_cpu_dec(*dev->pcpu_refcnt); #else refcount_dec(&dev->dev_refcnt); #endif } } static inline void __dev_hold(struct net_device *dev) { if (dev) { #ifdef CONFIG_NET_DEV_REFCNT_TRACKER save_netdev_trace_buffer(dev, 1); #endif #ifdef CONFIG_PCPU_DEV_REFCNT this_cpu_inc(*dev->pcpu_refcnt); #else refcount_inc(&dev->dev_refcnt); #endif } } static inline void __netdev_tracker_alloc(struct net_device *dev, netdevice_tracker *tracker, gfp_t gfp) { #ifdef CONFIG_NET_DEV_REFCNT_TRACKER ref_tracker_alloc(&dev->refcnt_tracker, tracker, gfp); #endif } /* netdev_tracker_alloc() can upgrade a prior untracked reference * taken by dev_get_by_name()/dev_get_by_index() to a tracked one. */ static inline void netdev_tracker_alloc(struct net_device *dev, netdevice_tracker *tracker, gfp_t gfp) { #ifdef CONFIG_NET_DEV_REFCNT_TRACKER refcount_dec(&dev->refcnt_tracker.no_tracker); __netdev_tracker_alloc(dev, tracker, gfp); #endif } static inline void netdev_tracker_free(struct net_device *dev, netdevice_tracker *tracker) { #ifdef CONFIG_NET_DEV_REFCNT_TRACKER ref_tracker_free(&dev->refcnt_tracker, tracker); #endif } static inline void netdev_hold(struct net_device *dev, netdevice_tracker *tracker, gfp_t gfp) { if (dev) { __dev_hold(dev); __netdev_tracker_alloc(dev, tracker, gfp); } } static inline void netdev_put(struct net_device *dev, netdevice_tracker *tracker) { if (dev) { netdev_tracker_free(dev, tracker); __dev_put(dev); } } /** * dev_hold - get reference to device * @dev: network device * * Hold reference to device to keep it from being freed. * Try using netdev_hold() instead. */ static inline void dev_hold(struct net_device *dev) { netdev_hold(dev, NULL, GFP_ATOMIC); } /** * dev_put - release reference to device * @dev: network device * * Release reference to device to allow it to be freed. * Try using netdev_put() instead. */ static inline void dev_put(struct net_device *dev) { netdev_put(dev, NULL); } DEFINE_FREE(dev_put, struct net_device *, if (_T) dev_put(_T)) static inline void netdev_ref_replace(struct net_device *odev, struct net_device *ndev, netdevice_tracker *tracker, gfp_t gfp) { if (odev) netdev_tracker_free(odev, tracker); __dev_hold(ndev); __dev_put(odev); if (ndev) __netdev_tracker_alloc(ndev, tracker, gfp); } /* Carrier loss detection, dial on demand. The functions netif_carrier_on * and _off may be called from IRQ context, but it is caller * who is responsible for serialization of these calls. * * The name carrier is inappropriate, these functions should really be * called netif_lowerlayer_*() because they represent the state of any * kind of lower layer not just hardware media. */ void linkwatch_fire_event(struct net_device *dev); /** * linkwatch_sync_dev - sync linkwatch for the given device * @dev: network device to sync linkwatch for * * Sync linkwatch for the given device, removing it from the * pending work list (if queued). */ void linkwatch_sync_dev(struct net_device *dev); void __linkwatch_sync_dev(struct net_device *dev); /** * netif_carrier_ok - test if carrier present * @dev: network device * * Check if carrier is present on device */ static inline bool netif_carrier_ok(const struct net_device *dev) { return !test_bit(__LINK_STATE_NOCARRIER, &dev->state); } unsigned long dev_trans_start(struct net_device *dev); void netdev_watchdog_up(struct net_device *dev); void netif_carrier_on(struct net_device *dev); void netif_carrier_off(struct net_device *dev); void netif_carrier_event(struct net_device *dev); /** * netif_dormant_on - mark device as dormant. * @dev: network device * * Mark device as dormant (as per RFC2863). * * The dormant state indicates that the relevant interface is not * actually in a condition to pass packets (i.e., it is not 'up') but is * in a "pending" state, waiting for some external event. For "on- * demand" interfaces, this new state identifies the situation where the * interface is waiting for events to place it in the up state. */ static inline void netif_dormant_on(struct net_device *dev) { if (!test_and_set_bit(__LINK_STATE_DORMANT, &dev->state)) linkwatch_fire_event(dev); } /** * netif_dormant_off - set device as not dormant. * @dev: network device * * Device is not in dormant state. */ static inline void netif_dormant_off(struct net_device *dev) { if (test_and_clear_bit(__LINK_STATE_DORMANT, &dev->state)) linkwatch_fire_event(dev); } /** * netif_dormant - test if device is dormant * @dev: network device * * Check if device is dormant. */ static inline bool netif_dormant(const struct net_device *dev) { return test_bit(__LINK_STATE_DORMANT, &dev->state); } /** * netif_testing_on - mark device as under test. * @dev: network device * * Mark device as under test (as per RFC2863). * * The testing state indicates that some test(s) must be performed on * the interface. After completion, of the test, the interface state * will change to up, dormant, or down, as appropriate. */ static inline void netif_testing_on(struct net_device *dev) { if (!test_and_set_bit(__LINK_STATE_TESTING, &dev->state)) linkwatch_fire_event(dev); } /** * netif_testing_off - set device as not under test. * @dev: network device * * Device is not in testing state. */ static inline void netif_testing_off(struct net_device *dev) { if (test_and_clear_bit(__LINK_STATE_TESTING, &dev->state)) linkwatch_fire_event(dev); } /** * netif_testing - test if device is under test * @dev: network device * * Check if device is under test */ static inline bool netif_testing(const struct net_device *dev) { return test_bit(__LINK_STATE_TESTING, &dev->state); } /** * netif_oper_up - test if device is operational * @dev: network device * * Check if carrier is operational */ static inline bool netif_oper_up(const struct net_device *dev) { unsigned int operstate = READ_ONCE(dev->operstate); return operstate == IF_OPER_UP || operstate == IF_OPER_UNKNOWN /* backward compat */; } /** * netif_device_present - is device available or removed * @dev: network device * * Check if device has not been removed from system. */ static inline bool netif_device_present(const struct net_device *dev) { return test_bit(__LINK_STATE_PRESENT, &dev->state); } void netif_device_detach(struct net_device *dev); void netif_device_attach(struct net_device *dev); /* * Network interface message level settings */ enum { NETIF_MSG_DRV_BIT, NETIF_MSG_PROBE_BIT, NETIF_MSG_LINK_BIT, NETIF_MSG_TIMER_BIT, NETIF_MSG_IFDOWN_BIT, NETIF_MSG_IFUP_BIT, NETIF_MSG_RX_ERR_BIT, NETIF_MSG_TX_ERR_BIT, NETIF_MSG_TX_QUEUED_BIT, NETIF_MSG_INTR_BIT, NETIF_MSG_TX_DONE_BIT, NETIF_MSG_RX_STATUS_BIT, NETIF_MSG_PKTDATA_BIT, NETIF_MSG_HW_BIT, NETIF_MSG_WOL_BIT, /* When you add a new bit above, update netif_msg_class_names array * in net/ethtool/common.c */ NETIF_MSG_CLASS_COUNT, }; /* Both ethtool_ops interface and internal driver implementation use u32 */ static_assert(NETIF_MSG_CLASS_COUNT <= 32); #define __NETIF_MSG_BIT(bit) ((u32)1 << (bit)) #define __NETIF_MSG(name) __NETIF_MSG_BIT(NETIF_MSG_ ## name ## _BIT) #define NETIF_MSG_DRV __NETIF_MSG(DRV) #define NETIF_MSG_PROBE __NETIF_MSG(PROBE) #define NETIF_MSG_LINK __NETIF_MSG(LINK) #define NETIF_MSG_TIMER __NETIF_MSG(TIMER) #define NETIF_MSG_IFDOWN __NETIF_MSG(IFDOWN) #define NETIF_MSG_IFUP __NETIF_MSG(IFUP) #define NETIF_MSG_RX_ERR __NETIF_MSG(RX_ERR) #define NETIF_MSG_TX_ERR __NETIF_MSG(TX_ERR) #define NETIF_MSG_TX_QUEUED __NETIF_MSG(TX_QUEUED) #define NETIF_MSG_INTR __NETIF_MSG(INTR) #define NETIF_MSG_TX_DONE __NETIF_MSG(TX_DONE) #define NETIF_MSG_RX_STATUS __NETIF_MSG(RX_STATUS) #define NETIF_MSG_PKTDATA __NETIF_MSG(PKTDATA) #define NETIF_MSG_HW __NETIF_MSG(HW) #define NETIF_MSG_WOL __NETIF_MSG(WOL) #define netif_msg_drv(p) ((p)->msg_enable & NETIF_MSG_DRV) #define netif_msg_probe(p) ((p)->msg_enable & NETIF_MSG_PROBE) #define netif_msg_link(p) ((p)->msg_enable & NETIF_MSG_LINK) #define netif_msg_timer(p) ((p)->msg_enable & NETIF_MSG_TIMER) #define netif_msg_ifdown(p) ((p)->msg_enable & NETIF_MSG_IFDOWN) #define netif_msg_ifup(p) ((p)->msg_enable & NETIF_MSG_IFUP) #define netif_msg_rx_err(p) ((p)->msg_enable & NETIF_MSG_RX_ERR) #define netif_msg_tx_err(p) ((p)->msg_enable & NETIF_MSG_TX_ERR) #define netif_msg_tx_queued(p) ((p)->msg_enable & NETIF_MSG_TX_QUEUED) #define netif_msg_intr(p) ((p)->msg_enable & NETIF_MSG_INTR) #define netif_msg_tx_done(p) ((p)->msg_enable & NETIF_MSG_TX_DONE) #define netif_msg_rx_status(p) ((p)->msg_enable & NETIF_MSG_RX_STATUS) #define netif_msg_pktdata(p) ((p)->msg_enable & NETIF_MSG_PKTDATA) #define netif_msg_hw(p) ((p)->msg_enable & NETIF_MSG_HW) #define netif_msg_wol(p) ((p)->msg_enable & NETIF_MSG_WOL) static inline u32 netif_msg_init(int debug_value, int default_msg_enable_bits) { /* use default */ if (debug_value < 0 || debug_value >= (sizeof(u32) * 8)) return default_msg_enable_bits; if (debug_value == 0) /* no output */ return 0; /* set low N bits */ return (1U << debug_value) - 1; } static inline void __netif_tx_lock(struct netdev_queue *txq, int cpu) { spin_lock(&txq->_xmit_lock); /* Pairs with READ_ONCE() in __dev_queue_xmit() */ WRITE_ONCE(txq->xmit_lock_owner, cpu); } static inline bool __netif_tx_acquire(struct netdev_queue *txq) { __acquire(&txq->_xmit_lock); return true; } static inline void __netif_tx_release(struct netdev_queue *txq) { __release(&txq->_xmit_lock); } static inline void __netif_tx_lock_bh(struct netdev_queue *txq) { spin_lock_bh(&txq->_xmit_lock); /* Pairs with READ_ONCE() in __dev_queue_xmit() */ WRITE_ONCE(txq->xmit_lock_owner, smp_processor_id()); } static inline bool __netif_tx_trylock(struct netdev_queue *txq) { bool ok = spin_trylock(&txq->_xmit_lock); if (likely(ok)) { /* Pairs with READ_ONCE() in __dev_queue_xmit() */ WRITE_ONCE(txq->xmit_lock_owner, smp_processor_id()); } return ok; } static inline void __netif_tx_unlock(struct netdev_queue *txq) { /* Pairs with READ_ONCE() in __dev_queue_xmit() */ WRITE_ONCE(txq->xmit_lock_owner, -1); spin_unlock(&txq->_xmit_lock); } static inline void __netif_tx_unlock_bh(struct netdev_queue *txq) { /* Pairs with READ_ONCE() in __dev_queue_xmit() */ WRITE_ONCE(txq->xmit_lock_owner, -1); spin_unlock_bh(&txq->_xmit_lock); } /* * txq->trans_start can be read locklessly from dev_watchdog() */ static inline void txq_trans_update(const struct net_device *dev, struct netdev_queue *txq) { if (!dev->lltx) WRITE_ONCE(txq->trans_start, jiffies); } static inline void txq_trans_cond_update(struct netdev_queue *txq) { unsigned long now = jiffies; if (READ_ONCE(txq->trans_start) != now) WRITE_ONCE(txq->trans_start, now); } /* legacy drivers only, netdev_start_xmit() sets txq->trans_start */ static inline void netif_trans_update(struct net_device *dev) { struct netdev_queue *txq = netdev_get_tx_queue(dev, 0); txq_trans_cond_update(txq); } /** * netif_tx_lock - grab network device transmit lock * @dev: network device * * Get network device transmit lock */ void netif_tx_lock(struct net_device *dev); static inline void netif_tx_lock_bh(struct net_device *dev) { local_bh_disable(); netif_tx_lock(dev); } void netif_tx_unlock(struct net_device *dev); static inline void netif_tx_unlock_bh(struct net_device *dev) { netif_tx_unlock(dev); local_bh_enable(); } #define HARD_TX_LOCK(dev, txq, cpu) { \ if (!(dev)->lltx) { \ __netif_tx_lock(txq, cpu); \ } else { \ __netif_tx_acquire(txq); \ } \ } #define HARD_TX_TRYLOCK(dev, txq) \ (!(dev)->lltx ? \ __netif_tx_trylock(txq) : \ __netif_tx_acquire(txq)) #define HARD_TX_UNLOCK(dev, txq) { \ if (!(dev)->lltx) { \ __netif_tx_unlock(txq); \ } else { \ __netif_tx_release(txq); \ } \ } static inline void netif_tx_disable(struct net_device *dev) { unsigned int i; int cpu; local_bh_disable(); cpu = smp_processor_id(); spin_lock(&dev->tx_global_lock); for (i = 0; i < dev->num_tx_queues; i++) { struct netdev_queue *txq = netdev_get_tx_queue(dev, i); __netif_tx_lock(txq, cpu); netif_tx_stop_queue(txq); __netif_tx_unlock(txq); } spin_unlock(&dev->tx_global_lock); local_bh_enable(); } static inline void netif_addr_lock(struct net_device *dev) { unsigned char nest_level = 0; #ifdef CONFIG_LOCKDEP nest_level = dev->nested_level; #endif spin_lock_nested(&dev->addr_list_lock, nest_level); } static inline void netif_addr_lock_bh(struct net_device *dev) { unsigned char nest_level = 0; #ifdef CONFIG_LOCKDEP nest_level = dev->nested_level; #endif local_bh_disable(); spin_lock_nested(&dev->addr_list_lock, nest_level); } static inline void netif_addr_unlock(struct net_device *dev) { spin_unlock(&dev->addr_list_lock); } static inline void netif_addr_unlock_bh(struct net_device *dev) { spin_unlock_bh(&dev->addr_list_lock); } /* * dev_addrs walker. Should be used only for read access. Call with * rcu_read_lock held. */ #define for_each_dev_addr(dev, ha) \ list_for_each_entry_rcu(ha, &dev->dev_addrs.list, list) /* These functions live elsewhere (drivers/net/net_init.c, but related) */ void ether_setup(struct net_device *dev); /* Allocate dummy net_device */ struct net_device *alloc_netdev_dummy(int sizeof_priv); /* Support for loadable net-drivers */ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, unsigned char name_assign_type, void (*setup)(struct net_device *), unsigned int txqs, unsigned int rxqs); #define alloc_netdev(sizeof_priv, name, name_assign_type, setup) \ alloc_netdev_mqs(sizeof_priv, name, name_assign_type, setup, 1, 1) #define alloc_netdev_mq(sizeof_priv, name, name_assign_type, setup, count) \ alloc_netdev_mqs(sizeof_priv, name, name_assign_type, setup, count, \ count) int register_netdev(struct net_device *dev); void unregister_netdev(struct net_device *dev); int devm_register_netdev(struct device *dev, struct net_device *ndev); /* General hardware address lists handling functions */ int __hw_addr_sync(struct netdev_hw_addr_list *to_list, struct netdev_hw_addr_list *from_list, int addr_len); int __hw_addr_sync_multiple(struct netdev_hw_addr_list *to_list, struct netdev_hw_addr_list *from_list, int addr_len); void __hw_addr_unsync(struct netdev_hw_addr_list *to_list, struct netdev_hw_addr_list *from_list, int addr_len); int __hw_addr_sync_dev(struct netdev_hw_addr_list *list, struct net_device *dev, int (*sync)(struct net_device *, const unsigned char *), int (*unsync)(struct net_device *, const unsigned char *)); int __hw_addr_ref_sync_dev(struct netdev_hw_addr_list *list, struct net_device *dev, int (*sync)(struct net_device *, const unsigned char *, int), int (*unsync)(struct net_device *, const unsigned char *, int)); void __hw_addr_ref_unsync_dev(struct netdev_hw_addr_list *list, struct net_device *dev, int (*unsync)(struct net_device *, const unsigned char *, int)); void __hw_addr_unsync_dev(struct netdev_hw_addr_list *list, struct net_device *dev, int (*unsync)(struct net_device *, const unsigned char *)); void __hw_addr_init(struct netdev_hw_addr_list *list); /* Functions used for device addresses handling */ void dev_addr_mod(struct net_device *dev, unsigned int offset, const void *addr, size_t len); static inline void __dev_addr_set(struct net_device *dev, const void *addr, size_t len) { dev_addr_mod(dev, 0, addr, len); } static inline void dev_addr_set(struct net_device *dev, const u8 *addr) { __dev_addr_set(dev, addr, dev->addr_len); } int dev_addr_add(struct net_device *dev, const unsigned char *addr, unsigned char addr_type); int dev_addr_del(struct net_device *dev, const unsigned char *addr, unsigned char addr_type); /* Functions used for unicast addresses handling */ int dev_uc_add(struct net_device *dev, const unsigned char *addr); int dev_uc_add_excl(struct net_device *dev, const unsigned char *addr); int dev_uc_del(struct net_device *dev, const unsigned char *addr); int dev_uc_sync(struct net_device *to, struct net_device *from); int dev_uc_sync_multiple(struct net_device *to, struct net_device *from); void dev_uc_unsync(struct net_device *to, struct net_device *from); void dev_uc_flush(struct net_device *dev); void dev_uc_init(struct net_device *dev); /** * __dev_uc_sync - Synchronize device's unicast list * @dev: device to sync * @sync: function to call if address should be added * @unsync: function to call if address should be removed * * Add newly added addresses to the interface, and release * addresses that have been deleted. */ static inline int __dev_uc_sync(struct net_device *dev, int (*sync)(struct net_device *, const unsigned char *), int (*unsync)(struct net_device *, const unsigned char *)) { return __hw_addr_sync_dev(&dev->uc, dev, sync, unsync); } /** * __dev_uc_unsync - Remove synchronized addresses from device * @dev: device to sync * @unsync: function to call if address should be removed * * Remove all addresses that were added to the device by dev_uc_sync(). */ static inline void __dev_uc_unsync(struct net_device *dev, int (*unsync)(struct net_device *, const unsigned char *)) { __hw_addr_unsync_dev(&dev->uc, dev, unsync); } /* Functions used for multicast addresses handling */ int dev_mc_add(struct net_device *dev, const unsigned char *addr); int dev_mc_add_global(struct net_device *dev, const unsigned char *addr); int dev_mc_add_excl(struct net_device *dev, const unsigned char *addr); int dev_mc_del(struct net_device *dev, const unsigned char *addr); int dev_mc_del_global(struct net_device *dev, const unsigned char *addr); int dev_mc_sync(struct net_device *to, struct net_device *from); int dev_mc_sync_multiple(struct net_device *to, struct net_device *from); void dev_mc_unsync(struct net_device *to, struct net_device *from); void dev_mc_flush(struct net_device *dev); void dev_mc_init(struct net_device *dev); /** * __dev_mc_sync - Synchronize device's multicast list * @dev: device to sync * @sync: function to call if address should be added * @unsync: function to call if address should be removed * * Add newly added addresses to the interface, and release * addresses that have been deleted. */ static inline int __dev_mc_sync(struct net_device *dev, int (*sync)(struct net_device *, const unsigned char *), int (*unsync)(struct net_device *, const unsigned char *)) { return __hw_addr_sync_dev(&dev->mc, dev, sync, unsync); } /** * __dev_mc_unsync - Remove synchronized addresses from device * @dev: device to sync * @unsync: function to call if address should be removed * * Remove all addresses that were added to the device by dev_mc_sync(). */ static inline void __dev_mc_unsync(struct net_device *dev, int (*unsync)(struct net_device *, const unsigned char *)) { __hw_addr_unsync_dev(&dev->mc, dev, unsync); } /* Functions used for secondary unicast and multicast support */ void dev_set_rx_mode(struct net_device *dev); int netif_set_promiscuity(struct net_device *dev, int inc); int dev_set_promiscuity(struct net_device *dev, int inc); int netif_set_allmulti(struct net_device *dev, int inc, bool notify); int dev_set_allmulti(struct net_device *dev, int inc); void netif_state_change(struct net_device *dev); void netdev_state_change(struct net_device *dev); void __netdev_notify_peers(struct net_device *dev); void netdev_notify_peers(struct net_device *dev); void netdev_features_change(struct net_device *dev); /* Load a device via the kmod */ void dev_load(struct net *net, const char *name); struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev, struct rtnl_link_stats64 *storage); void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64, const struct net_device_stats *netdev_stats); void dev_fetch_sw_netstats(struct rtnl_link_stats64 *s, const struct pcpu_sw_netstats __percpu *netstats); void dev_get_tstats64(struct net_device *dev, struct rtnl_link_stats64 *s); enum { NESTED_SYNC_IMM_BIT, NESTED_SYNC_TODO_BIT, }; #define __NESTED_SYNC_BIT(bit) ((u32)1 << (bit)) #define __NESTED_SYNC(name) __NESTED_SYNC_BIT(NESTED_SYNC_ ## name ## _BIT) #define NESTED_SYNC_IMM __NESTED_SYNC(IMM) #define NESTED_SYNC_TODO __NESTED_SYNC(TODO) struct netdev_nested_priv { unsigned char flags; void *data; }; bool netdev_has_upper_dev(struct net_device *dev, struct net_device *upper_dev); struct net_device *netdev_upper_get_next_dev_rcu(struct net_device *dev, struct list_head **iter); /* iterate through upper list, must be called under RCU read lock */ #define netdev_for_each_upper_dev_rcu(dev, updev, iter) \ for (iter = &(dev)->adj_list.upper, \ updev = netdev_upper_get_next_dev_rcu(dev, &(iter)); \ updev; \ updev = netdev_upper_get_next_dev_rcu(dev, &(iter))) int netdev_walk_all_upper_dev_rcu(struct net_device *dev, int (*fn)(struct net_device *upper_dev, struct netdev_nested_priv *priv), struct netdev_nested_priv *priv); bool netdev_has_upper_dev_all_rcu(struct net_device *dev, struct net_device *upper_dev); bool netdev_has_any_upper_dev(struct net_device *dev); void *netdev_lower_get_next_private(struct net_device *dev, struct list_head **iter); void *netdev_lower_get_next_private_rcu(struct net_device *dev, struct list_head **iter); #define netdev_for_each_lower_private(dev, priv, iter) \ for (iter = (dev)->adj_list.lower.next, \ priv = netdev_lower_get_next_private(dev, &(iter)); \ priv; \ priv = netdev_lower_get_next_private(dev, &(iter))) #define netdev_for_each_lower_private_rcu(dev, priv, iter) \ for (iter = &(dev)->adj_list.lower, \ priv = netdev_lower_get_next_private_rcu(dev, &(iter)); \ priv; \ priv = netdev_lower_get_next_private_rcu(dev, &(iter))) void *netdev_lower_get_next(struct net_device *dev, struct list_head **iter); #define netdev_for_each_lower_dev(dev, ldev, iter) \ for (iter = (dev)->adj_list.lower.next, \ ldev = netdev_lower_get_next(dev, &(iter)); \ ldev; \ ldev = netdev_lower_get_next(dev, &(iter))) struct net_device *netdev_next_lower_dev_rcu(struct net_device *dev, struct list_head **iter); int netdev_walk_all_lower_dev(struct net_device *dev, int (*fn)(struct net_device *lower_dev, struct netdev_nested_priv *priv), struct netdev_nested_priv *priv); int netdev_walk_all_lower_dev_rcu(struct net_device *dev, int (*fn)(struct net_device *lower_dev, struct netdev_nested_priv *priv), struct netdev_nested_priv *priv); void *netdev_adjacent_get_private(struct list_head *adj_list); void *netdev_lower_get_first_private_rcu(struct net_device *dev); struct net_device *netdev_master_upper_dev_get(struct net_device *dev); struct net_device *netdev_master_upper_dev_get_rcu(struct net_device *dev); int netdev_upper_dev_link(struct net_device *dev, struct net_device *upper_dev, struct netlink_ext_ack *extack); int netdev_master_upper_dev_link(struct net_device *dev, struct net_device *upper_dev, void *upper_priv, void *upper_info, struct netlink_ext_ack *extack); void netdev_upper_dev_unlink(struct net_device *dev, struct net_device *upper_dev); int netdev_adjacent_change_prepare(struct net_device *old_dev, struct net_device *new_dev, struct net_device *dev, struct netlink_ext_ack *extack); void netdev_adjacent_change_commit(struct net_device *old_dev, struct net_device *new_dev, struct net_device *dev); void netdev_adjacent_change_abort(struct net_device *old_dev, struct net_device *new_dev, struct net_device *dev); void netdev_adjacent_rename_links(struct net_device *dev, char *oldname); void *netdev_lower_dev_get_private(struct net_device *dev, struct net_device *lower_dev); void netdev_lower_state_changed(struct net_device *lower_dev, void *lower_state_info); /* RSS keys are 40 or 52 bytes long */ #define NETDEV_RSS_KEY_LEN 52 extern u8 netdev_rss_key[NETDEV_RSS_KEY_LEN] __read_mostly; void netdev_rss_key_fill(void *buffer, size_t len); int skb_checksum_help(struct sk_buff *skb); int skb_crc32c_csum_help(struct sk_buff *skb); int skb_csum_hwoffload_help(struct sk_buff *skb, const netdev_features_t features); struct netdev_bonding_info { ifslave slave; ifbond master; }; struct netdev_notifier_bonding_info { struct netdev_notifier_info info; /* must be first */ struct netdev_bonding_info bonding_info; }; void netdev_bonding_info_change(struct net_device *dev, struct netdev_bonding_info *bonding_info); #if IS_ENABLED(CONFIG_ETHTOOL_NETLINK) void ethtool_notify(struct net_device *dev, unsigned int cmd); #else static inline void ethtool_notify(struct net_device *dev, unsigned int cmd) { } #endif __be16 skb_network_protocol(struct sk_buff *skb, int *depth); static inline bool can_checksum_protocol(netdev_features_t features, __be16 protocol) { if (protocol == htons(ETH_P_FCOE)) return !!(features & NETIF_F_FCOE_CRC); /* Assume this is an IP checksum (not SCTP CRC) */ if (features & NETIF_F_HW_CSUM) { /* Can checksum everything */ return true; } switch (protocol) { case htons(ETH_P_IP): return !!(features & NETIF_F_IP_CSUM); case htons(ETH_P_IPV6): return !!(features & NETIF_F_IPV6_CSUM); default: return false; } } #ifdef CONFIG_BUG void netdev_rx_csum_fault(struct net_device *dev, struct sk_buff *skb); #else static inline void netdev_rx_csum_fault(struct net_device *dev, struct sk_buff *skb) { } #endif /* rx skb timestamps */ void net_enable_timestamp(void); void net_disable_timestamp(void); static inline ktime_t netdev_get_tstamp(struct net_device *dev, const struct skb_shared_hwtstamps *hwtstamps, bool cycles) { const struct net_device_ops *ops = dev->netdev_ops; if (ops->ndo_get_tstamp) return ops->ndo_get_tstamp(dev, hwtstamps, cycles); return hwtstamps->hwtstamp; } #ifndef CONFIG_PREEMPT_RT static inline void netdev_xmit_set_more(bool more) { __this_cpu_write(softnet_data.xmit.more, more); } static inline bool netdev_xmit_more(void) { return __this_cpu_read(softnet_data.xmit.more); } #else static inline void netdev_xmit_set_more(bool more) { current->net_xmit.more = more; } static inline bool netdev_xmit_more(void) { return current->net_xmit.more; } #endif static inline netdev_tx_t __netdev_start_xmit(const struct net_device_ops *ops, struct sk_buff *skb, struct net_device *dev, bool more) { netdev_xmit_set_more(more); return ops->ndo_start_xmit(skb, dev); } static inline netdev_tx_t netdev_start_xmit(struct sk_buff *skb, struct net_device *dev, struct netdev_queue *txq, bool more) { const struct net_device_ops *ops = dev->netdev_ops; netdev_tx_t rc; rc = __netdev_start_xmit(ops, skb, dev, more); if (rc == NETDEV_TX_OK) txq_trans_update(dev, txq); return rc; } int netdev_class_create_file_ns(const struct class_attribute *class_attr, const void *ns); void netdev_class_remove_file_ns(const struct class_attribute *class_attr, const void *ns); extern const struct kobj_ns_type_operations net_ns_type_operations; const char *netdev_drivername(const struct net_device *dev); static inline netdev_features_t netdev_intersect_features(netdev_features_t f1, netdev_features_t f2) { if ((f1 ^ f2) & NETIF_F_HW_CSUM) { if (f1 & NETIF_F_HW_CSUM) f1 |= (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM); else f2 |= (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM); } return f1 & f2; } static inline netdev_features_t netdev_get_wanted_features( struct net_device *dev) { return (dev->features & ~dev->hw_features) | dev->wanted_features; } netdev_features_t netdev_increment_features(netdev_features_t all, netdev_features_t one, netdev_features_t mask); /* Allow TSO being used on stacked device : * Performing the GSO segmentation before last device * is a performance improvement. */ static inline netdev_features_t netdev_add_tso_features(netdev_features_t features, netdev_features_t mask) { return netdev_increment_features(features, NETIF_F_ALL_TSO, mask); } int __netdev_update_features(struct net_device *dev); void netdev_update_features(struct net_device *dev); void netdev_change_features(struct net_device *dev); void netif_stacked_transfer_operstate(const struct net_device *rootdev, struct net_device *dev); netdev_features_t passthru_features_check(struct sk_buff *skb, struct net_device *dev, netdev_features_t features); netdev_features_t netif_skb_features(struct sk_buff *skb); void skb_warn_bad_offload(const struct sk_buff *skb); static inline bool net_gso_ok(netdev_features_t features, int gso_type) { netdev_features_t feature; if (gso_type & (SKB_GSO_TCP_FIXEDID | SKB_GSO_TCP_FIXEDID_INNER)) gso_type |= __SKB_GSO_TCP_FIXEDID; feature = ((netdev_features_t)gso_type << NETIF_F_GSO_SHIFT) & NETIF_F_GSO_MASK; /* check flags correspondence */ BUILD_BUG_ON(SKB_GSO_TCPV4 != (NETIF_F_TSO >> NETIF_F_GSO_SHIFT)); BUILD_BUG_ON(SKB_GSO_DODGY != (NETIF_F_GSO_ROBUST >> NETIF_F_GSO_SHIFT)); BUILD_BUG_ON(SKB_GSO_TCP_ECN != (NETIF_F_TSO_ECN >> NETIF_F_GSO_SHIFT)); BUILD_BUG_ON(__SKB_GSO_TCP_FIXEDID != (NETIF_F_TSO_MANGLEID >> NETIF_F_GSO_SHIFT)); BUILD_BUG_ON(SKB_GSO_TCPV6 != (NETIF_F_TSO6 >> NETIF_F_GSO_SHIFT)); BUILD_BUG_ON(SKB_GSO_FCOE != (NETIF_F_FSO >> NETIF_F_GSO_SHIFT)); BUILD_BUG_ON(SKB_GSO_GRE != (NETIF_F_GSO_GRE >> NETIF_F_GSO_SHIFT)); BUILD_BUG_ON(SKB_GSO_GRE_CSUM != (NETIF_F_GSO_GRE_CSUM >> NETIF_F_GSO_SHIFT)); BUILD_BUG_ON(SKB_GSO_IPXIP4 != (NETIF_F_GSO_IPXIP4 >> NETIF_F_GSO_SHIFT)); BUILD_BUG_ON(SKB_GSO_IPXIP6 != (NETIF_F_GSO_IPXIP6 >> NETIF_F_GSO_SHIFT)); BUILD_BUG_ON(SKB_GSO_UDP_TUNNEL != (NETIF_F_GSO_UDP_TUNNEL >> NETIF_F_GSO_SHIFT)); BUILD_BUG_ON(SKB_GSO_UDP_TUNNEL_CSUM != (NETIF_F_GSO_UDP_TUNNEL_CSUM >> NETIF_F_GSO_SHIFT)); BUILD_BUG_ON(SKB_GSO_PARTIAL != (NETIF_F_GSO_PARTIAL >> NETIF_F_GSO_SHIFT)); BUILD_BUG_ON(SKB_GSO_TUNNEL_REMCSUM != (NETIF_F_GSO_TUNNEL_REMCSUM >> NETIF_F_GSO_SHIFT)); BUILD_BUG_ON(SKB_GSO_SCTP != (NETIF_F_GSO_SCTP >> NETIF_F_GSO_SHIFT)); BUILD_BUG_ON(SKB_GSO_ESP != (NETIF_F_GSO_ESP >> NETIF_F_GSO_SHIFT)); BUILD_BUG_ON(SKB_GSO_UDP != (NETIF_F_GSO_UDP >> NETIF_F_GSO_SHIFT)); BUILD_BUG_ON(SKB_GSO_UDP_L4 != (NETIF_F_GSO_UDP_L4 >> NETIF_F_GSO_SHIFT)); BUILD_BUG_ON(SKB_GSO_FRAGLIST != (NETIF_F_GSO_FRAGLIST >> NETIF_F_GSO_SHIFT)); BUILD_BUG_ON(SKB_GSO_TCP_ACCECN != (NETIF_F_GSO_ACCECN >> NETIF_F_GSO_SHIFT)); return (features & feature) == feature; } static inline bool skb_gso_ok(struct sk_buff *skb, netdev_features_t features) { return net_gso_ok(features, skb_shinfo(skb)->gso_type) && (!skb_has_frag_list(skb) || (features & NETIF_F_FRAGLIST)); } static inline bool netif_needs_gso(struct sk_buff *skb, netdev_features_t features) { return skb_is_gso(skb) && (!skb_gso_ok(skb, features) || unlikely((skb->ip_summed != CHECKSUM_PARTIAL) && (skb->ip_summed != CHECKSUM_UNNECESSARY))); } void netif_set_tso_max_size(struct net_device *dev, unsigned int size); void netif_set_tso_max_segs(struct net_device *dev, unsigned int segs); void netif_inherit_tso_max(struct net_device *to, const struct net_device *from); static inline unsigned int netif_get_gro_max_size(const struct net_device *dev, const struct sk_buff *skb) { /* pairs with WRITE_ONCE() in netif_set_gro(_ipv4)_max_size() */ return skb->protocol == htons(ETH_P_IPV6) ? READ_ONCE(dev->gro_max_size) : READ_ONCE(dev->gro_ipv4_max_size); } static inline unsigned int netif_get_gso_max_size(const struct net_device *dev, const struct sk_buff *skb) { /* pairs with WRITE_ONCE() in netif_set_gso(_ipv4)_max_size() */ return skb->protocol == htons(ETH_P_IPV6) ? READ_ONCE(dev->gso_max_size) : READ_ONCE(dev->gso_ipv4_max_size); } static inline bool netif_is_macsec(const struct net_device *dev) { return dev->priv_flags & IFF_MACSEC; } static inline bool netif_is_macvlan(const struct net_device *dev) { return dev->priv_flags & IFF_MACVLAN; } static inline bool netif_is_macvlan_port(const struct net_device *dev) { return dev->priv_flags & IFF_MACVLAN_PORT; } static inline bool netif_is_bond_master(const struct net_device *dev) { return dev->flags & IFF_MASTER && dev->priv_flags & IFF_BONDING; } static inline bool netif_is_bond_slave(const struct net_device *dev) { return dev->flags & IFF_SLAVE && dev->priv_flags & IFF_BONDING; } static inline bool netif_supports_nofcs(struct net_device *dev) { return dev->priv_flags & IFF_SUPP_NOFCS; } static inline bool netif_has_l3_rx_handler(const struct net_device *dev) { return dev->priv_flags & IFF_L3MDEV_RX_HANDLER; } static inline bool netif_is_l3_master(const struct net_device *dev) { return dev->priv_flags & IFF_L3MDEV_MASTER; } static inline bool netif_is_l3_slave(const struct net_device *dev) { return dev->priv_flags & IFF_L3MDEV_SLAVE; } static inline int dev_sdif(const struct net_device *dev) { #ifdef CONFIG_NET_L3_MASTER_DEV if (netif_is_l3_slave(dev)) return dev->ifindex; #endif return 0; } static inline bool netif_is_bridge_master(const struct net_device *dev) { return dev->priv_flags & IFF_EBRIDGE; } static inline bool netif_is_bridge_port(const struct net_device *dev) { return dev->priv_flags & IFF_BRIDGE_PORT; } static inline bool netif_is_ovs_master(const struct net_device *dev) { return dev->priv_flags & IFF_OPENVSWITCH; } static inline bool netif_is_ovs_port(const struct net_device *dev) { return dev->priv_flags & IFF_OVS_DATAPATH; } static inline bool netif_is_any_bridge_master(const struct net_device *dev) { return netif_is_bridge_master(dev) || netif_is_ovs_master(dev); } static inline bool netif_is_any_bridge_port(const struct net_device *dev) { return netif_is_bridge_port(dev) || netif_is_ovs_port(dev); } static inline bool netif_is_team_master(const struct net_device *dev) { return dev->priv_flags & IFF_TEAM; } static inline bool netif_is_team_port(const struct net_device *dev) { return dev->priv_flags & IFF_TEAM_PORT; } static inline bool netif_is_lag_master(const struct net_device *dev) { return netif_is_bond_master(dev) || netif_is_team_master(dev); } static inline bool netif_is_lag_port(const struct net_device *dev) { return netif_is_bond_slave(dev) || netif_is_team_port(dev); } static inline bool netif_is_rxfh_configured(const struct net_device *dev) { return dev->priv_flags & IFF_RXFH_CONFIGURED; } static inline bool netif_is_failover(const struct net_device *dev) { return dev->priv_flags & IFF_FAILOVER; } static inline bool netif_is_failover_slave(const struct net_device *dev) { return dev->priv_flags & IFF_FAILOVER_SLAVE; } /* This device needs to keep skb dst for qdisc enqueue or ndo_start_xmit() */ static inline void netif_keep_dst(struct net_device *dev) { dev->priv_flags &= ~(IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM); } /* return true if dev can't cope with mtu frames that need vlan tag insertion */ static inline bool netif_reduces_vlan_mtu(struct net_device *dev) { /* TODO: reserve and use an additional IFF bit, if we get more users */ return netif_is_macsec(dev); } extern struct pernet_operations __net_initdata loopback_net_ops; /* Logging, debugging and troubleshooting/diagnostic helpers. */ /* netdev_printk helpers, similar to dev_printk */ static inline const char *netdev_name(const struct net_device *dev) { if (!dev->name[0] || strchr(dev->name, '%')) return "(unnamed net_device)"; return dev->name; } static inline const char *netdev_reg_state(const struct net_device *dev) { u8 reg_state = READ_ONCE(dev->reg_state); switch (reg_state) { case NETREG_UNINITIALIZED: return " (uninitialized)"; case NETREG_REGISTERED: return ""; case NETREG_UNREGISTERING: return " (unregistering)"; case NETREG_UNREGISTERED: return " (unregistered)"; case NETREG_RELEASED: return " (released)"; case NETREG_DUMMY: return " (dummy)"; } WARN_ONCE(1, "%s: unknown reg_state %d\n", dev->name, reg_state); return " (unknown)"; } #define MODULE_ALIAS_NETDEV(device) \ MODULE_ALIAS("netdev-" device) /* * netdev_WARN() acts like dev_printk(), but with the key difference * of using a WARN/WARN_ON to get the message out, including the * file/line information and a backtrace. */ #define netdev_WARN(dev, format, args...) \ WARN(1, "netdevice: %s%s: " format, netdev_name(dev), \ netdev_reg_state(dev), ##args) #define netdev_WARN_ONCE(dev, format, args...) \ WARN_ONCE(1, "netdevice: %s%s: " format, netdev_name(dev), \ netdev_reg_state(dev), ##args) /* * The list of packet types we will receive (as opposed to discard) * and the routines to invoke. * * Why 16. Because with 16 the only overlap we get on a hash of the * low nibble of the protocol value is RARP/SNAP/X.25. * * 0800 IP * 0001 802.3 * 0002 AX.25 * 0004 802.2 * 8035 RARP * 0005 SNAP * 0805 X.25 * 0806 ARP * 8137 IPX * 0009 Localtalk * 86DD IPv6 */ #define PTYPE_HASH_SIZE (16) #define PTYPE_HASH_MASK (PTYPE_HASH_SIZE - 1) extern struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly; extern struct net_device *blackhole_netdev; /* Note: Avoid these macros in fast path, prefer per-cpu or per-queue counters. */ #define DEV_STATS_INC(DEV, FIELD) atomic_long_inc(&(DEV)->stats.__##FIELD) #define DEV_STATS_ADD(DEV, FIELD, VAL) \ atomic_long_add((VAL), &(DEV)->stats.__##FIELD) #define DEV_STATS_READ(DEV, FIELD) atomic_long_read(&(DEV)->stats.__##FIELD) #endif /* _LINUX_NETDEVICE_H */
19 18 1 13 5 11 9 2 8 2 2 11 9 2 11 11 10 10 10 10 7 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 // SPDX-License-Identifier: GPL-2.0-only #include "netlink.h" #include "common.h" struct pause_req_info { struct ethnl_req_info base; enum ethtool_mac_stats_src src; }; #define PAUSE_REQINFO(__req_base) \ container_of(__req_base, struct pause_req_info, base) struct pause_reply_data { struct ethnl_reply_data base; struct ethtool_pauseparam pauseparam; struct ethtool_pause_stats pausestat; }; #define PAUSE_REPDATA(__reply_base) \ container_of(__reply_base, struct pause_reply_data, base) const struct nla_policy ethnl_pause_get_policy[] = { [ETHTOOL_A_PAUSE_HEADER] = NLA_POLICY_NESTED(ethnl_header_policy_stats), [ETHTOOL_A_PAUSE_STATS_SRC] = NLA_POLICY_MAX(NLA_U32, ETHTOOL_MAC_STATS_SRC_PMAC), }; static int pause_parse_request(struct ethnl_req_info *req_base, struct nlattr **tb, struct netlink_ext_ack *extack) { enum ethtool_mac_stats_src src = ETHTOOL_MAC_STATS_SRC_AGGREGATE; struct pause_req_info *req_info = PAUSE_REQINFO(req_base); if (tb[ETHTOOL_A_PAUSE_STATS_SRC]) { if (!(req_base->flags & ETHTOOL_FLAG_STATS)) { NL_SET_ERR_MSG_MOD(extack, "ETHTOOL_FLAG_STATS must be set when requesting a source of stats"); return -EINVAL; } src = nla_get_u32(tb[ETHTOOL_A_PAUSE_STATS_SRC]); } req_info->src = src; return 0; } static int pause_prepare_data(const struct ethnl_req_info *req_base, struct ethnl_reply_data *reply_base, const struct genl_info *info) { const struct pause_req_info *req_info = PAUSE_REQINFO(req_base); struct pause_reply_data *data = PAUSE_REPDATA(reply_base); enum ethtool_mac_stats_src src = req_info->src; struct net_device *dev = reply_base->dev; int ret; if (!dev->ethtool_ops->get_pauseparam) return -EOPNOTSUPP; ethtool_stats_init((u64 *)&data->pausestat, sizeof(data->pausestat) / 8); data->pausestat.src = src; ret = ethnl_ops_begin(dev); if (ret < 0) return ret; if ((src == ETHTOOL_MAC_STATS_SRC_EMAC || src == ETHTOOL_MAC_STATS_SRC_PMAC) && !__ethtool_dev_mm_supported(dev)) { NL_SET_ERR_MSG_MOD(info->extack, "Device does not support MAC merge layer"); ethnl_ops_complete(dev); return -EOPNOTSUPP; } dev->ethtool_ops->get_pauseparam(dev, &data->pauseparam); if (req_base->flags & ETHTOOL_FLAG_STATS && dev->ethtool_ops->get_pause_stats) dev->ethtool_ops->get_pause_stats(dev, &data->pausestat); ethnl_ops_complete(dev); return 0; } static int pause_reply_size(const struct ethnl_req_info *req_base, const struct ethnl_reply_data *reply_base) { int n = nla_total_size(sizeof(u8)) + /* _PAUSE_AUTONEG */ nla_total_size(sizeof(u8)) + /* _PAUSE_RX */ nla_total_size(sizeof(u8)); /* _PAUSE_TX */ if (req_base->flags & ETHTOOL_FLAG_STATS) n += nla_total_size(0) + /* _PAUSE_STATS */ nla_total_size(sizeof(u32)) + /* _PAUSE_STATS_SRC */ nla_total_size_64bit(sizeof(u64)) * ETHTOOL_PAUSE_STAT_CNT; return n; } static int ethtool_put_stat(struct sk_buff *skb, u64 val, u16 attrtype, u16 padtype) { if (val == ETHTOOL_STAT_NOT_SET) return 0; if (nla_put_u64_64bit(skb, attrtype, val, padtype)) return -EMSGSIZE; return 0; } static int pause_put_stats(struct sk_buff *skb, const struct ethtool_pause_stats *pause_stats) { const u16 pad = ETHTOOL_A_PAUSE_STAT_PAD; struct nlattr *nest; if (nla_put_u32(skb, ETHTOOL_A_PAUSE_STATS_SRC, pause_stats->src)) return -EMSGSIZE; nest = nla_nest_start(skb, ETHTOOL_A_PAUSE_STATS); if (!nest) return -EMSGSIZE; if (ethtool_put_stat(skb, pause_stats->tx_pause_frames, ETHTOOL_A_PAUSE_STAT_TX_FRAMES, pad) || ethtool_put_stat(skb, pause_stats->rx_pause_frames, ETHTOOL_A_PAUSE_STAT_RX_FRAMES, pad)) goto err_cancel; nla_nest_end(skb, nest); return 0; err_cancel: nla_nest_cancel(skb, nest); return -EMSGSIZE; } static int pause_fill_reply(struct sk_buff *skb, const struct ethnl_req_info *req_base, const struct ethnl_reply_data *reply_base) { const struct pause_reply_data *data = PAUSE_REPDATA(reply_base); const struct ethtool_pauseparam *pauseparam = &data->pauseparam; if (nla_put_u8(skb, ETHTOOL_A_PAUSE_AUTONEG, !!pauseparam->autoneg) || nla_put_u8(skb, ETHTOOL_A_PAUSE_RX, !!pauseparam->rx_pause) || nla_put_u8(skb, ETHTOOL_A_PAUSE_TX, !!pauseparam->tx_pause)) return -EMSGSIZE; if (req_base->flags & ETHTOOL_FLAG_STATS && pause_put_stats(skb, &data->pausestat)) return -EMSGSIZE; return 0; } /* PAUSE_SET */ const struct nla_policy ethnl_pause_set_policy[] = { [ETHTOOL_A_PAUSE_HEADER] = NLA_POLICY_NESTED(ethnl_header_policy), [ETHTOOL_A_PAUSE_AUTONEG] = { .type = NLA_U8 }, [ETHTOOL_A_PAUSE_RX] = { .type = NLA_U8 }, [ETHTOOL_A_PAUSE_TX] = { .type = NLA_U8 }, [ETHTOOL_A_PAUSE_STATS_SRC] = { .type = NLA_REJECT }, }; static int ethnl_set_pause_validate(struct ethnl_req_info *req_info, struct genl_info *info) { const struct ethtool_ops *ops = req_info->dev->ethtool_ops; return ops->get_pauseparam && ops->set_pauseparam ? 1 : -EOPNOTSUPP; } static int ethnl_set_pause(struct ethnl_req_info *req_info, struct genl_info *info) { struct net_device *dev = req_info->dev; struct ethtool_pauseparam params = {}; struct nlattr **tb = info->attrs; bool mod = false; int ret; dev->ethtool_ops->get_pauseparam(dev, &params); ethnl_update_bool32(&params.autoneg, tb[ETHTOOL_A_PAUSE_AUTONEG], &mod); ethnl_update_bool32(&params.rx_pause, tb[ETHTOOL_A_PAUSE_RX], &mod); ethnl_update_bool32(&params.tx_pause, tb[ETHTOOL_A_PAUSE_TX], &mod); if (!mod) return 0; ret = dev->ethtool_ops->set_pauseparam(dev, &params); return ret < 0 ? ret : 1; } const struct ethnl_request_ops ethnl_pause_request_ops = { .request_cmd = ETHTOOL_MSG_PAUSE_GET, .reply_cmd = ETHTOOL_MSG_PAUSE_GET_REPLY, .hdr_attr = ETHTOOL_A_PAUSE_HEADER, .req_info_size = sizeof(struct pause_req_info), .reply_data_size = sizeof(struct pause_reply_data), .parse_request = pause_parse_request, .prepare_data = pause_prepare_data, .reply_size = pause_reply_size, .fill_reply = pause_fill_reply, .set_validate = ethnl_set_pause_validate, .set = ethnl_set_pause, .set_ntf_cmd = ETHTOOL_MSG_PAUSE_NTF, };
36 13 2 239 76 12 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_GENERIC_HUGETLB_H #define _ASM_GENERIC_HUGETLB_H #include <linux/swap.h> #include <linux/swapops.h> static inline unsigned long huge_pte_write(pte_t pte) { return pte_write(pte); } static inline unsigned long huge_pte_dirty(pte_t pte) { return pte_dirty(pte); } static inline pte_t huge_pte_mkwrite(pte_t pte) { return pte_mkwrite_novma(pte); } #ifndef __HAVE_ARCH_HUGE_PTE_WRPROTECT static inline pte_t huge_pte_wrprotect(pte_t pte) { return pte_wrprotect(pte); } #endif static inline pte_t huge_pte_mkdirty(pte_t pte) { return pte_mkdirty(pte); } static inline pte_t huge_pte_modify(pte_t pte, pgprot_t newprot) { return pte_modify(pte, newprot); } #ifndef __HAVE_ARCH_HUGE_PTE_MKUFFD_WP static inline pte_t huge_pte_mkuffd_wp(pte_t pte) { return huge_pte_wrprotect(pte_mkuffd_wp(pte)); } #endif #ifndef __HAVE_ARCH_HUGE_PTE_CLEAR_UFFD_WP static inline pte_t huge_pte_clear_uffd_wp(pte_t pte) { return pte_clear_uffd_wp(pte); } #endif #ifndef __HAVE_ARCH_HUGE_PTE_UFFD_WP static inline int huge_pte_uffd_wp(pte_t pte) { return pte_uffd_wp(pte); } #endif #ifndef __HAVE_ARCH_HUGE_PTE_CLEAR static inline void huge_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep, unsigned long sz) { pte_clear(mm, addr, ptep); } #endif #ifndef __HAVE_ARCH_HUGE_SET_HUGE_PTE_AT static inline void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte, unsigned long sz) { set_pte_at(mm, addr, ptep, pte); } #endif #ifndef __HAVE_ARCH_HUGE_PTEP_GET_AND_CLEAR static inline pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep, unsigned long sz) { return ptep_get_and_clear(mm, addr, ptep); } #endif #ifndef __HAVE_ARCH_HUGE_PTEP_CLEAR_FLUSH static inline pte_t huge_ptep_clear_flush(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) { return ptep_clear_flush(vma, addr, ptep); } #endif #ifndef __HAVE_ARCH_HUGE_PTE_NONE static inline int huge_pte_none(pte_t pte) { return pte_none(pte); } #endif /* Please refer to comments above pte_none_mostly() for the usage */ #ifndef __HAVE_ARCH_HUGE_PTE_NONE_MOSTLY static inline int huge_pte_none_mostly(pte_t pte) { return huge_pte_none(pte) || is_pte_marker(pte); } #endif #ifndef __HAVE_ARCH_HUGE_PTEP_SET_WRPROTECT static inline void huge_ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { ptep_set_wrprotect(mm, addr, ptep); } #endif #ifndef __HAVE_ARCH_HUGE_PTEP_SET_ACCESS_FLAGS static inline int huge_ptep_set_access_flags(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, pte_t pte, int dirty) { return ptep_set_access_flags(vma, addr, ptep, pte, dirty); } #endif #ifndef __HAVE_ARCH_HUGE_PTEP_GET static inline pte_t huge_ptep_get(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { return ptep_get(ptep); } #endif #ifndef __HAVE_ARCH_GIGANTIC_PAGE_RUNTIME_SUPPORTED static inline bool gigantic_page_runtime_supported(void) { return IS_ENABLED(CONFIG_ARCH_HAS_GIGANTIC_PAGE); } #endif /* __HAVE_ARCH_GIGANTIC_PAGE_RUNTIME_SUPPORTED */ #endif /* _ASM_GENERIC_HUGETLB_H */
95 95 12 12 9 1 8 353 352 353 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 // SPDX-License-Identifier: GPL-2.0-only /* * 32bit compatibility wrappers for the input subsystem. * * Very heavily based on evdev.c - Copyright (c) 1999-2002 Vojtech Pavlik */ #include <linux/export.h> #include <linux/sprintf.h> #include <linux/uaccess.h> #include "input-compat.h" #ifdef CONFIG_COMPAT int input_event_from_user(const char __user *buffer, struct input_event *event) { if (in_compat_syscall() && !COMPAT_USE_64BIT_TIME) { struct input_event_compat compat_event; if (copy_from_user(&compat_event, buffer, sizeof(struct input_event_compat))) return -EFAULT; event->input_event_sec = compat_event.sec; event->input_event_usec = compat_event.usec; event->type = compat_event.type; event->code = compat_event.code; event->value = compat_event.value; } else { if (copy_from_user(event, buffer, sizeof(struct input_event))) return -EFAULT; } return 0; } int input_event_to_user(char __user *buffer, const struct input_event *event) { if (in_compat_syscall() && !COMPAT_USE_64BIT_TIME) { struct input_event_compat compat_event; compat_event.sec = event->input_event_sec; compat_event.usec = event->input_event_usec; compat_event.type = event->type; compat_event.code = event->code; compat_event.value = event->value; if (copy_to_user(buffer, &compat_event, sizeof(struct input_event_compat))) return -EFAULT; } else { if (copy_to_user(buffer, event, sizeof(struct input_event))) return -EFAULT; } return 0; } int input_ff_effect_from_user(const char __user *buffer, size_t size, struct ff_effect *effect) { if (in_compat_syscall()) { struct ff_effect_compat *compat_effect; if (size != sizeof(struct ff_effect_compat)) return -EINVAL; /* * It so happens that the pointer which needs to be changed * is the last field in the structure, so we can retrieve the * whole thing and replace just the pointer. */ compat_effect = (struct ff_effect_compat *)effect; if (copy_from_user(compat_effect, buffer, sizeof(struct ff_effect_compat))) return -EFAULT; if (compat_effect->type == FF_PERIODIC && compat_effect->u.periodic.waveform == FF_CUSTOM) effect->u.periodic.custom_data = compat_ptr(compat_effect->u.periodic.custom_data); } else { if (size != sizeof(struct ff_effect)) return -EINVAL; if (copy_from_user(effect, buffer, sizeof(struct ff_effect))) return -EFAULT; } return 0; } int input_bits_to_string(char *buf, int buf_size, unsigned long bits, bool skip_empty) { int len = 0; if (in_compat_syscall()) { u32 dword = bits >> 32; if (dword || !skip_empty) len += snprintf(buf, buf_size, "%x ", dword); dword = bits & 0xffffffffUL; if (dword || !skip_empty || len) len += snprintf(buf + len, max(buf_size - len, 0), "%x", dword); } else { if (bits || !skip_empty) len += snprintf(buf, buf_size, "%lx", bits); } return len; } #else int input_event_from_user(const char __user *buffer, struct input_event *event) { if (copy_from_user(event, buffer, sizeof(struct input_event))) return -EFAULT; return 0; } int input_event_to_user(char __user *buffer, const struct input_event *event) { if (copy_to_user(buffer, event, sizeof(struct input_event))) return -EFAULT; return 0; } int input_ff_effect_from_user(const char __user *buffer, size_t size, struct ff_effect *effect) { if (size != sizeof(struct ff_effect)) return -EINVAL; if (copy_from_user(effect, buffer, sizeof(struct ff_effect))) return -EFAULT; return 0; } int input_bits_to_string(char *buf, int buf_size, unsigned long bits, bool skip_empty) { return bits || !skip_empty ? snprintf(buf, buf_size, "%lx", bits) : 0; } #endif /* CONFIG_COMPAT */ EXPORT_SYMBOL_GPL(input_event_from_user); EXPORT_SYMBOL_GPL(input_event_to_user); EXPORT_SYMBOL_GPL(input_ff_effect_from_user);
80 4 4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) ST-Ericsson AB 2010 * Author: Sjur Brendeland */ #define pr_fmt(fmt) KBUILD_MODNAME ":%s(): " fmt, __func__ #include <linux/kernel.h> #include <linux/types.h> #include <linux/errno.h> #include <linux/slab.h> #include <linux/module.h> #include <linux/pkt_sched.h> #include <net/caif/caif_layer.h> #include <net/caif/cfsrvl.h> #include <net/caif/cfpkt.h> #include <net/caif/caif_dev.h> #define SRVL_CTRL_PKT_SIZE 1 #define SRVL_FLOW_OFF 0x81 #define SRVL_FLOW_ON 0x80 #define SRVL_SET_PIN 0x82 #define container_obj(layr) container_of(layr, struct cfsrvl, layer) static void cfservl_ctrlcmd(struct cflayer *layr, enum caif_ctrlcmd ctrl, int phyid) { struct cfsrvl *service = container_obj(layr); if (layr->up == NULL || layr->up->ctrlcmd == NULL) return; switch (ctrl) { case CAIF_CTRLCMD_INIT_RSP: service->open = true; layr->up->ctrlcmd(layr->up, ctrl, phyid); break; case CAIF_CTRLCMD_DEINIT_RSP: case CAIF_CTRLCMD_INIT_FAIL_RSP: service->open = false; layr->up->ctrlcmd(layr->up, ctrl, phyid); break; case _CAIF_CTRLCMD_PHYIF_FLOW_OFF_IND: if (phyid != service->dev_info.id) break; if (service->modem_flow_on) layr->up->ctrlcmd(layr->up, CAIF_CTRLCMD_FLOW_OFF_IND, phyid); service->phy_flow_on = false; break; case _CAIF_CTRLCMD_PHYIF_FLOW_ON_IND: if (phyid != service->dev_info.id) return; if (service->modem_flow_on) { layr->up->ctrlcmd(layr->up, CAIF_CTRLCMD_FLOW_ON_IND, phyid); } service->phy_flow_on = true; break; case CAIF_CTRLCMD_FLOW_OFF_IND: if (service->phy_flow_on) { layr->up->ctrlcmd(layr->up, CAIF_CTRLCMD_FLOW_OFF_IND, phyid); } service->modem_flow_on = false; break; case CAIF_CTRLCMD_FLOW_ON_IND: if (service->phy_flow_on) { layr->up->ctrlcmd(layr->up, CAIF_CTRLCMD_FLOW_ON_IND, phyid); } service->modem_flow_on = true; break; case _CAIF_CTRLCMD_PHYIF_DOWN_IND: /* In case interface is down, let's fake a remove shutdown */ layr->up->ctrlcmd(layr->up, CAIF_CTRLCMD_REMOTE_SHUTDOWN_IND, phyid); break; case CAIF_CTRLCMD_REMOTE_SHUTDOWN_IND: layr->up->ctrlcmd(layr->up, ctrl, phyid); break; default: pr_warn("Unexpected ctrl in cfsrvl (%d)\n", ctrl); /* We have both modem and phy flow on, send flow on */ layr->up->ctrlcmd(layr->up, ctrl, phyid); service->phy_flow_on = true; break; } } static int cfservl_modemcmd(struct cflayer *layr, enum caif_modemcmd ctrl) { struct cfsrvl *service = container_obj(layr); caif_assert(layr != NULL); caif_assert(layr->dn != NULL); caif_assert(layr->dn->transmit != NULL); if (!service->supports_flowctrl) return 0; switch (ctrl) { case CAIF_MODEMCMD_FLOW_ON_REQ: { struct cfpkt *pkt; struct caif_payload_info *info; u8 flow_on = SRVL_FLOW_ON; pkt = cfpkt_create(SRVL_CTRL_PKT_SIZE); if (!pkt) return -ENOMEM; if (cfpkt_add_head(pkt, &flow_on, 1) < 0) { pr_err("Packet is erroneous!\n"); cfpkt_destroy(pkt); return -EPROTO; } info = cfpkt_info(pkt); info->channel_id = service->layer.id; info->hdr_len = 1; info->dev_info = &service->dev_info; cfpkt_set_prio(pkt, TC_PRIO_CONTROL); return layr->dn->transmit(layr->dn, pkt); } case CAIF_MODEMCMD_FLOW_OFF_REQ: { struct cfpkt *pkt; struct caif_payload_info *info; u8 flow_off = SRVL_FLOW_OFF; pkt = cfpkt_create(SRVL_CTRL_PKT_SIZE); if (!pkt) return -ENOMEM; if (cfpkt_add_head(pkt, &flow_off, 1) < 0) { pr_err("Packet is erroneous!\n"); cfpkt_destroy(pkt); return -EPROTO; } info = cfpkt_info(pkt); info->channel_id = service->layer.id; info->hdr_len = 1; info->dev_info = &service->dev_info; cfpkt_set_prio(pkt, TC_PRIO_CONTROL); return layr->dn->transmit(layr->dn, pkt); } default: break; } return -EINVAL; } static void cfsrvl_release(struct cflayer *layer) { struct cfsrvl *service = container_of(layer, struct cfsrvl, layer); kfree(service); } void cfsrvl_init(struct cfsrvl *service, u8 channel_id, struct dev_info *dev_info, bool supports_flowctrl) { caif_assert(offsetof(struct cfsrvl, layer) == 0); service->open = false; service->modem_flow_on = true; service->phy_flow_on = true; service->layer.id = channel_id; service->layer.ctrlcmd = cfservl_ctrlcmd; service->layer.modemcmd = cfservl_modemcmd; service->dev_info = *dev_info; service->supports_flowctrl = supports_flowctrl; service->release = cfsrvl_release; } bool cfsrvl_ready(struct cfsrvl *service, int *err) { if (!service->open) { *err = -ENOTCONN; return false; } return true; } bool cfsrvl_phyid_match(struct cflayer *layer, int phyid) { struct cfsrvl *servl = container_obj(layer); return servl->dev_info.id == phyid; } void caif_free_client(struct cflayer *adap_layer) { struct cfsrvl *servl; if (adap_layer == NULL || adap_layer->dn == NULL) return; servl = container_obj(adap_layer->dn); servl->release(&servl->layer); } EXPORT_SYMBOL(caif_free_client); void caif_client_register_refcnt(struct cflayer *adapt_layer, void (*hold)(struct cflayer *lyr), void (*put)(struct cflayer *lyr)) { struct cfsrvl *service; if (WARN_ON(adapt_layer == NULL || adapt_layer->dn == NULL)) return; service = container_of(adapt_layer->dn, struct cfsrvl, layer); service->hold = hold; service->put = put; } EXPORT_SYMBOL(caif_client_register_refcnt);
2387 2480 2478 2479 2476 3 2480 2479 977 8 978 15 4 965 1 2 975 2465 2 3 2 4 2470 1 1 2 1 1 2475 3 1 2476 1 1 5 4 5 5 2470 4 2 2461 2 2466 1642 1646 2463 2479 51 4 2429 1 1682 11 11 1 1907 2403 48 48 6 2429 46 1645 2477 2478 2427 4 1645 29 12 1 2389 2387 2 2388 7 1 6 2 1 1 1 2477 9 1 1890 963 6 2 2393 7 2411 1 12 2468 2478 2387 91 2480 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 // SPDX-License-Identifier: GPL-2.0-or-later /* * x86 instruction analysis * * Copyright (C) IBM Corporation, 2002, 2004, 2009 */ #include <linux/kernel.h> #ifdef __KERNEL__ #include <linux/string.h> #else #include <string.h> #endif #include <asm/inat.h> /*__ignore_sync_check__ */ #include <asm/insn.h> /* __ignore_sync_check__ */ #include <linux/unaligned.h> /* __ignore_sync_check__ */ #include <linux/errno.h> #include <linux/kconfig.h> #include <asm/emulate_prefix.h> /* __ignore_sync_check__ */ #define leXX_to_cpu(t, r) \ ({ \ __typeof__(t) v; \ switch (sizeof(t)) { \ case 4: v = le32_to_cpu(r); break; \ case 2: v = le16_to_cpu(r); break; \ case 1: v = r; break; \ default: \ BUILD_BUG(); break; \ } \ v; \ }) /* Verify next sizeof(t) bytes can be on the same instruction */ #define validate_next(t, insn, n) \ ((insn)->next_byte + sizeof(t) + n <= (insn)->end_kaddr) #define __get_next(t, insn) \ ({ t r = get_unaligned((t *)(insn)->next_byte); (insn)->next_byte += sizeof(t); leXX_to_cpu(t, r); }) #define __peek_nbyte_next(t, insn, n) \ ({ t r = get_unaligned((t *)(insn)->next_byte + n); leXX_to_cpu(t, r); }) #define get_next(t, insn) \ ({ if (unlikely(!validate_next(t, insn, 0))) goto err_out; __get_next(t, insn); }) #define peek_nbyte_next(t, insn, n) \ ({ if (unlikely(!validate_next(t, insn, n))) goto err_out; __peek_nbyte_next(t, insn, n); }) #define peek_next(t, insn) peek_nbyte_next(t, insn, 0) /** * insn_init() - initialize struct insn * @insn: &struct insn to be initialized * @kaddr: address (in kernel memory) of instruction (or copy thereof) * @buf_len: length of the insn buffer at @kaddr * @x86_64: !0 for 64-bit kernel or 64-bit app */ void insn_init(struct insn *insn, const void *kaddr, int buf_len, int x86_64) { /* * Instructions longer than MAX_INSN_SIZE (15 bytes) are invalid * even if the input buffer is long enough to hold them. */ if (buf_len > MAX_INSN_SIZE) buf_len = MAX_INSN_SIZE; memset(insn, 0, sizeof(*insn)); insn->kaddr = kaddr; insn->end_kaddr = kaddr + buf_len; insn->next_byte = kaddr; insn->x86_64 = x86_64; insn->opnd_bytes = 4; if (x86_64) insn->addr_bytes = 8; else insn->addr_bytes = 4; } static const insn_byte_t xen_prefix[] = { __XEN_EMULATE_PREFIX }; static const insn_byte_t kvm_prefix[] = { __KVM_EMULATE_PREFIX }; static int __insn_get_emulate_prefix(struct insn *insn, const insn_byte_t *prefix, size_t len) { size_t i; for (i = 0; i < len; i++) { if (peek_nbyte_next(insn_byte_t, insn, i) != prefix[i]) goto err_out; } insn->emulate_prefix_size = len; insn->next_byte += len; return 1; err_out: return 0; } static void insn_get_emulate_prefix(struct insn *insn) { if (__insn_get_emulate_prefix(insn, xen_prefix, sizeof(xen_prefix))) return; __insn_get_emulate_prefix(insn, kvm_prefix, sizeof(kvm_prefix)); } /** * insn_get_prefixes - scan x86 instruction prefix bytes * @insn: &struct insn containing instruction * * Populates the @insn->prefixes bitmap, and updates @insn->next_byte * to point to the (first) opcode. No effect if @insn->prefixes.got * is already set. * * * Returns: * 0: on success * < 0: on error */ int insn_get_prefixes(struct insn *insn) { struct insn_field *prefixes = &insn->prefixes; insn_attr_t attr; insn_byte_t b, lb; int i, nb; if (prefixes->got) return 0; insn_get_emulate_prefix(insn); nb = 0; lb = 0; b = peek_next(insn_byte_t, insn); attr = inat_get_opcode_attribute(b); while (inat_is_legacy_prefix(attr)) { /* Skip if same prefix */ for (i = 0; i < nb; i++) if (prefixes->bytes[i] == b) goto found; if (nb == 4) /* Invalid instruction */ break; prefixes->bytes[nb++] = b; if (inat_is_address_size_prefix(attr)) { /* address size switches 2/4 or 4/8 */ if (insn->x86_64) insn->addr_bytes ^= 12; else insn->addr_bytes ^= 6; } else if (inat_is_operand_size_prefix(attr)) { /* oprand size switches 2/4 */ insn->opnd_bytes ^= 6; } found: prefixes->nbytes++; insn->next_byte++; lb = b; b = peek_next(insn_byte_t, insn); attr = inat_get_opcode_attribute(b); } /* Set the last prefix */ if (lb && lb != insn->prefixes.bytes[3]) { if (unlikely(insn->prefixes.bytes[3])) { /* Swap the last prefix */ b = insn->prefixes.bytes[3]; for (i = 0; i < nb; i++) if (prefixes->bytes[i] == lb) insn_set_byte(prefixes, i, b); } insn_set_byte(&insn->prefixes, 3, lb); } /* Decode REX prefix */ if (insn->x86_64) { b = peek_next(insn_byte_t, insn); attr = inat_get_opcode_attribute(b); if (inat_is_rex_prefix(attr)) { insn_field_set(&insn->rex_prefix, b, 1); insn->next_byte++; if (X86_REX_W(b)) /* REX.W overrides opnd_size */ insn->opnd_bytes = 8; } else if (inat_is_rex2_prefix(attr)) { insn_set_byte(&insn->rex_prefix, 0, b); b = peek_nbyte_next(insn_byte_t, insn, 1); insn_set_byte(&insn->rex_prefix, 1, b); insn->rex_prefix.nbytes = 2; insn->next_byte += 2; if (X86_REX_W(b)) /* REX.W overrides opnd_size */ insn->opnd_bytes = 8; insn->rex_prefix.got = 1; goto vex_end; } } insn->rex_prefix.got = 1; /* Decode VEX/XOP prefix */ b = peek_next(insn_byte_t, insn); if (inat_is_vex_prefix(attr) || inat_is_xop_prefix(attr)) { insn_byte_t b2 = peek_nbyte_next(insn_byte_t, insn, 1); if (inat_is_xop_prefix(attr) && X86_MODRM_REG(b2) == 0) { /* Grp1A.0 is always POP Ev */ goto vex_end; } else if (!insn->x86_64) { /* * In 32-bits mode, if the [7:6] bits (mod bits of * ModRM) on the second byte are not 11b, it is * LDS or LES or BOUND. */ if (X86_MODRM_MOD(b2) != 3) goto vex_end; } insn_set_byte(&insn->vex_prefix, 0, b); insn_set_byte(&insn->vex_prefix, 1, b2); if (inat_is_evex_prefix(attr)) { b2 = peek_nbyte_next(insn_byte_t, insn, 2); insn_set_byte(&insn->vex_prefix, 2, b2); b2 = peek_nbyte_next(insn_byte_t, insn, 3); insn_set_byte(&insn->vex_prefix, 3, b2); insn->vex_prefix.nbytes = 4; insn->next_byte += 4; if (insn->x86_64 && X86_VEX_W(b2)) /* VEX.W overrides opnd_size */ insn->opnd_bytes = 8; } else if (inat_is_vex3_prefix(attr) || inat_is_xop_prefix(attr)) { b2 = peek_nbyte_next(insn_byte_t, insn, 2); insn_set_byte(&insn->vex_prefix, 2, b2); insn->vex_prefix.nbytes = 3; insn->next_byte += 3; if (insn->x86_64 && X86_VEX_W(b2)) /* VEX.W/XOP.W overrides opnd_size */ insn->opnd_bytes = 8; } else { /* * For VEX2, fake VEX3-like byte#2. * Makes it easier to decode vex.W, vex.vvvv, * vex.L and vex.pp. Masking with 0x7f sets vex.W == 0. */ insn_set_byte(&insn->vex_prefix, 2, b2 & 0x7f); insn->vex_prefix.nbytes = 2; insn->next_byte += 2; } } vex_end: insn->vex_prefix.got = 1; prefixes->got = 1; return 0; err_out: return -ENODATA; } /** * insn_get_opcode - collect opcode(s) * @insn: &struct insn containing instruction * * Populates @insn->opcode, updates @insn->next_byte to point past the * opcode byte(s), and set @insn->attr (except for groups). * If necessary, first collects any preceding (prefix) bytes. * Sets @insn->opcode.value = opcode1. No effect if @insn->opcode.got * is already 1. * * Returns: * 0: on success * < 0: on error */ int insn_get_opcode(struct insn *insn) { struct insn_field *opcode = &insn->opcode; int pfx_id, ret; insn_byte_t op; if (opcode->got) return 0; ret = insn_get_prefixes(insn); if (ret) return ret; /* Get first opcode */ op = get_next(insn_byte_t, insn); insn_set_byte(opcode, 0, op); opcode->nbytes = 1; /* Check if there is VEX/XOP prefix or not */ if (insn_is_avx_or_xop(insn)) { insn_byte_t m, p; /* XOP prefix has different encoding */ if (unlikely(avx_insn_is_xop(insn))) { m = insn_xop_map_bits(insn); insn->attr = inat_get_xop_attribute(op, m); if (!inat_accept_xop(insn->attr)) { insn->attr = 0; return -EINVAL; } /* XOP has only 1 byte for opcode */ goto end; } m = insn_vex_m_bits(insn); p = insn_vex_p_bits(insn); insn->attr = inat_get_avx_attribute(op, m, p); /* SCALABLE EVEX uses p bits to encode operand size */ if (inat_evex_scalable(insn->attr) && !insn_vex_w_bit(insn) && p == INAT_PFX_OPNDSZ) insn->opnd_bytes = 2; if ((inat_must_evex(insn->attr) && !insn_is_evex(insn)) || (!inat_accept_vex(insn->attr) && !inat_is_group(insn->attr))) { /* This instruction is bad */ insn->attr = 0; return -EINVAL; } /* VEX has only 1 byte for opcode */ goto end; } /* Check if there is REX2 prefix or not */ if (insn_is_rex2(insn)) { if (insn_rex2_m_bit(insn)) { /* map 1 is escape 0x0f */ insn_attr_t esc_attr = inat_get_opcode_attribute(0x0f); pfx_id = insn_last_prefix_id(insn); insn->attr = inat_get_escape_attribute(op, pfx_id, esc_attr); } else { insn->attr = inat_get_opcode_attribute(op); } goto end; } insn->attr = inat_get_opcode_attribute(op); if (insn->x86_64 && inat_is_invalid64(insn->attr)) { /* This instruction is invalid, like UD2. Stop decoding. */ insn->attr &= INAT_INV64; } while (inat_is_escape(insn->attr)) { /* Get escaped opcode */ op = get_next(insn_byte_t, insn); opcode->bytes[opcode->nbytes++] = op; pfx_id = insn_last_prefix_id(insn); insn->attr = inat_get_escape_attribute(op, pfx_id, insn->attr); } if (inat_must_vex(insn->attr)) { /* This instruction is bad */ insn->attr = 0; return -EINVAL; } end: opcode->got = 1; return 0; err_out: return -ENODATA; } /** * insn_get_modrm - collect ModRM byte, if any * @insn: &struct insn containing instruction * * Populates @insn->modrm and updates @insn->next_byte to point past the * ModRM byte, if any. If necessary, first collects the preceding bytes * (prefixes and opcode(s)). No effect if @insn->modrm.got is already 1. * * Returns: * 0: on success * < 0: on error */ int insn_get_modrm(struct insn *insn) { struct insn_field *modrm = &insn->modrm; insn_byte_t pfx_id, mod; int ret; if (modrm->got) return 0; ret = insn_get_opcode(insn); if (ret) return ret; if (inat_has_modrm(insn->attr)) { mod = get_next(insn_byte_t, insn); insn_field_set(modrm, mod, 1); if (inat_is_group(insn->attr)) { pfx_id = insn_last_prefix_id(insn); insn->attr = inat_get_group_attribute(mod, pfx_id, insn->attr); if (insn_is_avx_or_xop(insn) && !inat_accept_vex(insn->attr) && !inat_accept_xop(insn->attr)) { /* Bad insn */ insn->attr = 0; return -EINVAL; } } } if (insn->x86_64 && inat_is_force64(insn->attr)) insn->opnd_bytes = 8; modrm->got = 1; return 0; err_out: return -ENODATA; } /** * insn_rip_relative() - Does instruction use RIP-relative addressing mode? * @insn: &struct insn containing instruction * * If necessary, first collects the instruction up to and including the * ModRM byte. No effect if @insn->x86_64 is 0. */ int insn_rip_relative(struct insn *insn) { struct insn_field *modrm = &insn->modrm; int ret; if (!insn->x86_64) return 0; ret = insn_get_modrm(insn); if (ret) return 0; /* * For rip-relative instructions, the mod field (top 2 bits) * is zero and the r/m field (bottom 3 bits) is 0x5. */ return (modrm->nbytes && (modrm->bytes[0] & 0xc7) == 0x5); } /** * insn_get_sib() - Get the SIB byte of instruction * @insn: &struct insn containing instruction * * If necessary, first collects the instruction up to and including the * ModRM byte. * * Returns: * 0: if decoding succeeded * < 0: otherwise. */ int insn_get_sib(struct insn *insn) { insn_byte_t modrm; int ret; if (insn->sib.got) return 0; ret = insn_get_modrm(insn); if (ret) return ret; if (insn->modrm.nbytes) { modrm = insn->modrm.bytes[0]; if (insn->addr_bytes != 2 && X86_MODRM_MOD(modrm) != 3 && X86_MODRM_RM(modrm) == 4) { insn_field_set(&insn->sib, get_next(insn_byte_t, insn), 1); } } insn->sib.got = 1; return 0; err_out: return -ENODATA; } /** * insn_get_displacement() - Get the displacement of instruction * @insn: &struct insn containing instruction * * If necessary, first collects the instruction up to and including the * SIB byte. * Displacement value is sign-expanded. * * * Returns: * 0: if decoding succeeded * < 0: otherwise. */ int insn_get_displacement(struct insn *insn) { insn_byte_t mod, rm, base; int ret; if (insn->displacement.got) return 0; ret = insn_get_sib(insn); if (ret) return ret; if (insn->modrm.nbytes) { /* * Interpreting the modrm byte: * mod = 00 - no displacement fields (exceptions below) * mod = 01 - 1-byte displacement field * mod = 10 - displacement field is 4 bytes, or 2 bytes if * address size = 2 (0x67 prefix in 32-bit mode) * mod = 11 - no memory operand * * If address size = 2... * mod = 00, r/m = 110 - displacement field is 2 bytes * * If address size != 2... * mod != 11, r/m = 100 - SIB byte exists * mod = 00, SIB base = 101 - displacement field is 4 bytes * mod = 00, r/m = 101 - rip-relative addressing, displacement * field is 4 bytes */ mod = X86_MODRM_MOD(insn->modrm.value); rm = X86_MODRM_RM(insn->modrm.value); base = X86_SIB_BASE(insn->sib.value); if (mod == 3) goto out; if (mod == 1) { insn_field_set(&insn->displacement, get_next(signed char, insn), 1); } else if (insn->addr_bytes == 2) { if ((mod == 0 && rm == 6) || mod == 2) { insn_field_set(&insn->displacement, get_next(short, insn), 2); } } else { if ((mod == 0 && rm == 5) || mod == 2 || (mod == 0 && base == 5)) { insn_field_set(&insn->displacement, get_next(int, insn), 4); } } } out: insn->displacement.got = 1; return 0; err_out: return -ENODATA; } /* Decode moffset16/32/64. Return 0 if failed */ static int __get_moffset(struct insn *insn) { switch (insn->addr_bytes) { case 2: insn_field_set(&insn->moffset1, get_next(short, insn), 2); break; case 4: insn_field_set(&insn->moffset1, get_next(int, insn), 4); break; case 8: insn_field_set(&insn->moffset1, get_next(int, insn), 4); insn_field_set(&insn->moffset2, get_next(int, insn), 4); break; default: /* opnd_bytes must be modified manually */ goto err_out; } insn->moffset1.got = insn->moffset2.got = 1; return 1; err_out: return 0; } /* Decode imm v32(Iz). Return 0 if failed */ static int __get_immv32(struct insn *insn) { switch (insn->opnd_bytes) { case 2: insn_field_set(&insn->immediate, get_next(short, insn), 2); break; case 4: case 8: insn_field_set(&insn->immediate, get_next(int, insn), 4); break; default: /* opnd_bytes must be modified manually */ goto err_out; } return 1; err_out: return 0; } /* Decode imm v64(Iv/Ov), Return 0 if failed */ static int __get_immv(struct insn *insn) { switch (insn->opnd_bytes) { case 2: insn_field_set(&insn->immediate1, get_next(short, insn), 2); break; case 4: insn_field_set(&insn->immediate1, get_next(int, insn), 4); insn->immediate1.nbytes = 4; break; case 8: insn_field_set(&insn->immediate1, get_next(int, insn), 4); insn_field_set(&insn->immediate2, get_next(int, insn), 4); break; default: /* opnd_bytes must be modified manually */ goto err_out; } insn->immediate1.got = insn->immediate2.got = 1; return 1; err_out: return 0; } /* Decode ptr16:16/32(Ap) */ static int __get_immptr(struct insn *insn) { switch (insn->opnd_bytes) { case 2: insn_field_set(&insn->immediate1, get_next(short, insn), 2); break; case 4: insn_field_set(&insn->immediate1, get_next(int, insn), 4); break; case 8: /* ptr16:64 is not exist (no segment) */ return 0; default: /* opnd_bytes must be modified manually */ goto err_out; } insn_field_set(&insn->immediate2, get_next(unsigned short, insn), 2); insn->immediate1.got = insn->immediate2.got = 1; return 1; err_out: return 0; } /** * insn_get_immediate() - Get the immediate in an instruction * @insn: &struct insn containing instruction * * If necessary, first collects the instruction up to and including the * displacement bytes. * Basically, most of immediates are sign-expanded. Unsigned-value can be * computed by bit masking with ((1 << (nbytes * 8)) - 1) * * Returns: * 0: on success * < 0: on error */ int insn_get_immediate(struct insn *insn) { int ret; if (insn->immediate.got) return 0; ret = insn_get_displacement(insn); if (ret) return ret; if (inat_has_moffset(insn->attr)) { if (!__get_moffset(insn)) goto err_out; goto done; } if (!inat_has_immediate(insn->attr)) goto done; switch (inat_immediate_size(insn->attr)) { case INAT_IMM_BYTE: insn_field_set(&insn->immediate, get_next(signed char, insn), 1); break; case INAT_IMM_WORD: insn_field_set(&insn->immediate, get_next(short, insn), 2); break; case INAT_IMM_DWORD: insn_field_set(&insn->immediate, get_next(int, insn), 4); break; case INAT_IMM_QWORD: insn_field_set(&insn->immediate1, get_next(int, insn), 4); insn_field_set(&insn->immediate2, get_next(int, insn), 4); break; case INAT_IMM_PTR: if (!__get_immptr(insn)) goto err_out; break; case INAT_IMM_VWORD32: if (!__get_immv32(insn)) goto err_out; break; case INAT_IMM_VWORD: if (!__get_immv(insn)) goto err_out; break; default: /* Here, insn must have an immediate, but failed */ goto err_out; } if (inat_has_second_immediate(insn->attr)) { insn_field_set(&insn->immediate2, get_next(signed char, insn), 1); } done: insn->immediate.got = 1; return 0; err_out: return -ENODATA; } /** * insn_get_length() - Get the length of instruction * @insn: &struct insn containing instruction * * If necessary, first collects the instruction up to and including the * immediates bytes. * * Returns: * - 0 on success * - < 0 on error */ int insn_get_length(struct insn *insn) { int ret; if (insn->length) return 0; ret = insn_get_immediate(insn); if (ret) return ret; insn->length = (unsigned char)((unsigned long)insn->next_byte - (unsigned long)insn->kaddr); return 0; } /* Ensure this instruction is decoded completely */ static inline int insn_complete(struct insn *insn) { return insn->opcode.got && insn->modrm.got && insn->sib.got && insn->displacement.got && insn->immediate.got; } /** * insn_decode() - Decode an x86 instruction * @insn: &struct insn to be initialized * @kaddr: address (in kernel memory) of instruction (or copy thereof) * @buf_len: length of the insn buffer at @kaddr * @m: insn mode, see enum insn_mode * * Returns: * 0: if decoding succeeded * < 0: otherwise. */ int insn_decode(struct insn *insn, const void *kaddr, int buf_len, enum insn_mode m) { int ret; /* #define INSN_MODE_KERN -1 __ignore_sync_check__ mode is only valid in the kernel */ if (m == INSN_MODE_KERN) insn_init(insn, kaddr, buf_len, IS_ENABLED(CONFIG_X86_64)); else insn_init(insn, kaddr, buf_len, m == INSN_MODE_64); ret = insn_get_length(insn); if (ret) return ret; if (insn_complete(insn)) return 0; return -EINVAL; }
8 4 1 3 59 59 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 // SPDX-License-Identifier: GPL-2.0-only /* Copyright (c) 2019 Facebook */ #include <linux/hash.h> #include <linux/bpf.h> #include <linux/filter.h> #include <linux/ftrace.h> #include <linux/rbtree_latch.h> #include <linux/perf_event.h> #include <linux/btf.h> #include <linux/rcupdate_trace.h> #include <linux/rcupdate_wait.h> #include <linux/static_call.h> #include <linux/bpf_verifier.h> #include <linux/bpf_lsm.h> #include <linux/delay.h> /* dummy _ops. The verifier will operate on target program's ops. */ const struct bpf_verifier_ops bpf_extension_verifier_ops = { }; const struct bpf_prog_ops bpf_extension_prog_ops = { }; /* btf_vmlinux has ~22k attachable functions. 1k htab is enough. */ #define TRAMPOLINE_HASH_BITS 10 #define TRAMPOLINE_TABLE_SIZE (1 << TRAMPOLINE_HASH_BITS) static struct hlist_head trampoline_table[TRAMPOLINE_TABLE_SIZE]; /* serializes access to trampoline_table */ static DEFINE_MUTEX(trampoline_mutex); #ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS static int bpf_trampoline_update(struct bpf_trampoline *tr, bool lock_direct_mutex); static int bpf_tramp_ftrace_ops_func(struct ftrace_ops *ops, enum ftrace_ops_cmd cmd) { struct bpf_trampoline *tr = ops->private; int ret = 0; if (cmd == FTRACE_OPS_CMD_ENABLE_SHARE_IPMODIFY_SELF) { /* This is called inside register_ftrace_direct_multi(), so * tr->mutex is already locked. */ lockdep_assert_held_once(&tr->mutex); /* Instead of updating the trampoline here, we propagate * -EAGAIN to register_ftrace_direct(). Then we can * retry register_ftrace_direct() after updating the * trampoline. */ if ((tr->flags & BPF_TRAMP_F_CALL_ORIG) && !(tr->flags & BPF_TRAMP_F_ORIG_STACK)) { if (WARN_ON_ONCE(tr->flags & BPF_TRAMP_F_SHARE_IPMODIFY)) return -EBUSY; tr->flags |= BPF_TRAMP_F_SHARE_IPMODIFY; return -EAGAIN; } return 0; } /* The normal locking order is * tr->mutex => direct_mutex (ftrace.c) => ftrace_lock (ftrace.c) * * The following two commands are called from * * prepare_direct_functions_for_ipmodify * cleanup_direct_functions_after_ipmodify * * In both cases, direct_mutex is already locked. Use * mutex_trylock(&tr->mutex) to avoid deadlock in race condition * (something else is making changes to this same trampoline). */ if (!mutex_trylock(&tr->mutex)) { /* sleep 1 ms to make sure whatever holding tr->mutex makes * some progress. */ msleep(1); return -EAGAIN; } switch (cmd) { case FTRACE_OPS_CMD_ENABLE_SHARE_IPMODIFY_PEER: tr->flags |= BPF_TRAMP_F_SHARE_IPMODIFY; if ((tr->flags & BPF_TRAMP_F_CALL_ORIG) && !(tr->flags & BPF_TRAMP_F_ORIG_STACK)) ret = bpf_trampoline_update(tr, false /* lock_direct_mutex */); break; case FTRACE_OPS_CMD_DISABLE_SHARE_IPMODIFY_PEER: tr->flags &= ~BPF_TRAMP_F_SHARE_IPMODIFY; if (tr->flags & BPF_TRAMP_F_ORIG_STACK) ret = bpf_trampoline_update(tr, false /* lock_direct_mutex */); break; default: ret = -EINVAL; break; } mutex_unlock(&tr->mutex); return ret; } #endif bool bpf_prog_has_trampoline(const struct bpf_prog *prog) { enum bpf_attach_type eatype = prog->expected_attach_type; enum bpf_prog_type ptype = prog->type; return (ptype == BPF_PROG_TYPE_TRACING && (eatype == BPF_TRACE_FENTRY || eatype == BPF_TRACE_FEXIT || eatype == BPF_MODIFY_RETURN)) || (ptype == BPF_PROG_TYPE_LSM && eatype == BPF_LSM_MAC); } void bpf_image_ksym_init(void *data, unsigned int size, struct bpf_ksym *ksym) { ksym->start = (unsigned long) data; ksym->end = ksym->start + size; } void bpf_image_ksym_add(struct bpf_ksym *ksym) { bpf_ksym_add(ksym); perf_event_ksymbol(PERF_RECORD_KSYMBOL_TYPE_BPF, ksym->start, PAGE_SIZE, false, ksym->name); } void bpf_image_ksym_del(struct bpf_ksym *ksym) { bpf_ksym_del(ksym); perf_event_ksymbol(PERF_RECORD_KSYMBOL_TYPE_BPF, ksym->start, PAGE_SIZE, true, ksym->name); } static struct bpf_trampoline *bpf_trampoline_lookup(u64 key) { struct bpf_trampoline *tr; struct hlist_head *head; int i; mutex_lock(&trampoline_mutex); head = &trampoline_table[hash_64(key, TRAMPOLINE_HASH_BITS)]; hlist_for_each_entry(tr, head, hlist) { if (tr->key == key) { refcount_inc(&tr->refcnt); goto out; } } tr = kzalloc(sizeof(*tr), GFP_KERNEL); if (!tr) goto out; #ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS tr->fops = kzalloc(sizeof(struct ftrace_ops), GFP_KERNEL); if (!tr->fops) { kfree(tr); tr = NULL; goto out; } tr->fops->private = tr; tr->fops->ops_func = bpf_tramp_ftrace_ops_func; #endif tr->key = key; INIT_HLIST_NODE(&tr->hlist); hlist_add_head(&tr->hlist, head); refcount_set(&tr->refcnt, 1); mutex_init(&tr->mutex); for (i = 0; i < BPF_TRAMP_MAX; i++) INIT_HLIST_HEAD(&tr->progs_hlist[i]); out: mutex_unlock(&trampoline_mutex); return tr; } static int unregister_fentry(struct bpf_trampoline *tr, void *old_addr) { void *ip = tr->func.addr; int ret; if (tr->func.ftrace_managed) ret = unregister_ftrace_direct(tr->fops, (long)old_addr, false); else ret = bpf_arch_text_poke(ip, BPF_MOD_CALL, old_addr, NULL); return ret; } static int modify_fentry(struct bpf_trampoline *tr, void *old_addr, void *new_addr, bool lock_direct_mutex) { void *ip = tr->func.addr; int ret; if (tr->func.ftrace_managed) { if (lock_direct_mutex) ret = modify_ftrace_direct(tr->fops, (long)new_addr); else ret = modify_ftrace_direct_nolock(tr->fops, (long)new_addr); } else { ret = bpf_arch_text_poke(ip, BPF_MOD_CALL, old_addr, new_addr); } return ret; } /* first time registering */ static int register_fentry(struct bpf_trampoline *tr, void *new_addr) { void *ip = tr->func.addr; unsigned long faddr; int ret; faddr = ftrace_location((unsigned long)ip); if (faddr) { if (!tr->fops) return -ENOTSUPP; tr->func.ftrace_managed = true; } if (tr->func.ftrace_managed) { ftrace_set_filter_ip(tr->fops, (unsigned long)ip, 0, 1); ret = register_ftrace_direct(tr->fops, (long)new_addr); } else { ret = bpf_arch_text_poke(ip, BPF_MOD_CALL, NULL, new_addr); } return ret; } static struct bpf_tramp_links * bpf_trampoline_get_progs(const struct bpf_trampoline *tr, int *total, bool *ip_arg) { struct bpf_tramp_link *link; struct bpf_tramp_links *tlinks; struct bpf_tramp_link **links; int kind; *total = 0; tlinks = kcalloc(BPF_TRAMP_MAX, sizeof(*tlinks), GFP_KERNEL); if (!tlinks) return ERR_PTR(-ENOMEM); for (kind = 0; kind < BPF_TRAMP_MAX; kind++) { tlinks[kind].nr_links = tr->progs_cnt[kind]; *total += tr->progs_cnt[kind]; links = tlinks[kind].links; hlist_for_each_entry(link, &tr->progs_hlist[kind], tramp_hlist) { *ip_arg |= link->link.prog->call_get_func_ip; *links++ = link; } } return tlinks; } static void bpf_tramp_image_free(struct bpf_tramp_image *im) { bpf_image_ksym_del(&im->ksym); arch_free_bpf_trampoline(im->image, im->size); bpf_jit_uncharge_modmem(im->size); percpu_ref_exit(&im->pcref); kfree_rcu(im, rcu); } static void __bpf_tramp_image_put_deferred(struct work_struct *work) { struct bpf_tramp_image *im; im = container_of(work, struct bpf_tramp_image, work); bpf_tramp_image_free(im); } /* callback, fexit step 3 or fentry step 2 */ static void __bpf_tramp_image_put_rcu(struct rcu_head *rcu) { struct bpf_tramp_image *im; im = container_of(rcu, struct bpf_tramp_image, rcu); INIT_WORK(&im->work, __bpf_tramp_image_put_deferred); schedule_work(&im->work); } /* callback, fexit step 2. Called after percpu_ref_kill confirms. */ static void __bpf_tramp_image_release(struct percpu_ref *pcref) { struct bpf_tramp_image *im; im = container_of(pcref, struct bpf_tramp_image, pcref); call_rcu_tasks(&im->rcu, __bpf_tramp_image_put_rcu); } /* callback, fexit or fentry step 1 */ static void __bpf_tramp_image_put_rcu_tasks(struct rcu_head *rcu) { struct bpf_tramp_image *im; im = container_of(rcu, struct bpf_tramp_image, rcu); if (im->ip_after_call) /* the case of fmod_ret/fexit trampoline and CONFIG_PREEMPTION=y */ percpu_ref_kill(&im->pcref); else /* the case of fentry trampoline */ call_rcu_tasks(&im->rcu, __bpf_tramp_image_put_rcu); } static void bpf_tramp_image_put(struct bpf_tramp_image *im) { /* The trampoline image that calls original function is using: * rcu_read_lock_trace to protect sleepable bpf progs * rcu_read_lock to protect normal bpf progs * percpu_ref to protect trampoline itself * rcu tasks to protect trampoline asm not covered by percpu_ref * (which are few asm insns before __bpf_tramp_enter and * after __bpf_tramp_exit) * * The trampoline is unreachable before bpf_tramp_image_put(). * * First, patch the trampoline to avoid calling into fexit progs. * The progs will be freed even if the original function is still * executing or sleeping. * In case of CONFIG_PREEMPT=y use call_rcu_tasks() to wait on * first few asm instructions to execute and call into * __bpf_tramp_enter->percpu_ref_get. * Then use percpu_ref_kill to wait for the trampoline and the original * function to finish. * Then use call_rcu_tasks() to make sure few asm insns in * the trampoline epilogue are done as well. * * In !PREEMPT case the task that got interrupted in the first asm * insns won't go through an RCU quiescent state which the * percpu_ref_kill will be waiting for. Hence the first * call_rcu_tasks() is not necessary. */ if (im->ip_after_call) { int err = bpf_arch_text_poke(im->ip_after_call, BPF_MOD_JUMP, NULL, im->ip_epilogue); WARN_ON(err); if (IS_ENABLED(CONFIG_TASKS_RCU)) call_rcu_tasks(&im->rcu, __bpf_tramp_image_put_rcu_tasks); else percpu_ref_kill(&im->pcref); return; } /* The trampoline without fexit and fmod_ret progs doesn't call original * function and doesn't use percpu_ref. * Use call_rcu_tasks_trace() to wait for sleepable progs to finish. * Then use call_rcu_tasks() to wait for the rest of trampoline asm * and normal progs. */ call_rcu_tasks_trace(&im->rcu, __bpf_tramp_image_put_rcu_tasks); } static struct bpf_tramp_image *bpf_tramp_image_alloc(u64 key, int size) { struct bpf_tramp_image *im; struct bpf_ksym *ksym; void *image; int err = -ENOMEM; im = kzalloc(sizeof(*im), GFP_KERNEL); if (!im) goto out; err = bpf_jit_charge_modmem(size); if (err) goto out_free_im; im->size = size; err = -ENOMEM; im->image = image = arch_alloc_bpf_trampoline(size); if (!image) goto out_uncharge; err = percpu_ref_init(&im->pcref, __bpf_tramp_image_release, 0, GFP_KERNEL); if (err) goto out_free_image; ksym = &im->ksym; INIT_LIST_HEAD_RCU(&ksym->lnode); snprintf(ksym->name, KSYM_NAME_LEN, "bpf_trampoline_%llu", key); bpf_image_ksym_init(image, size, ksym); bpf_image_ksym_add(ksym); return im; out_free_image: arch_free_bpf_trampoline(im->image, im->size); out_uncharge: bpf_jit_uncharge_modmem(size); out_free_im: kfree(im); out: return ERR_PTR(err); } static int bpf_trampoline_update(struct bpf_trampoline *tr, bool lock_direct_mutex) { struct bpf_tramp_image *im; struct bpf_tramp_links *tlinks; u32 orig_flags = tr->flags; bool ip_arg = false; int err, total, size; tlinks = bpf_trampoline_get_progs(tr, &total, &ip_arg); if (IS_ERR(tlinks)) return PTR_ERR(tlinks); if (total == 0) { err = unregister_fentry(tr, tr->cur_image->image); bpf_tramp_image_put(tr->cur_image); tr->cur_image = NULL; goto out; } /* clear all bits except SHARE_IPMODIFY and TAIL_CALL_CTX */ tr->flags &= (BPF_TRAMP_F_SHARE_IPMODIFY | BPF_TRAMP_F_TAIL_CALL_CTX); if (tlinks[BPF_TRAMP_FEXIT].nr_links || tlinks[BPF_TRAMP_MODIFY_RETURN].nr_links) { /* NOTE: BPF_TRAMP_F_RESTORE_REGS and BPF_TRAMP_F_SKIP_FRAME * should not be set together. */ tr->flags |= BPF_TRAMP_F_CALL_ORIG | BPF_TRAMP_F_SKIP_FRAME; } else { tr->flags |= BPF_TRAMP_F_RESTORE_REGS; } if (ip_arg) tr->flags |= BPF_TRAMP_F_IP_ARG; #ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS again: if ((tr->flags & BPF_TRAMP_F_SHARE_IPMODIFY) && (tr->flags & BPF_TRAMP_F_CALL_ORIG)) tr->flags |= BPF_TRAMP_F_ORIG_STACK; #endif size = arch_bpf_trampoline_size(&tr->func.model, tr->flags, tlinks, tr->func.addr); if (size < 0) { err = size; goto out; } if (size > PAGE_SIZE) { err = -E2BIG; goto out; } im = bpf_tramp_image_alloc(tr->key, size); if (IS_ERR(im)) { err = PTR_ERR(im); goto out; } err = arch_prepare_bpf_trampoline(im, im->image, im->image + size, &tr->func.model, tr->flags, tlinks, tr->func.addr); if (err < 0) goto out_free; err = arch_protect_bpf_trampoline(im->image, im->size); if (err) goto out_free; WARN_ON(tr->cur_image && total == 0); if (tr->cur_image) /* progs already running at this address */ err = modify_fentry(tr, tr->cur_image->image, im->image, lock_direct_mutex); else /* first time registering */ err = register_fentry(tr, im->image); #ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS if (err == -EAGAIN) { /* -EAGAIN from bpf_tramp_ftrace_ops_func. Now * BPF_TRAMP_F_SHARE_IPMODIFY is set, we can generate the * trampoline again, and retry register. */ /* reset fops->func and fops->trampoline for re-register */ tr->fops->func = NULL; tr->fops->trampoline = 0; /* free im memory and reallocate later */ bpf_tramp_image_free(im); goto again; } #endif if (err) goto out_free; if (tr->cur_image) bpf_tramp_image_put(tr->cur_image); tr->cur_image = im; out: /* If any error happens, restore previous flags */ if (err) tr->flags = orig_flags; kfree(tlinks); return err; out_free: bpf_tramp_image_free(im); goto out; } static enum bpf_tramp_prog_type bpf_attach_type_to_tramp(struct bpf_prog *prog) { switch (prog->expected_attach_type) { case BPF_TRACE_FENTRY: return BPF_TRAMP_FENTRY; case BPF_MODIFY_RETURN: return BPF_TRAMP_MODIFY_RETURN; case BPF_TRACE_FEXIT: return BPF_TRAMP_FEXIT; case BPF_LSM_MAC: if (!prog->aux->attach_func_proto->type) /* The function returns void, we cannot modify its * return value. */ return BPF_TRAMP_FEXIT; else return BPF_TRAMP_MODIFY_RETURN; default: return BPF_TRAMP_REPLACE; } } static int bpf_freplace_check_tgt_prog(struct bpf_prog *tgt_prog) { struct bpf_prog_aux *aux = tgt_prog->aux; guard(mutex)(&aux->ext_mutex); if (aux->prog_array_member_cnt) /* Program extensions can not extend target prog when the target * prog has been updated to any prog_array map as tail callee. * It's to prevent a potential infinite loop like: * tgt prog entry -> tgt prog subprog -> freplace prog entry * --tailcall-> tgt prog entry. */ return -EBUSY; aux->is_extended = true; return 0; } static int __bpf_trampoline_link_prog(struct bpf_tramp_link *link, struct bpf_trampoline *tr, struct bpf_prog *tgt_prog) { enum bpf_tramp_prog_type kind; struct bpf_tramp_link *link_exiting; int err = 0; int cnt = 0, i; kind = bpf_attach_type_to_tramp(link->link.prog); if (tr->extension_prog) /* cannot attach fentry/fexit if extension prog is attached. * cannot overwrite extension prog either. */ return -EBUSY; for (i = 0; i < BPF_TRAMP_MAX; i++) cnt += tr->progs_cnt[i]; if (kind == BPF_TRAMP_REPLACE) { /* Cannot attach extension if fentry/fexit are in use. */ if (cnt) return -EBUSY; err = bpf_freplace_check_tgt_prog(tgt_prog); if (err) return err; tr->extension_prog = link->link.prog; return bpf_arch_text_poke(tr->func.addr, BPF_MOD_JUMP, NULL, link->link.prog->bpf_func); } if (cnt >= BPF_MAX_TRAMP_LINKS) return -E2BIG; if (!hlist_unhashed(&link->tramp_hlist)) /* prog already linked */ return -EBUSY; hlist_for_each_entry(link_exiting, &tr->progs_hlist[kind], tramp_hlist) { if (link_exiting->link.prog != link->link.prog) continue; /* prog already linked */ return -EBUSY; } hlist_add_head(&link->tramp_hlist, &tr->progs_hlist[kind]); tr->progs_cnt[kind]++; err = bpf_trampoline_update(tr, true /* lock_direct_mutex */); if (err) { hlist_del_init(&link->tramp_hlist); tr->progs_cnt[kind]--; } return err; } int bpf_trampoline_link_prog(struct bpf_tramp_link *link, struct bpf_trampoline *tr, struct bpf_prog *tgt_prog) { int err; mutex_lock(&tr->mutex); err = __bpf_trampoline_link_prog(link, tr, tgt_prog); mutex_unlock(&tr->mutex); return err; } static int __bpf_trampoline_unlink_prog(struct bpf_tramp_link *link, struct bpf_trampoline *tr, struct bpf_prog *tgt_prog) { enum bpf_tramp_prog_type kind; int err; kind = bpf_attach_type_to_tramp(link->link.prog); if (kind == BPF_TRAMP_REPLACE) { WARN_ON_ONCE(!tr->extension_prog); err = bpf_arch_text_poke(tr->func.addr, BPF_MOD_JUMP, tr->extension_prog->bpf_func, NULL); tr->extension_prog = NULL; guard(mutex)(&tgt_prog->aux->ext_mutex); tgt_prog->aux->is_extended = false; return err; } hlist_del_init(&link->tramp_hlist); tr->progs_cnt[kind]--; return bpf_trampoline_update(tr, true /* lock_direct_mutex */); } /* bpf_trampoline_unlink_prog() should never fail. */ int bpf_trampoline_unlink_prog(struct bpf_tramp_link *link, struct bpf_trampoline *tr, struct bpf_prog *tgt_prog) { int err; mutex_lock(&tr->mutex); err = __bpf_trampoline_unlink_prog(link, tr, tgt_prog); mutex_unlock(&tr->mutex); return err; } #if defined(CONFIG_CGROUP_BPF) && defined(CONFIG_BPF_LSM) static void bpf_shim_tramp_link_release(struct bpf_link *link) { struct bpf_shim_tramp_link *shim_link = container_of(link, struct bpf_shim_tramp_link, link.link); /* paired with 'shim_link->trampoline = tr' in bpf_trampoline_link_cgroup_shim */ if (!shim_link->trampoline) return; WARN_ON_ONCE(bpf_trampoline_unlink_prog(&shim_link->link, shim_link->trampoline, NULL)); bpf_trampoline_put(shim_link->trampoline); } static void bpf_shim_tramp_link_dealloc(struct bpf_link *link) { struct bpf_shim_tramp_link *shim_link = container_of(link, struct bpf_shim_tramp_link, link.link); kfree(shim_link); } static const struct bpf_link_ops bpf_shim_tramp_link_lops = { .release = bpf_shim_tramp_link_release, .dealloc = bpf_shim_tramp_link_dealloc, }; static struct bpf_shim_tramp_link *cgroup_shim_alloc(const struct bpf_prog *prog, bpf_func_t bpf_func, int cgroup_atype, enum bpf_attach_type attach_type) { struct bpf_shim_tramp_link *shim_link = NULL; struct bpf_prog *p; shim_link = kzalloc(sizeof(*shim_link), GFP_USER); if (!shim_link) return NULL; p = bpf_prog_alloc(1, 0); if (!p) { kfree(shim_link); return NULL; } p->jited = false; p->bpf_func = bpf_func; p->aux->cgroup_atype = cgroup_atype; p->aux->attach_func_proto = prog->aux->attach_func_proto; p->aux->attach_btf_id = prog->aux->attach_btf_id; p->aux->attach_btf = prog->aux->attach_btf; btf_get(p->aux->attach_btf); p->type = BPF_PROG_TYPE_LSM; p->expected_attach_type = BPF_LSM_MAC; bpf_prog_inc(p); bpf_link_init(&shim_link->link.link, BPF_LINK_TYPE_UNSPEC, &bpf_shim_tramp_link_lops, p, attach_type); bpf_cgroup_atype_get(p->aux->attach_btf_id, cgroup_atype); return shim_link; } static struct bpf_shim_tramp_link *cgroup_shim_find(struct bpf_trampoline *tr, bpf_func_t bpf_func) { struct bpf_tramp_link *link; int kind; for (kind = 0; kind < BPF_TRAMP_MAX; kind++) { hlist_for_each_entry(link, &tr->progs_hlist[kind], tramp_hlist) { struct bpf_prog *p = link->link.prog; if (p->bpf_func == bpf_func) return container_of(link, struct bpf_shim_tramp_link, link); } } return NULL; } int bpf_trampoline_link_cgroup_shim(struct bpf_prog *prog, int cgroup_atype, enum bpf_attach_type attach_type) { struct bpf_shim_tramp_link *shim_link = NULL; struct bpf_attach_target_info tgt_info = {}; struct bpf_trampoline *tr; bpf_func_t bpf_func; u64 key; int err; err = bpf_check_attach_target(NULL, prog, NULL, prog->aux->attach_btf_id, &tgt_info); if (err) return err; key = bpf_trampoline_compute_key(NULL, prog->aux->attach_btf, prog->aux->attach_btf_id); bpf_lsm_find_cgroup_shim(prog, &bpf_func); tr = bpf_trampoline_get(key, &tgt_info); if (!tr) return -ENOMEM; mutex_lock(&tr->mutex); shim_link = cgroup_shim_find(tr, bpf_func); if (shim_link) { /* Reusing existing shim attached by the other program. */ bpf_link_inc(&shim_link->link.link); mutex_unlock(&tr->mutex); bpf_trampoline_put(tr); /* bpf_trampoline_get above */ return 0; } /* Allocate and install new shim. */ shim_link = cgroup_shim_alloc(prog, bpf_func, cgroup_atype, attach_type); if (!shim_link) { err = -ENOMEM; goto err; } err = __bpf_trampoline_link_prog(&shim_link->link, tr, NULL); if (err) goto err; shim_link->trampoline = tr; /* note, we're still holding tr refcnt from above */ mutex_unlock(&tr->mutex); return 0; err: mutex_unlock(&tr->mutex); if (shim_link) bpf_link_put(&shim_link->link.link); /* have to release tr while _not_ holding its mutex */ bpf_trampoline_put(tr); /* bpf_trampoline_get above */ return err; } void bpf_trampoline_unlink_cgroup_shim(struct bpf_prog *prog) { struct bpf_shim_tramp_link *shim_link = NULL; struct bpf_trampoline *tr; bpf_func_t bpf_func; u64 key; key = bpf_trampoline_compute_key(NULL, prog->aux->attach_btf, prog->aux->attach_btf_id); bpf_lsm_find_cgroup_shim(prog, &bpf_func); tr = bpf_trampoline_lookup(key); if (WARN_ON_ONCE(!tr)) return; mutex_lock(&tr->mutex); shim_link = cgroup_shim_find(tr, bpf_func); mutex_unlock(&tr->mutex); if (shim_link) bpf_link_put(&shim_link->link.link); bpf_trampoline_put(tr); /* bpf_trampoline_lookup above */ } #endif struct bpf_trampoline *bpf_trampoline_get(u64 key, struct bpf_attach_target_info *tgt_info) { struct bpf_trampoline *tr; tr = bpf_trampoline_lookup(key); if (!tr) return NULL; mutex_lock(&tr->mutex); if (tr->func.addr) goto out; memcpy(&tr->func.model, &tgt_info->fmodel, sizeof(tgt_info->fmodel)); tr->func.addr = (void *)tgt_info->tgt_addr; out: mutex_unlock(&tr->mutex); return tr; } void bpf_trampoline_put(struct bpf_trampoline *tr) { int i; if (!tr) return; mutex_lock(&trampoline_mutex); if (!refcount_dec_and_test(&tr->refcnt)) goto out; WARN_ON_ONCE(mutex_is_locked(&tr->mutex)); for (i = 0; i < BPF_TRAMP_MAX; i++) if (WARN_ON_ONCE(!hlist_empty(&tr->progs_hlist[i]))) goto out; /* This code will be executed even when the last bpf_tramp_image * is alive. All progs are detached from the trampoline and the * trampoline image is patched with jmp into epilogue to skip * fexit progs. The fentry-only trampoline will be freed via * multiple rcu callbacks. */ hlist_del(&tr->hlist); if (tr->fops) { ftrace_free_filter(tr->fops); kfree(tr->fops); } kfree(tr); out: mutex_unlock(&trampoline_mutex); } #define NO_START_TIME 1 static __always_inline u64 notrace bpf_prog_start_time(void) { u64 start = NO_START_TIME; if (static_branch_unlikely(&bpf_stats_enabled_key)) { start = sched_clock(); if (unlikely(!start)) start = NO_START_TIME; } return start; } /* The logic is similar to bpf_prog_run(), but with an explicit * rcu_read_lock() and migrate_disable() which are required * for the trampoline. The macro is split into * call __bpf_prog_enter * call prog->bpf_func * call __bpf_prog_exit * * __bpf_prog_enter returns: * 0 - skip execution of the bpf prog * 1 - execute bpf prog * [2..MAX_U64] - execute bpf prog and record execution time. * This is start time. */ static u64 notrace __bpf_prog_enter_recur(struct bpf_prog *prog, struct bpf_tramp_run_ctx *run_ctx) __acquires(RCU) { rcu_read_lock_dont_migrate(); run_ctx->saved_run_ctx = bpf_set_run_ctx(&run_ctx->run_ctx); if (unlikely(this_cpu_inc_return(*(prog->active)) != 1)) { bpf_prog_inc_misses_counter(prog); if (prog->aux->recursion_detected) prog->aux->recursion_detected(prog); return 0; } return bpf_prog_start_time(); } static void notrace __update_prog_stats(struct bpf_prog *prog, u64 start) { struct bpf_prog_stats *stats; unsigned long flags; u64 duration; /* * static_key could be enabled in __bpf_prog_enter* and disabled in * __bpf_prog_exit*. And vice versa. Check that 'start' is valid. */ if (start <= NO_START_TIME) return; duration = sched_clock() - start; stats = this_cpu_ptr(prog->stats); flags = u64_stats_update_begin_irqsave(&stats->syncp); u64_stats_inc(&stats->cnt); u64_stats_add(&stats->nsecs, duration); u64_stats_update_end_irqrestore(&stats->syncp, flags); } static __always_inline void notrace update_prog_stats(struct bpf_prog *prog, u64 start) { if (static_branch_unlikely(&bpf_stats_enabled_key)) __update_prog_stats(prog, start); } static void notrace __bpf_prog_exit_recur(struct bpf_prog *prog, u64 start, struct bpf_tramp_run_ctx *run_ctx) __releases(RCU) { bpf_reset_run_ctx(run_ctx->saved_run_ctx); update_prog_stats(prog, start); this_cpu_dec(*(prog->active)); rcu_read_unlock_migrate(); } static u64 notrace __bpf_prog_enter_lsm_cgroup(struct bpf_prog *prog, struct bpf_tramp_run_ctx *run_ctx) __acquires(RCU) { /* Runtime stats are exported via actual BPF_LSM_CGROUP * programs, not the shims. */ rcu_read_lock_dont_migrate(); run_ctx->saved_run_ctx = bpf_set_run_ctx(&run_ctx->run_ctx); return NO_START_TIME; } static void notrace __bpf_prog_exit_lsm_cgroup(struct bpf_prog *prog, u64 start, struct bpf_tramp_run_ctx *run_ctx) __releases(RCU) { bpf_reset_run_ctx(run_ctx->saved_run_ctx); rcu_read_unlock_migrate(); } u64 notrace __bpf_prog_enter_sleepable_recur(struct bpf_prog *prog, struct bpf_tramp_run_ctx *run_ctx) { rcu_read_lock_trace(); migrate_disable(); might_fault(); run_ctx->saved_run_ctx = bpf_set_run_ctx(&run_ctx->run_ctx); if (unlikely(this_cpu_inc_return(*(prog->active)) != 1)) { bpf_prog_inc_misses_counter(prog); if (prog->aux->recursion_detected) prog->aux->recursion_detected(prog); return 0; } return bpf_prog_start_time(); } void notrace __bpf_prog_exit_sleepable_recur(struct bpf_prog *prog, u64 start, struct bpf_tramp_run_ctx *run_ctx) { bpf_reset_run_ctx(run_ctx->saved_run_ctx); update_prog_stats(prog, start); this_cpu_dec(*(prog->active)); migrate_enable(); rcu_read_unlock_trace(); } static u64 notrace __bpf_prog_enter_sleepable(struct bpf_prog *prog, struct bpf_tramp_run_ctx *run_ctx) { rcu_read_lock_trace(); migrate_disable(); might_fault(); run_ctx->saved_run_ctx = bpf_set_run_ctx(&run_ctx->run_ctx); return bpf_prog_start_time(); } static void notrace __bpf_prog_exit_sleepable(struct bpf_prog *prog, u64 start, struct bpf_tramp_run_ctx *run_ctx) { bpf_reset_run_ctx(run_ctx->saved_run_ctx); update_prog_stats(prog, start); migrate_enable(); rcu_read_unlock_trace(); } static u64 notrace __bpf_prog_enter(struct bpf_prog *prog, struct bpf_tramp_run_ctx *run_ctx) __acquires(RCU) { rcu_read_lock_dont_migrate(); run_ctx->saved_run_ctx = bpf_set_run_ctx(&run_ctx->run_ctx); return bpf_prog_start_time(); } static void notrace __bpf_prog_exit(struct bpf_prog *prog, u64 start, struct bpf_tramp_run_ctx *run_ctx) __releases(RCU) { bpf_reset_run_ctx(run_ctx->saved_run_ctx); update_prog_stats(prog, start); rcu_read_unlock_migrate(); } void notrace __bpf_tramp_enter(struct bpf_tramp_image *tr) { percpu_ref_get(&tr->pcref); } void notrace __bpf_tramp_exit(struct bpf_tramp_image *tr) { percpu_ref_put(&tr->pcref); } bpf_trampoline_enter_t bpf_trampoline_enter(const struct bpf_prog *prog) { bool sleepable = prog->sleepable; if (bpf_prog_check_recur(prog)) return sleepable ? __bpf_prog_enter_sleepable_recur : __bpf_prog_enter_recur; if (resolve_prog_type(prog) == BPF_PROG_TYPE_LSM && prog->expected_attach_type == BPF_LSM_CGROUP) return __bpf_prog_enter_lsm_cgroup; return sleepable ? __bpf_prog_enter_sleepable : __bpf_prog_enter; } bpf_trampoline_exit_t bpf_trampoline_exit(const struct bpf_prog *prog) { bool sleepable = prog->sleepable; if (bpf_prog_check_recur(prog)) return sleepable ? __bpf_prog_exit_sleepable_recur : __bpf_prog_exit_recur; if (resolve_prog_type(prog) == BPF_PROG_TYPE_LSM && prog->expected_attach_type == BPF_LSM_CGROUP) return __bpf_prog_exit_lsm_cgroup; return sleepable ? __bpf_prog_exit_sleepable : __bpf_prog_exit; } int __weak arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *image_end, const struct btf_func_model *m, u32 flags, struct bpf_tramp_links *tlinks, void *func_addr) { return -ENOTSUPP; } void * __weak arch_alloc_bpf_trampoline(unsigned int size) { void *image; if (WARN_ON_ONCE(size > PAGE_SIZE)) return NULL; image = bpf_jit_alloc_exec(PAGE_SIZE); if (image) set_vm_flush_reset_perms(image); return image; } void __weak arch_free_bpf_trampoline(void *image, unsigned int size) { WARN_ON_ONCE(size > PAGE_SIZE); /* bpf_jit_free_exec doesn't need "size", but * bpf_prog_pack_free() needs it. */ bpf_jit_free_exec(image); } int __weak arch_protect_bpf_trampoline(void *image, unsigned int size) { WARN_ON_ONCE(size > PAGE_SIZE); return set_memory_rox((long)image, 1); } int __weak arch_bpf_trampoline_size(const struct btf_func_model *m, u32 flags, struct bpf_tramp_links *tlinks, void *func_addr) { return -ENOTSUPP; } static int __init init_trampolines(void) { int i; for (i = 0; i < TRAMPOLINE_TABLE_SIZE; i++) INIT_HLIST_HEAD(&trampoline_table[i]); return 0; } late_initcall(init_trampolines);
764 765 625 626 625 626 624 623 626 27 27 20 27 26 627 626 390 388 9 9 9 613 614 390 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 #include <linux/atomic.h> #include <linux/export.h> #include <linux/generic-radix-tree.h> #include <linux/gfp.h> #include <linux/kmemleak.h> /* * Returns pointer to the specified byte @offset within @radix, or NULL if not * allocated */ void *__genradix_ptr(struct __genradix *radix, size_t offset) { return __genradix_ptr_inlined(radix, offset); } EXPORT_SYMBOL(__genradix_ptr); /* * Returns pointer to the specified byte @offset within @radix, allocating it if * necessary - newly allocated slots are always zeroed out: */ void *__genradix_ptr_alloc(struct __genradix *radix, size_t offset, struct genradix_node **preallocated, gfp_t gfp_mask) { struct genradix_root *v = READ_ONCE(radix->root); struct genradix_node *n, *new_node = NULL; unsigned level; if (preallocated) swap(new_node, *preallocated); /* Increase tree depth if necessary: */ while (1) { struct genradix_root *r = v, *new_root; n = genradix_root_to_node(r); level = genradix_root_to_depth(r); if (n && ilog2(offset) < genradix_depth_shift(level)) break; if (!new_node) { new_node = genradix_alloc_node(gfp_mask); if (!new_node) return NULL; } new_node->children[0] = n; new_root = ((struct genradix_root *) ((unsigned long) new_node | (n ? level + 1 : 0))); if ((v = cmpxchg_release(&radix->root, r, new_root)) == r) { v = new_root; new_node = NULL; } else { new_node->children[0] = NULL; } } while (level--) { struct genradix_node **p = &n->children[offset >> genradix_depth_shift(level)]; offset &= genradix_depth_size(level) - 1; n = READ_ONCE(*p); if (!n) { if (!new_node) { new_node = genradix_alloc_node(gfp_mask); if (!new_node) return NULL; } if (!(n = cmpxchg_release(p, NULL, new_node))) swap(n, new_node); } } if (new_node) genradix_free_node(new_node); return &n->data[offset]; } EXPORT_SYMBOL(__genradix_ptr_alloc); void *__genradix_iter_peek(struct genradix_iter *iter, struct __genradix *radix, size_t objs_per_page) { struct genradix_root *r; struct genradix_node *n; unsigned level, i; if (iter->offset == SIZE_MAX) return NULL; restart: r = READ_ONCE(radix->root); if (!r) return NULL; n = genradix_root_to_node(r); level = genradix_root_to_depth(r); if (ilog2(iter->offset) >= genradix_depth_shift(level)) return NULL; while (level) { level--; i = (iter->offset >> genradix_depth_shift(level)) & (GENRADIX_ARY - 1); while (!n->children[i]) { size_t objs_per_ptr = genradix_depth_size(level); if (iter->offset + objs_per_ptr < iter->offset) { iter->offset = SIZE_MAX; iter->pos = SIZE_MAX; return NULL; } i++; iter->offset = round_down(iter->offset + objs_per_ptr, objs_per_ptr); iter->pos = (iter->offset >> GENRADIX_NODE_SHIFT) * objs_per_page; if (i == GENRADIX_ARY) goto restart; } n = n->children[i]; } return &n->data[iter->offset & (GENRADIX_NODE_SIZE - 1)]; } EXPORT_SYMBOL(__genradix_iter_peek); void *__genradix_iter_peek_prev(struct genradix_iter *iter, struct __genradix *radix, size_t objs_per_page, size_t obj_size_plus_page_remainder) { struct genradix_root *r; struct genradix_node *n; unsigned level, i; if (iter->offset == SIZE_MAX) return NULL; restart: r = READ_ONCE(radix->root); if (!r) return NULL; n = genradix_root_to_node(r); level = genradix_root_to_depth(r); if (ilog2(iter->offset) >= genradix_depth_shift(level)) { iter->offset = genradix_depth_size(level); iter->pos = (iter->offset >> GENRADIX_NODE_SHIFT) * objs_per_page; iter->offset -= obj_size_plus_page_remainder; iter->pos--; } while (level) { level--; i = (iter->offset >> genradix_depth_shift(level)) & (GENRADIX_ARY - 1); while (!n->children[i]) { size_t objs_per_ptr = genradix_depth_size(level); iter->offset = round_down(iter->offset, objs_per_ptr); iter->pos = (iter->offset >> GENRADIX_NODE_SHIFT) * objs_per_page; if (!iter->offset) return NULL; iter->offset -= obj_size_plus_page_remainder; iter->pos--; if (!i) goto restart; --i; } n = n->children[i]; } return &n->data[iter->offset & (GENRADIX_NODE_SIZE - 1)]; } EXPORT_SYMBOL(__genradix_iter_peek_prev); static void genradix_free_recurse(struct genradix_node *n, unsigned level) { if (level) { unsigned i; for (i = 0; i < GENRADIX_ARY; i++) if (n->children[i]) genradix_free_recurse(n->children[i], level - 1); } genradix_free_node(n); } int __genradix_prealloc(struct __genradix *radix, size_t size, gfp_t gfp_mask) { size_t offset; for (offset = 0; offset < size; offset += GENRADIX_NODE_SIZE) if (!__genradix_ptr_alloc(radix, offset, NULL, gfp_mask)) return -ENOMEM; return 0; } EXPORT_SYMBOL(__genradix_prealloc); void __genradix_free(struct __genradix *radix) { struct genradix_root *r = xchg(&radix->root, NULL); genradix_free_recurse(genradix_root_to_node(r), genradix_root_to_depth(r)); } EXPORT_SYMBOL(__genradix_free);
2 2 2 2 2 2 2 2 2 2 3 1 2 2 2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 // SPDX-License-Identifier: GPL-2.0 /* * Greybus Host Device * * Copyright 2014-2015 Google Inc. * Copyright 2014-2015 Linaro Ltd. */ #include <linux/kernel.h> #include <linux/slab.h> #include <linux/greybus.h> #include "greybus_trace.h" EXPORT_TRACEPOINT_SYMBOL_GPL(gb_hd_create); EXPORT_TRACEPOINT_SYMBOL_GPL(gb_hd_release); EXPORT_TRACEPOINT_SYMBOL_GPL(gb_hd_add); EXPORT_TRACEPOINT_SYMBOL_GPL(gb_hd_del); EXPORT_TRACEPOINT_SYMBOL_GPL(gb_hd_in); EXPORT_TRACEPOINT_SYMBOL_GPL(gb_message_submit); static struct ida gb_hd_bus_id_map; int gb_hd_output(struct gb_host_device *hd, void *req, u16 size, u8 cmd, bool async) { if (!hd || !hd->driver || !hd->driver->output) return -EINVAL; return hd->driver->output(hd, req, size, cmd, async); } EXPORT_SYMBOL_GPL(gb_hd_output); static ssize_t bus_id_show(struct device *dev, struct device_attribute *attr, char *buf) { struct gb_host_device *hd = to_gb_host_device(dev); return sprintf(buf, "%d\n", hd->bus_id); } static DEVICE_ATTR_RO(bus_id); static struct attribute *bus_attrs[] = { &dev_attr_bus_id.attr, NULL }; ATTRIBUTE_GROUPS(bus); int gb_hd_cport_reserve(struct gb_host_device *hd, u16 cport_id) { struct ida *id_map = &hd->cport_id_map; int ret; ret = ida_alloc_range(id_map, cport_id, cport_id, GFP_KERNEL); if (ret < 0) { dev_err(&hd->dev, "failed to reserve cport %u\n", cport_id); return ret; } return 0; } EXPORT_SYMBOL_GPL(gb_hd_cport_reserve); void gb_hd_cport_release_reserved(struct gb_host_device *hd, u16 cport_id) { struct ida *id_map = &hd->cport_id_map; ida_free(id_map, cport_id); } EXPORT_SYMBOL_GPL(gb_hd_cport_release_reserved); /* Locking: Caller guarantees serialisation */ int gb_hd_cport_allocate(struct gb_host_device *hd, int cport_id, unsigned long flags) { struct ida *id_map = &hd->cport_id_map; int ida_start, ida_end; if (hd->driver->cport_allocate) return hd->driver->cport_allocate(hd, cport_id, flags); if (cport_id < 0) { ida_start = 0; ida_end = hd->num_cports - 1; } else if (cport_id < hd->num_cports) { ida_start = cport_id; ida_end = cport_id; } else { dev_err(&hd->dev, "cport %d not available\n", cport_id); return -EINVAL; } return ida_alloc_range(id_map, ida_start, ida_end, GFP_KERNEL); } /* Locking: Caller guarantees serialisation */ void gb_hd_cport_release(struct gb_host_device *hd, u16 cport_id) { if (hd->driver->cport_release) { hd->driver->cport_release(hd, cport_id); return; } ida_free(&hd->cport_id_map, cport_id); } static void gb_hd_release(struct device *dev) { struct gb_host_device *hd = to_gb_host_device(dev); trace_gb_hd_release(hd); if (hd->svc) gb_svc_put(hd->svc); ida_free(&gb_hd_bus_id_map, hd->bus_id); ida_destroy(&hd->cport_id_map); kfree(hd); } const struct device_type greybus_hd_type = { .name = "greybus_host_device", .release = gb_hd_release, }; struct gb_host_device *gb_hd_create(struct gb_hd_driver *driver, struct device *parent, size_t buffer_size_max, size_t num_cports) { struct gb_host_device *hd; int ret; /* * Validate that the driver implements all of the callbacks * so that we don't have to every time we make them. */ if ((!driver->message_send) || (!driver->message_cancel)) { dev_err(parent, "mandatory hd-callbacks missing\n"); return ERR_PTR(-EINVAL); } if (buffer_size_max < GB_OPERATION_MESSAGE_SIZE_MIN) { dev_err(parent, "greybus host-device buffers too small\n"); return ERR_PTR(-EINVAL); } if (num_cports == 0 || num_cports > CPORT_ID_MAX + 1) { dev_err(parent, "Invalid number of CPorts: %zu\n", num_cports); return ERR_PTR(-EINVAL); } /* * Make sure to never allocate messages larger than what the Greybus * protocol supports. */ if (buffer_size_max > GB_OPERATION_MESSAGE_SIZE_MAX) { dev_warn(parent, "limiting buffer size to %u\n", GB_OPERATION_MESSAGE_SIZE_MAX); buffer_size_max = GB_OPERATION_MESSAGE_SIZE_MAX; } hd = kzalloc(sizeof(*hd) + driver->hd_priv_size, GFP_KERNEL); if (!hd) return ERR_PTR(-ENOMEM); ret = ida_alloc_min(&gb_hd_bus_id_map, 1, GFP_KERNEL); if (ret < 0) { kfree(hd); return ERR_PTR(ret); } hd->bus_id = ret; hd->driver = driver; INIT_LIST_HEAD(&hd->modules); INIT_LIST_HEAD(&hd->connections); ida_init(&hd->cport_id_map); hd->buffer_size_max = buffer_size_max; hd->num_cports = num_cports; hd->dev.parent = parent; hd->dev.bus = &greybus_bus_type; hd->dev.type = &greybus_hd_type; hd->dev.groups = bus_groups; hd->dev.dma_mask = hd->dev.parent->dma_mask; device_initialize(&hd->dev); dev_set_name(&hd->dev, "greybus%d", hd->bus_id); trace_gb_hd_create(hd); hd->svc = gb_svc_create(hd); if (!hd->svc) { dev_err(&hd->dev, "failed to create svc\n"); put_device(&hd->dev); return ERR_PTR(-ENOMEM); } return hd; } EXPORT_SYMBOL_GPL(gb_hd_create); int gb_hd_add(struct gb_host_device *hd) { int ret; ret = device_add(&hd->dev); if (ret) return ret; ret = gb_svc_add(hd->svc); if (ret) { device_del(&hd->dev); return ret; } trace_gb_hd_add(hd); return 0; } EXPORT_SYMBOL_GPL(gb_hd_add); void gb_hd_del(struct gb_host_device *hd) { trace_gb_hd_del(hd); /* * Tear down the svc and flush any on-going hotplug processing before * removing the remaining interfaces. */ gb_svc_del(hd->svc); device_del(&hd->dev); } EXPORT_SYMBOL_GPL(gb_hd_del); void gb_hd_shutdown(struct gb_host_device *hd) { gb_svc_del(hd->svc); } EXPORT_SYMBOL_GPL(gb_hd_shutdown); void gb_hd_put(struct gb_host_device *hd) { put_device(&hd->dev); } EXPORT_SYMBOL_GPL(gb_hd_put); int __init gb_hd_init(void) { ida_init(&gb_hd_bus_id_map); return 0; } void gb_hd_exit(void) { ida_destroy(&gb_hd_bus_id_map); }
40 57 26 1 107 105 105 78 105 21 104 55 3 7 40 102 84 78 64 102 100 86 70 71 5 107 108 90 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 // SPDX-License-Identifier: GPL-2.0 /* * linux/fs/isofs/namei.c * * (C) 1992 Eric Youngdale Modified for ISO 9660 filesystem. * * (C) 1991 Linus Torvalds - minix filesystem */ #include <linux/gfp.h> #include "isofs.h" /* * ok, we cannot use strncmp, as the name is not in our data space. * Thus we'll have to use isofs_match. No big problem. Match also makes * some sanity tests. */ static int isofs_cmp(struct dentry *dentry, const char *compare, int dlen) { struct qstr qstr; qstr.name = compare; qstr.len = dlen; if (likely(!dentry->d_op)) return dentry->d_name.len != dlen || memcmp(dentry->d_name.name, compare, dlen); return dentry->d_op->d_compare(NULL, dentry->d_name.len, dentry->d_name.name, &qstr); } /* * isofs_find_entry() * * finds an entry in the specified directory with the wanted name. It * returns the inode number of the found entry, or 0 on error. */ static unsigned long isofs_find_entry(struct inode *dir, struct dentry *dentry, unsigned long *block_rv, unsigned long *offset_rv, char *tmpname, struct iso_directory_record *tmpde) { unsigned long bufsize = ISOFS_BUFFER_SIZE(dir); unsigned char bufbits = ISOFS_BUFFER_BITS(dir); unsigned long block, f_pos, offset, block_saved, offset_saved; struct buffer_head *bh = NULL; struct isofs_sb_info *sbi = ISOFS_SB(dir->i_sb); if (!ISOFS_I(dir)->i_first_extent) return 0; f_pos = 0; offset = 0; block = 0; while (f_pos < dir->i_size) { struct iso_directory_record *de; int de_len, match, i, dlen; char *dpnt; if (!bh) { bh = isofs_bread(dir, block); if (!bh) return 0; } de = (struct iso_directory_record *) (bh->b_data + offset); de_len = *(unsigned char *) de; if (!de_len) { brelse(bh); bh = NULL; f_pos = (f_pos + ISOFS_BLOCK_SIZE) & ~(ISOFS_BLOCK_SIZE - 1); block = f_pos >> bufbits; offset = 0; continue; } block_saved = bh->b_blocknr; offset_saved = offset; offset += de_len; f_pos += de_len; /* Make sure we have a full directory entry */ if (offset >= bufsize) { int slop = bufsize - offset + de_len; memcpy(tmpde, de, slop); offset &= bufsize - 1; block++; brelse(bh); bh = NULL; if (offset) { bh = isofs_bread(dir, block); if (!bh) return 0; memcpy((void *) tmpde + slop, bh->b_data, offset); } de = tmpde; } dlen = de->name_len[0]; dpnt = de->name; /* Basic sanity check, whether name doesn't exceed dir entry */ if (de_len < dlen + sizeof(struct iso_directory_record)) { printk(KERN_NOTICE "iso9660: Corrupted directory entry" " in block %lu of inode %lu\n", block, dir->i_ino); brelse(bh); return 0; } if (sbi->s_rock && ((i = get_rock_ridge_filename(de, tmpname, dir)))) { dlen = i; /* possibly -1 */ dpnt = tmpname; #ifdef CONFIG_JOLIET } else if (sbi->s_joliet_level) { dlen = get_joliet_filename(de, tmpname, dir); dpnt = tmpname; #endif } else if (sbi->s_mapping == 'a') { dlen = get_acorn_filename(de, tmpname, dir); dpnt = tmpname; } else if (sbi->s_mapping == 'n') { dlen = isofs_name_translate(de, tmpname, dir); dpnt = tmpname; } /* * Skip hidden or associated files unless hide or showassoc, * respectively, is set */ match = 0; if (dlen > 0 && (!sbi->s_hide || (!(de->flags[-sbi->s_high_sierra] & 1))) && (sbi->s_showassoc || (!(de->flags[-sbi->s_high_sierra] & 4)))) { if (dpnt && (dlen > 1 || dpnt[0] > 1)) match = (isofs_cmp(dentry, dpnt, dlen) == 0); } if (match) { isofs_normalize_block_and_offset(de, &block_saved, &offset_saved); *block_rv = block_saved; *offset_rv = offset_saved; brelse(bh); return 1; } } brelse(bh); return 0; } struct dentry *isofs_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) { int found; unsigned long block; unsigned long offset; struct inode *inode; struct page *page; page = alloc_page(GFP_USER); if (!page) return ERR_PTR(-ENOMEM); found = isofs_find_entry(dir, dentry, &block, &offset, page_address(page), 1024 + page_address(page)); __free_page(page); inode = found ? isofs_iget(dir->i_sb, block, offset) : NULL; return d_splice_alias(inode, dentry); }
210 2322 1391 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_PID_NS_H #define _LINUX_PID_NS_H #include <linux/sched.h> #include <linux/bug.h> #include <linux/mm.h> #include <linux/workqueue.h> #include <linux/threads.h> #include <linux/nsproxy.h> #include <linux/ns_common.h> #include <linux/idr.h> /* MAX_PID_NS_LEVEL is needed for limiting size of 'struct pid' */ #define MAX_PID_NS_LEVEL 32 struct fs_pin; #if defined(CONFIG_SYSCTL) && defined(CONFIG_MEMFD_CREATE) /* modes for vm.memfd_noexec sysctl */ #define MEMFD_NOEXEC_SCOPE_EXEC 0 /* MFD_EXEC implied if unset */ #define MEMFD_NOEXEC_SCOPE_NOEXEC_SEAL 1 /* MFD_NOEXEC_SEAL implied if unset */ #define MEMFD_NOEXEC_SCOPE_NOEXEC_ENFORCED 2 /* same as 1, except MFD_EXEC rejected */ #endif struct pid_namespace { struct idr idr; struct rcu_head rcu; unsigned int pid_allocated; struct task_struct *child_reaper; struct kmem_cache *pid_cachep; unsigned int level; int pid_max; struct pid_namespace *parent; #ifdef CONFIG_BSD_PROCESS_ACCT struct fs_pin *bacct; #endif struct user_namespace *user_ns; struct ucounts *ucounts; int reboot; /* group exit code if this pidns was rebooted */ struct ns_common ns; struct work_struct work; #ifdef CONFIG_SYSCTL struct ctl_table_set set; struct ctl_table_header *sysctls; #if defined(CONFIG_MEMFD_CREATE) int memfd_noexec_scope; #endif #endif } __randomize_layout; extern struct pid_namespace init_pid_ns; #define PIDNS_ADDING (1U << 31) #ifdef CONFIG_PID_NS static inline struct pid_namespace *to_pid_ns(struct ns_common *ns) { return container_of(ns, struct pid_namespace, ns); } static inline struct pid_namespace *get_pid_ns(struct pid_namespace *ns) { if (ns != &init_pid_ns) ns_ref_inc(ns); return ns; } #if defined(CONFIG_SYSCTL) && defined(CONFIG_MEMFD_CREATE) static inline int pidns_memfd_noexec_scope(struct pid_namespace *ns) { int scope = MEMFD_NOEXEC_SCOPE_EXEC; for (; ns; ns = ns->parent) scope = max(scope, READ_ONCE(ns->memfd_noexec_scope)); return scope; } #else static inline int pidns_memfd_noexec_scope(struct pid_namespace *ns) { return 0; } #endif extern struct pid_namespace *copy_pid_ns(u64 flags, struct user_namespace *user_ns, struct pid_namespace *ns); extern void zap_pid_ns_processes(struct pid_namespace *pid_ns); extern int reboot_pid_ns(struct pid_namespace *pid_ns, int cmd); extern void put_pid_ns(struct pid_namespace *ns); extern bool pidns_is_ancestor(struct pid_namespace *child, struct pid_namespace *ancestor); #else /* !CONFIG_PID_NS */ #include <linux/err.h> static inline struct pid_namespace *get_pid_ns(struct pid_namespace *ns) { return ns; } static inline int pidns_memfd_noexec_scope(struct pid_namespace *ns) { return 0; } static inline struct pid_namespace *copy_pid_ns(u64 flags, struct user_namespace *user_ns, struct pid_namespace *ns) { if (flags & CLONE_NEWPID) ns = ERR_PTR(-EINVAL); return ns; } static inline void put_pid_ns(struct pid_namespace *ns) { } static inline void zap_pid_ns_processes(struct pid_namespace *ns) { BUG(); } static inline int reboot_pid_ns(struct pid_namespace *pid_ns, int cmd) { return 0; } static inline bool pidns_is_ancestor(struct pid_namespace *child, struct pid_namespace *ancestor) { return false; } #endif /* CONFIG_PID_NS */ extern struct pid_namespace *task_active_pid_ns(struct task_struct *tsk); void pidhash_init(void); void pid_idr_init(void); int register_pidns_sysctls(struct pid_namespace *pidns); void unregister_pidns_sysctls(struct pid_namespace *pidns); static inline bool task_is_in_init_pid_ns(struct task_struct *tsk) { return task_active_pid_ns(tsk) == &init_pid_ns; } #endif /* _LINUX_PID_NS_H */
109 12 2 127 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 /* SPDX-License-Identifier: GPL-2.0 */ #undef TRACE_SYSTEM #define TRACE_SYSTEM snd_pcm #define TRACE_INCLUDE_FILE pcm_trace #if !defined(_PCM_TRACE_H) || defined(TRACE_HEADER_MULTI_READ) #define _PCM_TRACE_H #include <linux/tracepoint.h> TRACE_EVENT(hwptr, TP_PROTO(struct snd_pcm_substream *substream, snd_pcm_uframes_t pos, bool irq), TP_ARGS(substream, pos, irq), TP_STRUCT__entry( __field( bool, in_interrupt ) __field( unsigned int, card ) __field( unsigned int, device ) __field( unsigned int, number ) __field( unsigned int, stream ) __field( snd_pcm_uframes_t, pos ) __field( snd_pcm_uframes_t, period_size ) __field( snd_pcm_uframes_t, buffer_size ) __field( snd_pcm_uframes_t, old_hw_ptr ) __field( snd_pcm_uframes_t, hw_ptr_base ) ), TP_fast_assign( __entry->in_interrupt = (irq); __entry->card = (substream)->pcm->card->number; __entry->device = (substream)->pcm->device; __entry->number = (substream)->number; __entry->stream = (substream)->stream; __entry->pos = (pos); __entry->period_size = (substream)->runtime->period_size; __entry->buffer_size = (substream)->runtime->buffer_size; __entry->old_hw_ptr = (substream)->runtime->status->hw_ptr; __entry->hw_ptr_base = (substream)->runtime->hw_ptr_base; ), TP_printk("pcmC%dD%d%s/sub%d: %s: pos=%lu, old=%lu, base=%lu, period=%lu, buf=%lu", __entry->card, __entry->device, __entry->stream == SNDRV_PCM_STREAM_PLAYBACK ? "p" : "c", __entry->number, __entry->in_interrupt ? "IRQ" : "POS", (unsigned long)__entry->pos, (unsigned long)__entry->old_hw_ptr, (unsigned long)__entry->hw_ptr_base, (unsigned long)__entry->period_size, (unsigned long)__entry->buffer_size) ); TRACE_EVENT(xrun, TP_PROTO(struct snd_pcm_substream *substream), TP_ARGS(substream), TP_STRUCT__entry( __field( unsigned int, card ) __field( unsigned int, device ) __field( unsigned int, number ) __field( unsigned int, stream ) __field( snd_pcm_uframes_t, period_size ) __field( snd_pcm_uframes_t, buffer_size ) __field( snd_pcm_uframes_t, old_hw_ptr ) __field( snd_pcm_uframes_t, hw_ptr_base ) ), TP_fast_assign( __entry->card = (substream)->pcm->card->number; __entry->device = (substream)->pcm->device; __entry->number = (substream)->number; __entry->stream = (substream)->stream; __entry->period_size = (substream)->runtime->period_size; __entry->buffer_size = (substream)->runtime->buffer_size; __entry->old_hw_ptr = (substream)->runtime->status->hw_ptr; __entry->hw_ptr_base = (substream)->runtime->hw_ptr_base; ), TP_printk("pcmC%dD%d%s/sub%d: XRUN: old=%lu, base=%lu, period=%lu, buf=%lu", __entry->card, __entry->device, __entry->stream == SNDRV_PCM_STREAM_PLAYBACK ? "p" : "c", __entry->number, (unsigned long)__entry->old_hw_ptr, (unsigned long)__entry->hw_ptr_base, (unsigned long)__entry->period_size, (unsigned long)__entry->buffer_size) ); TRACE_EVENT(hw_ptr_error, TP_PROTO(struct snd_pcm_substream *substream, const char *why), TP_ARGS(substream, why), TP_STRUCT__entry( __field( unsigned int, card ) __field( unsigned int, device ) __field( unsigned int, number ) __field( unsigned int, stream ) __string( reason, why ) ), TP_fast_assign( __entry->card = (substream)->pcm->card->number; __entry->device = (substream)->pcm->device; __entry->number = (substream)->number; __entry->stream = (substream)->stream; __assign_str(reason); ), TP_printk("pcmC%dD%d%s/sub%d: ERROR: %s", __entry->card, __entry->device, __entry->stream == SNDRV_PCM_STREAM_PLAYBACK ? "p" : "c", __entry->number, __get_str(reason)) ); TRACE_EVENT(applptr, TP_PROTO(struct snd_pcm_substream *substream, snd_pcm_uframes_t prev, snd_pcm_uframes_t curr), TP_ARGS(substream, prev, curr), TP_STRUCT__entry( __field( unsigned int, card ) __field( unsigned int, device ) __field( unsigned int, number ) __field( unsigned int, stream ) __field( snd_pcm_uframes_t, prev ) __field( snd_pcm_uframes_t, curr ) __field( snd_pcm_uframes_t, avail ) __field( snd_pcm_uframes_t, period_size ) __field( snd_pcm_uframes_t, buffer_size ) ), TP_fast_assign( __entry->card = (substream)->pcm->card->number; __entry->device = (substream)->pcm->device; __entry->number = (substream)->number; __entry->stream = (substream)->stream; __entry->prev = (prev); __entry->curr = (curr); __entry->avail = (substream)->stream ? snd_pcm_capture_avail(substream->runtime) : snd_pcm_playback_avail(substream->runtime); __entry->period_size = (substream)->runtime->period_size; __entry->buffer_size = (substream)->runtime->buffer_size; ), TP_printk("pcmC%dD%d%s/sub%d: prev=%lu, curr=%lu, avail=%lu, period=%lu, buf=%lu", __entry->card, __entry->device, __entry->stream ? "c" : "p", __entry->number, __entry->prev, __entry->curr, __entry->avail, __entry->period_size, __entry->buffer_size ) ); #endif /* _PCM_TRACE_H */ /* This part must be outside protection */ #undef TRACE_INCLUDE_PATH #define TRACE_INCLUDE_PATH . #include <trace/define_trace.h>
15 1 15 12 12 11 11 2 11 11 12 12 12 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 // SPDX-License-Identifier: GPL-2.0-only /* * * Copyright (C) 2005 Mike Isely <isely@pobox.com> */ #include "pvrusb2-context.h" #include "pvrusb2-io.h" #include "pvrusb2-ioread.h" #include "pvrusb2-hdw.h" #include "pvrusb2-debug.h" #include <linux/wait.h> #include <linux/kthread.h> #include <linux/errno.h> #include <linux/string.h> #include <linux/slab.h> static struct pvr2_context *pvr2_context_exist_first; static struct pvr2_context *pvr2_context_exist_last; static struct pvr2_context *pvr2_context_notify_first; static struct pvr2_context *pvr2_context_notify_last; static DEFINE_MUTEX(pvr2_context_mutex); static DECLARE_WAIT_QUEUE_HEAD(pvr2_context_sync_data); static DECLARE_WAIT_QUEUE_HEAD(pvr2_context_cleanup_data); static int pvr2_context_cleanup_flag; static int pvr2_context_cleaned_flag; static struct task_struct *pvr2_context_thread_ptr; static void pvr2_context_set_notify(struct pvr2_context *mp, int fl) { int signal_flag = 0; mutex_lock(&pvr2_context_mutex); if (fl) { if (!mp->notify_flag) { signal_flag = (pvr2_context_notify_first == NULL); mp->notify_prev = pvr2_context_notify_last; mp->notify_next = NULL; pvr2_context_notify_last = mp; if (mp->notify_prev) { mp->notify_prev->notify_next = mp; } else { pvr2_context_notify_first = mp; } mp->notify_flag = !0; } } else { if (mp->notify_flag) { mp->notify_flag = 0; if (mp->notify_next) { mp->notify_next->notify_prev = mp->notify_prev; } else { pvr2_context_notify_last = mp->notify_prev; } if (mp->notify_prev) { mp->notify_prev->notify_next = mp->notify_next; } else { pvr2_context_notify_first = mp->notify_next; } } } mutex_unlock(&pvr2_context_mutex); if (signal_flag) wake_up(&pvr2_context_sync_data); } static void pvr2_context_destroy(struct pvr2_context *mp) { pvr2_trace(PVR2_TRACE_CTXT,"pvr2_context %p (destroy)",mp); pvr2_hdw_destroy(mp->hdw); pvr2_context_set_notify(mp, 0); mutex_lock(&pvr2_context_mutex); if (mp->exist_next) { mp->exist_next->exist_prev = mp->exist_prev; } else { pvr2_context_exist_last = mp->exist_prev; } if (mp->exist_prev) { mp->exist_prev->exist_next = mp->exist_next; } else { pvr2_context_exist_first = mp->exist_next; } if (!pvr2_context_exist_first) { /* Trigger wakeup on control thread in case it is waiting for an exit condition. */ wake_up(&pvr2_context_sync_data); } mutex_unlock(&pvr2_context_mutex); kfree(mp); } static void pvr2_context_notify(void *ptr) { struct pvr2_context *mp = ptr; pvr2_context_set_notify(mp,!0); } static void pvr2_context_check(struct pvr2_context *mp) { struct pvr2_channel *ch1, *ch2; pvr2_trace(PVR2_TRACE_CTXT, "pvr2_context %p (notify)", mp); if (!mp->initialized_flag && !mp->disconnect_flag) { mp->initialized_flag = !0; pvr2_trace(PVR2_TRACE_CTXT, "pvr2_context %p (initialize)", mp); /* Finish hardware initialization */ if (pvr2_hdw_initialize(mp->hdw, pvr2_context_notify, mp)) { mp->video_stream.stream = pvr2_hdw_get_video_stream(mp->hdw); /* Trigger interface initialization. By doing this here initialization runs in our own safe and cozy thread context. */ if (mp->setup_func) mp->setup_func(mp); } else { pvr2_trace(PVR2_TRACE_CTXT, "pvr2_context %p (thread skipping setup)", mp); /* Even though initialization did not succeed, we're still going to continue anyway. We need to do this in order to await the expected disconnect (which we will detect in the normal course of operation). */ } } for (ch1 = mp->mc_first; ch1; ch1 = ch2) { ch2 = ch1->mc_next; if (ch1->check_func) ch1->check_func(ch1); } if (mp->disconnect_flag && !mp->mc_first) { /* Go away... */ pvr2_context_destroy(mp); return; } } static int pvr2_context_shutok(void) { return pvr2_context_cleanup_flag && (pvr2_context_exist_first == NULL); } static int pvr2_context_thread_func(void *foo) { struct pvr2_context *mp; pvr2_trace(PVR2_TRACE_CTXT,"pvr2_context thread start"); do { while ((mp = pvr2_context_notify_first) != NULL) { pvr2_context_set_notify(mp, 0); pvr2_context_check(mp); } wait_event_interruptible( pvr2_context_sync_data, ((pvr2_context_notify_first != NULL) || pvr2_context_shutok())); } while (!pvr2_context_shutok()); pvr2_context_cleaned_flag = !0; wake_up(&pvr2_context_cleanup_data); pvr2_trace(PVR2_TRACE_CTXT,"pvr2_context thread cleaned up"); wait_event_interruptible( pvr2_context_sync_data, kthread_should_stop()); pvr2_trace(PVR2_TRACE_CTXT,"pvr2_context thread end"); return 0; } int pvr2_context_global_init(void) { pvr2_context_thread_ptr = kthread_run(pvr2_context_thread_func, NULL, "pvrusb2-context"); return IS_ERR(pvr2_context_thread_ptr) ? -ENOMEM : 0; } void pvr2_context_global_done(void) { pvr2_context_cleanup_flag = !0; wake_up(&pvr2_context_sync_data); wait_event_interruptible( pvr2_context_cleanup_data, pvr2_context_cleaned_flag); kthread_stop(pvr2_context_thread_ptr); } struct pvr2_context *pvr2_context_create( struct usb_interface *intf, const struct usb_device_id *devid, void (*setup_func)(struct pvr2_context *)) { struct pvr2_context *mp = NULL; mp = kzalloc(sizeof(*mp),GFP_KERNEL); if (!mp) goto done; pvr2_trace(PVR2_TRACE_CTXT,"pvr2_context %p (create)",mp); mp->setup_func = setup_func; mutex_init(&mp->mutex); mutex_lock(&pvr2_context_mutex); mp->exist_prev = pvr2_context_exist_last; mp->exist_next = NULL; pvr2_context_exist_last = mp; if (mp->exist_prev) { mp->exist_prev->exist_next = mp; } else { pvr2_context_exist_first = mp; } mutex_unlock(&pvr2_context_mutex); mp->hdw = pvr2_hdw_create(intf,devid); if (!mp->hdw) { pvr2_context_destroy(mp); mp = NULL; goto done; } pvr2_context_set_notify(mp, !0); done: return mp; } static void pvr2_context_reset_input_limits(struct pvr2_context *mp) { unsigned int tmsk,mmsk; struct pvr2_channel *cp; struct pvr2_hdw *hdw = mp->hdw; mmsk = pvr2_hdw_get_input_available(hdw); tmsk = mmsk; for (cp = mp->mc_first; cp; cp = cp->mc_next) { if (!cp->input_mask) continue; tmsk &= cp->input_mask; } pvr2_hdw_set_input_allowed(hdw,mmsk,tmsk); pvr2_hdw_commit_ctl(hdw); } static void pvr2_context_enter(struct pvr2_context *mp) { mutex_lock(&mp->mutex); } static void pvr2_context_exit(struct pvr2_context *mp) { int destroy_flag = 0; if (!(mp->mc_first || !mp->disconnect_flag)) { destroy_flag = !0; } mutex_unlock(&mp->mutex); if (destroy_flag) pvr2_context_notify(mp); } void pvr2_context_disconnect(struct pvr2_context *mp) { pvr2_hdw_disconnect(mp->hdw); if (!pvr2_context_shutok()) pvr2_context_notify(mp); mp->disconnect_flag = !0; } void pvr2_channel_init(struct pvr2_channel *cp,struct pvr2_context *mp) { pvr2_context_enter(mp); cp->hdw = mp->hdw; cp->mc_head = mp; cp->mc_next = NULL; cp->mc_prev = mp->mc_last; if (mp->mc_last) { mp->mc_last->mc_next = cp; } else { mp->mc_first = cp; } mp->mc_last = cp; pvr2_context_exit(mp); } static void pvr2_channel_disclaim_stream(struct pvr2_channel *cp) { if (!cp->stream) return; pvr2_stream_kill(cp->stream->stream); cp->stream->user = NULL; cp->stream = NULL; } void pvr2_channel_done(struct pvr2_channel *cp) { struct pvr2_context *mp = cp->mc_head; pvr2_context_enter(mp); cp->input_mask = 0; pvr2_channel_disclaim_stream(cp); pvr2_context_reset_input_limits(mp); if (cp->mc_next) { cp->mc_next->mc_prev = cp->mc_prev; } else { mp->mc_last = cp->mc_prev; } if (cp->mc_prev) { cp->mc_prev->mc_next = cp->mc_next; } else { mp->mc_first = cp->mc_next; } cp->hdw = NULL; pvr2_context_exit(mp); } int pvr2_channel_limit_inputs(struct pvr2_channel *cp,unsigned int cmsk) { unsigned int tmsk,mmsk; int ret = 0; struct pvr2_channel *p2; struct pvr2_hdw *hdw = cp->hdw; mmsk = pvr2_hdw_get_input_available(hdw); cmsk &= mmsk; if (cmsk == cp->input_mask) { /* No change; nothing to do */ return 0; } pvr2_context_enter(cp->mc_head); do { if (!cmsk) { cp->input_mask = 0; pvr2_context_reset_input_limits(cp->mc_head); break; } tmsk = mmsk; for (p2 = cp->mc_head->mc_first; p2; p2 = p2->mc_next) { if (p2 == cp) continue; if (!p2->input_mask) continue; tmsk &= p2->input_mask; } if (!(tmsk & cmsk)) { ret = -EPERM; break; } tmsk &= cmsk; if ((ret = pvr2_hdw_set_input_allowed(hdw,mmsk,tmsk)) != 0) { /* Internal failure changing allowed list; probably should not happen, but react if it does. */ break; } cp->input_mask = cmsk; pvr2_hdw_commit_ctl(hdw); } while (0); pvr2_context_exit(cp->mc_head); return ret; } unsigned int pvr2_channel_get_limited_inputs(struct pvr2_channel *cp) { return cp->input_mask; } int pvr2_channel_claim_stream(struct pvr2_channel *cp, struct pvr2_context_stream *sp) { int code = 0; pvr2_context_enter(cp->mc_head); do { if (sp == cp->stream) break; if (sp && sp->user) { code = -EBUSY; break; } pvr2_channel_disclaim_stream(cp); if (!sp) break; sp->user = cp; cp->stream = sp; } while (0); pvr2_context_exit(cp->mc_head); return code; } // This is the marker for the real beginning of a legitimate mpeg2 stream. static char stream_sync_key[] = { 0x00, 0x00, 0x01, 0xba, }; struct pvr2_ioread *pvr2_channel_create_mpeg_stream( struct pvr2_context_stream *sp) { struct pvr2_ioread *cp; cp = pvr2_ioread_create(); if (!cp) return NULL; pvr2_ioread_setup(cp,sp->stream); pvr2_ioread_set_sync_key(cp,stream_sync_key,sizeof(stream_sync_key)); return cp; }
5 1 1 3 3 1 1 2 3 1 2 2 1 1 8 6 2 1 7 5 2 3 5 3 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 // SPDX-License-Identifier: GPL-2.0 /* * USB HandSpring Visor, Palm m50x, and Sony Clie driver * (supports all of the Palm OS USB devices) * * Copyright (C) 1999 - 2004 * Greg Kroah-Hartman (greg@kroah.com) * * See Documentation/usb/usb-serial.rst for more information on using this * driver * */ #include <linux/kernel.h> #include <linux/errno.h> #include <linux/slab.h> #include <linux/tty.h> #include <linux/tty_driver.h> #include <linux/tty_flip.h> #include <linux/module.h> #include <linux/moduleparam.h> #include <linux/spinlock.h> #include <linux/uaccess.h> #include <linux/usb.h> #include <linux/usb/serial.h> #include <linux/usb/cdc.h> #include "visor.h" /* * Version Information */ #define DRIVER_AUTHOR "Greg Kroah-Hartman <greg@kroah.com>" #define DRIVER_DESC "USB HandSpring Visor / Palm OS driver" /* function prototypes for a handspring visor */ static int visor_open(struct tty_struct *tty, struct usb_serial_port *port); static void visor_close(struct usb_serial_port *port); static int visor_probe(struct usb_serial *serial, const struct usb_device_id *id); static int visor_calc_num_ports(struct usb_serial *serial, struct usb_serial_endpoints *epds); static int clie_5_calc_num_ports(struct usb_serial *serial, struct usb_serial_endpoints *epds); static void visor_read_int_callback(struct urb *urb); static int clie_3_5_startup(struct usb_serial *serial); static int palm_os_3_probe(struct usb_serial *serial, const struct usb_device_id *id); static int palm_os_4_probe(struct usb_serial *serial, const struct usb_device_id *id); static const struct usb_device_id id_table[] = { { USB_DEVICE(HANDSPRING_VENDOR_ID, HANDSPRING_VISOR_ID), .driver_info = (kernel_ulong_t)&palm_os_3_probe }, { USB_DEVICE(HANDSPRING_VENDOR_ID, HANDSPRING_TREO_ID), .driver_info = (kernel_ulong_t)&palm_os_4_probe }, { USB_DEVICE(HANDSPRING_VENDOR_ID, HANDSPRING_TREO600_ID), .driver_info = (kernel_ulong_t)&palm_os_4_probe }, { USB_DEVICE(GSPDA_VENDOR_ID, GSPDA_XPLORE_M68_ID), .driver_info = (kernel_ulong_t)&palm_os_4_probe }, { USB_DEVICE(PALM_VENDOR_ID, PALM_M500_ID), .driver_info = (kernel_ulong_t)&palm_os_4_probe }, { USB_DEVICE(PALM_VENDOR_ID, PALM_M505_ID), .driver_info = (kernel_ulong_t)&palm_os_4_probe }, { USB_DEVICE(PALM_VENDOR_ID, PALM_M515_ID), .driver_info = (kernel_ulong_t)&palm_os_4_probe }, { USB_DEVICE(PALM_VENDOR_ID, PALM_I705_ID), .driver_info = (kernel_ulong_t)&palm_os_4_probe }, { USB_DEVICE(PALM_VENDOR_ID, PALM_M100_ID), .driver_info = (kernel_ulong_t)&palm_os_4_probe }, { USB_DEVICE(PALM_VENDOR_ID, PALM_M125_ID), .driver_info = (kernel_ulong_t)&palm_os_4_probe }, { USB_DEVICE(PALM_VENDOR_ID, PALM_M130_ID), .driver_info = (kernel_ulong_t)&palm_os_4_probe }, { USB_DEVICE(PALM_VENDOR_ID, PALM_TUNGSTEN_T_ID), .driver_info = (kernel_ulong_t)&palm_os_4_probe }, { USB_DEVICE(PALM_VENDOR_ID, PALM_TREO_650), .driver_info = (kernel_ulong_t)&palm_os_4_probe }, { USB_DEVICE(PALM_VENDOR_ID, PALM_TUNGSTEN_Z_ID), .driver_info = (kernel_ulong_t)&palm_os_4_probe }, { USB_DEVICE(PALM_VENDOR_ID, PALM_ZIRE_ID), .driver_info = (kernel_ulong_t)&palm_os_4_probe }, { USB_DEVICE(SONY_VENDOR_ID, SONY_CLIE_4_0_ID), .driver_info = (kernel_ulong_t)&palm_os_4_probe }, { USB_DEVICE(SONY_VENDOR_ID, SONY_CLIE_S360_ID), .driver_info = (kernel_ulong_t)&palm_os_4_probe }, { USB_DEVICE(SONY_VENDOR_ID, SONY_CLIE_4_1_ID), .driver_info = (kernel_ulong_t)&palm_os_4_probe }, { USB_DEVICE(SONY_VENDOR_ID, SONY_CLIE_NX60_ID), .driver_info = (kernel_ulong_t)&palm_os_4_probe }, { USB_DEVICE(SONY_VENDOR_ID, SONY_CLIE_NZ90V_ID), .driver_info = (kernel_ulong_t)&palm_os_4_probe }, { USB_DEVICE(SONY_VENDOR_ID, SONY_CLIE_TJ25_ID), .driver_info = (kernel_ulong_t)&palm_os_4_probe }, { USB_DEVICE(ACER_VENDOR_ID, ACER_S10_ID), .driver_info = (kernel_ulong_t)&palm_os_4_probe }, { USB_DEVICE_INTERFACE_CLASS(SAMSUNG_VENDOR_ID, SAMSUNG_SCH_I330_ID, 0xff), .driver_info = (kernel_ulong_t)&palm_os_4_probe }, { USB_DEVICE(SAMSUNG_VENDOR_ID, SAMSUNG_SPH_I500_ID), .driver_info = (kernel_ulong_t)&palm_os_4_probe }, { USB_DEVICE(TAPWAVE_VENDOR_ID, TAPWAVE_ZODIAC_ID), .driver_info = (kernel_ulong_t)&palm_os_4_probe }, { USB_DEVICE(GARMIN_VENDOR_ID, GARMIN_IQUE_3600_ID), .driver_info = (kernel_ulong_t)&palm_os_4_probe }, { USB_DEVICE(ACEECA_VENDOR_ID, ACEECA_MEZ1000_ID), .driver_info = (kernel_ulong_t)&palm_os_4_probe }, { USB_DEVICE(KYOCERA_VENDOR_ID, KYOCERA_7135_ID), .driver_info = (kernel_ulong_t)&palm_os_4_probe }, { USB_DEVICE(FOSSIL_VENDOR_ID, FOSSIL_ABACUS_ID), .driver_info = (kernel_ulong_t)&palm_os_4_probe }, { } /* Terminating entry */ }; static const struct usb_device_id clie_id_5_table[] = { { USB_DEVICE(SONY_VENDOR_ID, SONY_CLIE_UX50_ID), .driver_info = (kernel_ulong_t)&palm_os_4_probe }, { } /* Terminating entry */ }; static const struct usb_device_id clie_id_3_5_table[] = { { USB_DEVICE(SONY_VENDOR_ID, SONY_CLIE_3_5_ID) }, { } /* Terminating entry */ }; static const struct usb_device_id id_table_combined[] = { { USB_DEVICE(HANDSPRING_VENDOR_ID, HANDSPRING_VISOR_ID) }, { USB_DEVICE(HANDSPRING_VENDOR_ID, HANDSPRING_TREO_ID) }, { USB_DEVICE(HANDSPRING_VENDOR_ID, HANDSPRING_TREO600_ID) }, { USB_DEVICE(GSPDA_VENDOR_ID, GSPDA_XPLORE_M68_ID) }, { USB_DEVICE(PALM_VENDOR_ID, PALM_M500_ID) }, { USB_DEVICE(PALM_VENDOR_ID, PALM_M505_ID) }, { USB_DEVICE(PALM_VENDOR_ID, PALM_M515_ID) }, { USB_DEVICE(PALM_VENDOR_ID, PALM_I705_ID) }, { USB_DEVICE(PALM_VENDOR_ID, PALM_M100_ID) }, { USB_DEVICE(PALM_VENDOR_ID, PALM_M125_ID) }, { USB_DEVICE(PALM_VENDOR_ID, PALM_M130_ID) }, { USB_DEVICE(PALM_VENDOR_ID, PALM_TUNGSTEN_T_ID) }, { USB_DEVICE(PALM_VENDOR_ID, PALM_TREO_650) }, { USB_DEVICE(PALM_VENDOR_ID, PALM_TUNGSTEN_Z_ID) }, { USB_DEVICE(PALM_VENDOR_ID, PALM_ZIRE_ID) }, { USB_DEVICE(SONY_VENDOR_ID, SONY_CLIE_3_5_ID) }, { USB_DEVICE(SONY_VENDOR_ID, SONY_CLIE_4_0_ID) }, { USB_DEVICE(SONY_VENDOR_ID, SONY_CLIE_S360_ID) }, { USB_DEVICE(SONY_VENDOR_ID, SONY_CLIE_4_1_ID) }, { USB_DEVICE(SONY_VENDOR_ID, SONY_CLIE_NX60_ID) }, { USB_DEVICE(SONY_VENDOR_ID, SONY_CLIE_NZ90V_ID) }, { USB_DEVICE(SONY_VENDOR_ID, SONY_CLIE_UX50_ID) }, { USB_DEVICE(SONY_VENDOR_ID, SONY_CLIE_TJ25_ID) }, { USB_DEVICE(SAMSUNG_VENDOR_ID, SAMSUNG_SCH_I330_ID) }, { USB_DEVICE(SAMSUNG_VENDOR_ID, SAMSUNG_SPH_I500_ID) }, { USB_DEVICE(TAPWAVE_VENDOR_ID, TAPWAVE_ZODIAC_ID) }, { USB_DEVICE(GARMIN_VENDOR_ID, GARMIN_IQUE_3600_ID) }, { USB_DEVICE(ACEECA_VENDOR_ID, ACEECA_MEZ1000_ID) }, { USB_DEVICE(KYOCERA_VENDOR_ID, KYOCERA_7135_ID) }, { USB_DEVICE(FOSSIL_VENDOR_ID, FOSSIL_ABACUS_ID) }, { } /* Terminating entry */ }; MODULE_DEVICE_TABLE(usb, id_table_combined); /* All of the device info needed for the Handspring Visor, and Palm 4.0 devices */ static struct usb_serial_driver handspring_device = { .driver = { .name = "visor", }, .description = "Handspring Visor / Palm OS", .id_table = id_table, .num_ports = 2, .bulk_out_size = 256, .open = visor_open, .close = visor_close, .throttle = usb_serial_generic_throttle, .unthrottle = usb_serial_generic_unthrottle, .probe = visor_probe, .calc_num_ports = visor_calc_num_ports, .read_int_callback = visor_read_int_callback, }; /* All of the device info needed for the Clie UX50, TH55 Palm 5.0 devices */ static struct usb_serial_driver clie_5_device = { .driver = { .name = "clie_5", }, .description = "Sony Clie 5.0", .id_table = clie_id_5_table, .num_ports = 2, .num_bulk_out = 2, .bulk_out_size = 256, .open = visor_open, .close = visor_close, .throttle = usb_serial_generic_throttle, .unthrottle = usb_serial_generic_unthrottle, .probe = visor_probe, .calc_num_ports = clie_5_calc_num_ports, .read_int_callback = visor_read_int_callback, }; /* device info for the Sony Clie OS version 3.5 */ static struct usb_serial_driver clie_3_5_device = { .driver = { .name = "clie_3.5", }, .description = "Sony Clie 3.5", .id_table = clie_id_3_5_table, .num_ports = 1, .bulk_out_size = 256, .open = visor_open, .close = visor_close, .throttle = usb_serial_generic_throttle, .unthrottle = usb_serial_generic_unthrottle, .attach = clie_3_5_startup, }; static struct usb_serial_driver * const serial_drivers[] = { &handspring_device, &clie_5_device, &clie_3_5_device, NULL }; /****************************************************************************** * Handspring Visor specific driver functions ******************************************************************************/ static int visor_open(struct tty_struct *tty, struct usb_serial_port *port) { int result = 0; if (!port->read_urb) { /* this is needed for some brain dead Sony devices */ dev_err(&port->dev, "Device lied about number of ports, please use a lower one.\n"); return -ENODEV; } /* Start reading from the device */ result = usb_serial_generic_open(tty, port); if (result) goto exit; if (port->interrupt_in_urb) { dev_dbg(&port->dev, "adding interrupt input for treo\n"); result = usb_submit_urb(port->interrupt_in_urb, GFP_KERNEL); if (result) dev_err(&port->dev, "%s - failed submitting interrupt urb, error %d\n", __func__, result); } exit: return result; } static void visor_close(struct usb_serial_port *port) { unsigned char *transfer_buffer; usb_serial_generic_close(port); usb_kill_urb(port->interrupt_in_urb); transfer_buffer = kmalloc(0x12, GFP_KERNEL); if (!transfer_buffer) return; usb_control_msg(port->serial->dev, usb_rcvctrlpipe(port->serial->dev, 0), VISOR_CLOSE_NOTIFICATION, 0xc2, 0x0000, 0x0000, transfer_buffer, 0x12, 300); kfree(transfer_buffer); } static void visor_read_int_callback(struct urb *urb) { struct usb_serial_port *port = urb->context; int status = urb->status; int result; switch (status) { case 0: /* success */ break; case -ECONNRESET: case -ENOENT: case -ESHUTDOWN: /* this urb is terminated, clean up */ dev_dbg(&port->dev, "%s - urb shutting down with status: %d\n", __func__, status); return; default: dev_dbg(&port->dev, "%s - nonzero urb status received: %d\n", __func__, status); goto exit; } /* * This information is still unknown what it can be used for. * If anyone has an idea, please let the author know... * * Rumor has it this endpoint is used to notify when data * is ready to be read from the bulk ones. */ usb_serial_debug_data(&port->dev, __func__, urb->actual_length, urb->transfer_buffer); exit: result = usb_submit_urb(urb, GFP_ATOMIC); if (result) dev_err(&urb->dev->dev, "%s - Error %d submitting interrupt urb\n", __func__, result); } static int palm_os_3_probe(struct usb_serial *serial, const struct usb_device_id *id) { struct device *dev = &serial->dev->dev; struct visor_connection_info *connection_info; unsigned char *transfer_buffer; char *string; int retval = 0; int i; int num_ports = 0; transfer_buffer = kmalloc(sizeof(*connection_info), GFP_KERNEL); if (!transfer_buffer) return -ENOMEM; /* send a get connection info request */ retval = usb_control_msg(serial->dev, usb_rcvctrlpipe(serial->dev, 0), VISOR_GET_CONNECTION_INFORMATION, 0xc2, 0x0000, 0x0000, transfer_buffer, sizeof(*connection_info), 300); if (retval < 0) { dev_err(dev, "%s - error %d getting connection information\n", __func__, retval); goto exit; } if (retval != sizeof(*connection_info)) { dev_err(dev, "Invalid connection information received from device\n"); retval = -ENODEV; goto exit; } connection_info = (struct visor_connection_info *)transfer_buffer; num_ports = le16_to_cpu(connection_info->num_ports); /* Handle devices that report invalid stuff here. */ if (num_ports == 0 || num_ports > 2) { dev_warn(dev, "%s: No valid connect info available\n", serial->type->description); num_ports = 2; } for (i = 0; i < num_ports; ++i) { switch (connection_info->connections[i].port_function_id) { case VISOR_FUNCTION_GENERIC: string = "Generic"; break; case VISOR_FUNCTION_DEBUGGER: string = "Debugger"; break; case VISOR_FUNCTION_HOTSYNC: string = "HotSync"; break; case VISOR_FUNCTION_CONSOLE: string = "Console"; break; case VISOR_FUNCTION_REMOTE_FILE_SYS: string = "Remote File System"; break; default: string = "unknown"; break; } dev_info(dev, "%s: port %d, is for %s use\n", serial->type->description, connection_info->connections[i].port, string); } dev_info(dev, "%s: Number of ports: %d\n", serial->type->description, num_ports); /* * save off our num_ports info so that we can use it in the * calc_num_ports callback */ usb_set_serial_data(serial, (void *)(long)num_ports); /* ask for the number of bytes available, but ignore the response as it is broken */ retval = usb_control_msg(serial->dev, usb_rcvctrlpipe(serial->dev, 0), VISOR_REQUEST_BYTES_AVAILABLE, 0xc2, 0x0000, 0x0005, transfer_buffer, 0x02, 300); if (retval < 0) dev_err(dev, "%s - error %d getting bytes available request\n", __func__, retval); retval = 0; exit: kfree(transfer_buffer); return retval; } static int palm_os_4_probe(struct usb_serial *serial, const struct usb_device_id *id) { struct device *dev = &serial->dev->dev; struct palm_ext_connection_info *connection_info; unsigned char *transfer_buffer; int retval; transfer_buffer = kmalloc(sizeof(*connection_info), GFP_KERNEL); if (!transfer_buffer) return -ENOMEM; retval = usb_control_msg(serial->dev, usb_rcvctrlpipe(serial->dev, 0), PALM_GET_EXT_CONNECTION_INFORMATION, 0xc2, 0x0000, 0x0000, transfer_buffer, sizeof(*connection_info), 300); if (retval < 0) dev_err(dev, "%s - error %d getting connection info\n", __func__, retval); else usb_serial_debug_data(dev, __func__, retval, transfer_buffer); kfree(transfer_buffer); return 0; } static int visor_probe(struct usb_serial *serial, const struct usb_device_id *id) { int retval = 0; int (*startup)(struct usb_serial *serial, const struct usb_device_id *id); /* * some Samsung Android phones in modem mode have the same ID * as SPH-I500, but they are ACM devices, so dont bind to them */ if (id->idVendor == SAMSUNG_VENDOR_ID && id->idProduct == SAMSUNG_SPH_I500_ID && serial->dev->descriptor.bDeviceClass == USB_CLASS_COMM && serial->dev->descriptor.bDeviceSubClass == USB_CDC_SUBCLASS_ACM) return -ENODEV; if (serial->dev->actconfig->desc.bConfigurationValue != 1) { dev_err(&serial->dev->dev, "active config #%d != 1 ??\n", serial->dev->actconfig->desc.bConfigurationValue); return -ENODEV; } if (id->driver_info) { startup = (void *)id->driver_info; retval = startup(serial, id); } return retval; } static int visor_calc_num_ports(struct usb_serial *serial, struct usb_serial_endpoints *epds) { unsigned int vid = le16_to_cpu(serial->dev->descriptor.idVendor); int num_ports = (int)(long)(usb_get_serial_data(serial)); if (num_ports) usb_set_serial_data(serial, NULL); /* * Only swap the bulk endpoints for the Handspring devices with * interrupt in endpoints, which for now are the Treo devices. */ if (!(vid == HANDSPRING_VENDOR_ID || vid == KYOCERA_VENDOR_ID) || epds->num_interrupt_in == 0) goto out; if (epds->num_bulk_in < 2 || epds->num_interrupt_in < 2) { dev_err(&serial->interface->dev, "missing endpoints\n"); return -ENODEV; } /* * It appears that Treos and Kyoceras want to use the * 1st bulk in endpoint to communicate with the 2nd bulk out endpoint, * so let's swap the 1st and 2nd bulk in and interrupt endpoints. * Note that swapping the bulk out endpoints would break lots of * apps that want to communicate on the second port. */ swap(epds->bulk_in[0], epds->bulk_in[1]); swap(epds->interrupt_in[0], epds->interrupt_in[1]); out: return num_ports; } static int clie_5_calc_num_ports(struct usb_serial *serial, struct usb_serial_endpoints *epds) { /* * TH55 registers 2 ports. * Communication in from the UX50/TH55 uses the first bulk-in * endpoint, while communication out to the UX50/TH55 uses the second * bulk-out endpoint. */ /* * FIXME: Should we swap the descriptors instead of using the same * bulk-out endpoint for both ports? */ epds->bulk_out[0] = epds->bulk_out[1]; return serial->type->num_ports; } static int clie_3_5_startup(struct usb_serial *serial) { struct device *dev = &serial->dev->dev; int result; u8 *data; data = kmalloc(1, GFP_KERNEL); if (!data) return -ENOMEM; /* * Note that PEG-300 series devices expect the following two calls. */ /* get the config number */ result = usb_control_msg(serial->dev, usb_rcvctrlpipe(serial->dev, 0), USB_REQ_GET_CONFIGURATION, USB_DIR_IN, 0, 0, data, 1, 3000); if (result < 0) { dev_err(dev, "%s: get config number failed: %d\n", __func__, result); goto out; } if (result != 1) { dev_err(dev, "%s: get config number bad return length: %d\n", __func__, result); result = -EIO; goto out; } /* get the interface number */ result = usb_control_msg(serial->dev, usb_rcvctrlpipe(serial->dev, 0), USB_REQ_GET_INTERFACE, USB_DIR_IN | USB_RECIP_INTERFACE, 0, 0, data, 1, 3000); if (result < 0) { dev_err(dev, "%s: get interface number failed: %d\n", __func__, result); goto out; } if (result != 1) { dev_err(dev, "%s: get interface number bad return length: %d\n", __func__, result); result = -EIO; goto out; } result = 0; out: kfree(data); return result; } module_usb_serial_driver(serial_drivers, id_table_combined); MODULE_AUTHOR(DRIVER_AUTHOR); MODULE_DESCRIPTION(DRIVER_DESC); MODULE_LICENSE("GPL v2");
40 40 1 1 30 10 4 9 29 29 12 1 5 33 38 38 9 9 6 8 2 4 1 6 6 13 13 7 2 11 3 10 11 363 363 354 10 31 18 31 40 31 39 39 33 33 4 4 31 31 31 31 15 15 4 15 2 3 11 14 14 1 15 573 572 6 6 572 210 1 210 1 1 339 338 211 134 3 3 337 336 11 11 2 11 8 8 8 8 7 8 1 8 11 8 11 11 11 8 11 11 8 80 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 // SPDX-License-Identifier: GPL-2.0-or-later /* * Anycast support for IPv6 * Linux INET6 implementation * * Authors: * David L Stevens (dlstevens@us.ibm.com) * * based heavily on net/ipv6/mcast.c */ #include <linux/capability.h> #include <linux/module.h> #include <linux/errno.h> #include <linux/types.h> #include <linux/random.h> #include <linux/string.h> #include <linux/socket.h> #include <linux/sockios.h> #include <linux/net.h> #include <linux/in6.h> #include <linux/netdevice.h> #include <linux/if_arp.h> #include <linux/route.h> #include <linux/init.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/slab.h> #include <net/net_namespace.h> #include <net/sock.h> #include <net/snmp.h> #include <net/ipv6.h> #include <net/protocol.h> #include <net/if_inet6.h> #include <net/ndisc.h> #include <net/addrconf.h> #include <net/ip6_route.h> #include <net/checksum.h> #define IN6_ADDR_HSIZE_SHIFT 8 #define IN6_ADDR_HSIZE BIT(IN6_ADDR_HSIZE_SHIFT) /* anycast address hash table */ static struct hlist_head inet6_acaddr_lst[IN6_ADDR_HSIZE]; static DEFINE_SPINLOCK(acaddr_hash_lock); #define ac_dereference(a, idev) \ rcu_dereference_protected(a, lockdep_is_held(&(idev)->lock)) static int ipv6_dev_ac_dec(struct net_device *dev, const struct in6_addr *addr); static u32 inet6_acaddr_hash(const struct net *net, const struct in6_addr *addr) { u32 val = __ipv6_addr_jhash(addr, net_hash_mix(net)); return hash_32(val, IN6_ADDR_HSIZE_SHIFT); } /* * socket join an anycast group */ int ipv6_sock_ac_join(struct sock *sk, int ifindex, const struct in6_addr *addr) { struct ipv6_pinfo *np = inet6_sk(sk); struct ipv6_ac_socklist *pac = NULL; struct net *net = sock_net(sk); netdevice_tracker dev_tracker; struct net_device *dev = NULL; struct inet6_dev *idev; int err = 0, ishost; if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) return -EPERM; if (ipv6_addr_is_multicast(addr)) return -EINVAL; if (ifindex) dev = netdev_get_by_index(net, ifindex, &dev_tracker, GFP_KERNEL); if (ipv6_chk_addr_and_flags(net, addr, dev, true, 0, IFA_F_TENTATIVE)) { err = -EINVAL; goto error; } pac = sock_kmalloc(sk, sizeof(struct ipv6_ac_socklist), GFP_KERNEL); if (!pac) { err = -ENOMEM; goto error; } pac->acl_next = NULL; pac->acl_addr = *addr; ishost = !READ_ONCE(net->ipv6.devconf_all->forwarding); if (ifindex == 0) { struct rt6_info *rt; rcu_read_lock(); rt = rt6_lookup(net, addr, NULL, 0, NULL, 0); if (rt) { dev = dst_dev_rcu(&rt->dst); netdev_hold(dev, &dev_tracker, GFP_ATOMIC); ip6_rt_put(rt); } else if (ishost) { rcu_read_unlock(); err = -EADDRNOTAVAIL; goto error; } else { /* router, no matching interface: just pick one */ dev = netdev_get_by_flags_rcu(net, &dev_tracker, IFF_UP, IFF_UP | IFF_LOOPBACK); } rcu_read_unlock(); } if (!dev) { err = -ENODEV; goto error; } idev = in6_dev_get(dev); if (!idev) { if (ifindex) err = -ENODEV; else err = -EADDRNOTAVAIL; goto error; } /* reset ishost, now that we have a specific device */ ishost = !READ_ONCE(idev->cnf.forwarding); pac->acl_ifindex = dev->ifindex; /* XXX * For hosts, allow link-local or matching prefix anycasts. * This obviates the need for propagating anycast routes while * still allowing some non-router anycast participation. */ if (!ipv6_chk_prefix(addr, dev)) { if (ishost) err = -EADDRNOTAVAIL; if (err) goto error_idev; } err = __ipv6_dev_ac_inc(idev, addr); if (!err) { pac->acl_next = np->ipv6_ac_list; np->ipv6_ac_list = pac; pac = NULL; } error_idev: in6_dev_put(idev); error: netdev_put(dev, &dev_tracker); if (pac) sock_kfree_s(sk, pac, sizeof(*pac)); return err; } /* * socket leave an anycast group */ int ipv6_sock_ac_drop(struct sock *sk, int ifindex, const struct in6_addr *addr) { struct ipv6_ac_socklist *pac, *prev_pac; struct ipv6_pinfo *np = inet6_sk(sk); struct net *net = sock_net(sk); struct net_device *dev; prev_pac = NULL; for (pac = np->ipv6_ac_list; pac; pac = pac->acl_next) { if ((ifindex == 0 || pac->acl_ifindex == ifindex) && ipv6_addr_equal(&pac->acl_addr, addr)) break; prev_pac = pac; } if (!pac) return -ENOENT; if (prev_pac) prev_pac->acl_next = pac->acl_next; else np->ipv6_ac_list = pac->acl_next; dev = dev_get_by_index(net, pac->acl_ifindex); if (dev) { ipv6_dev_ac_dec(dev, &pac->acl_addr); dev_put(dev); } sock_kfree_s(sk, pac, sizeof(*pac)); return 0; } void __ipv6_sock_ac_close(struct sock *sk) { struct ipv6_pinfo *np = inet6_sk(sk); struct net *net = sock_net(sk); struct net_device *dev = NULL; struct ipv6_ac_socklist *pac; int prev_index = 0; pac = np->ipv6_ac_list; np->ipv6_ac_list = NULL; while (pac) { struct ipv6_ac_socklist *next = pac->acl_next; if (pac->acl_ifindex != prev_index) { dev_put(dev); dev = dev_get_by_index(net, pac->acl_ifindex); prev_index = pac->acl_ifindex; } if (dev) ipv6_dev_ac_dec(dev, &pac->acl_addr); sock_kfree_s(sk, pac, sizeof(*pac)); pac = next; } dev_put(dev); } void ipv6_sock_ac_close(struct sock *sk) { struct ipv6_pinfo *np = inet6_sk(sk); if (!np->ipv6_ac_list) return; __ipv6_sock_ac_close(sk); } static void ipv6_add_acaddr_hash(struct net *net, struct ifacaddr6 *aca) { unsigned int hash = inet6_acaddr_hash(net, &aca->aca_addr); spin_lock(&acaddr_hash_lock); hlist_add_head_rcu(&aca->aca_addr_lst, &inet6_acaddr_lst[hash]); spin_unlock(&acaddr_hash_lock); } static void ipv6_del_acaddr_hash(struct ifacaddr6 *aca) { spin_lock(&acaddr_hash_lock); hlist_del_init_rcu(&aca->aca_addr_lst); spin_unlock(&acaddr_hash_lock); } static void aca_get(struct ifacaddr6 *aca) { refcount_inc(&aca->aca_refcnt); } static void aca_free_rcu(struct rcu_head *h) { struct ifacaddr6 *aca = container_of(h, struct ifacaddr6, rcu); fib6_info_release(aca->aca_rt); kfree(aca); } static void aca_put(struct ifacaddr6 *ac) { if (refcount_dec_and_test(&ac->aca_refcnt)) call_rcu_hurry(&ac->rcu, aca_free_rcu); } static struct ifacaddr6 *aca_alloc(struct fib6_info *f6i, const struct in6_addr *addr) { struct ifacaddr6 *aca; aca = kzalloc(sizeof(*aca), GFP_ATOMIC); if (!aca) return NULL; aca->aca_addr = *addr; fib6_info_hold(f6i); aca->aca_rt = f6i; INIT_HLIST_NODE(&aca->aca_addr_lst); aca->aca_users = 1; /* aca_tstamp should be updated upon changes */ aca->aca_cstamp = aca->aca_tstamp = jiffies; refcount_set(&aca->aca_refcnt, 1); return aca; } static void inet6_ifacaddr_notify(struct net_device *dev, const struct ifacaddr6 *ifaca, int event) { struct inet6_fill_args fillargs = { .event = event, .netnsid = -1, }; struct net *net = dev_net(dev); struct sk_buff *skb; int err = -ENOMEM; skb = nlmsg_new(NLMSG_ALIGN(sizeof(struct ifaddrmsg)) + nla_total_size(sizeof(struct in6_addr)) + nla_total_size(sizeof(struct ifa_cacheinfo)), GFP_KERNEL); if (!skb) goto error; err = inet6_fill_ifacaddr(skb, ifaca, &fillargs); if (err < 0) { pr_err("Failed to fill in anycast addresses (err %d)\n", err); nlmsg_free(skb); goto error; } rtnl_notify(skb, net, 0, RTNLGRP_IPV6_ACADDR, NULL, GFP_KERNEL); return; error: rtnl_set_sk_err(net, RTNLGRP_IPV6_ACADDR, err); } /* * device anycast group inc (add if not found) */ int __ipv6_dev_ac_inc(struct inet6_dev *idev, const struct in6_addr *addr) { struct ifacaddr6 *aca; struct fib6_info *f6i; struct net *net; int err; write_lock_bh(&idev->lock); if (idev->dead) { err = -ENODEV; goto out; } for (aca = ac_dereference(idev->ac_list, idev); aca; aca = ac_dereference(aca->aca_next, idev)) { if (ipv6_addr_equal(&aca->aca_addr, addr)) { aca->aca_users++; err = 0; goto out; } } net = dev_net(idev->dev); f6i = addrconf_f6i_alloc(net, idev, addr, true, GFP_ATOMIC, NULL); if (IS_ERR(f6i)) { err = PTR_ERR(f6i); goto out; } aca = aca_alloc(f6i, addr); if (!aca) { fib6_info_release(f6i); err = -ENOMEM; goto out; } /* Hold this for addrconf_join_solict() below before we unlock, * it is already exposed via idev->ac_list. */ aca_get(aca); aca->aca_next = idev->ac_list; rcu_assign_pointer(idev->ac_list, aca); write_unlock_bh(&idev->lock); ipv6_add_acaddr_hash(net, aca); ip6_ins_rt(net, f6i); addrconf_join_solict(idev->dev, &aca->aca_addr); inet6_ifacaddr_notify(idev->dev, aca, RTM_NEWANYCAST); aca_put(aca); return 0; out: write_unlock_bh(&idev->lock); return err; } /* * device anycast group decrement */ int __ipv6_dev_ac_dec(struct inet6_dev *idev, const struct in6_addr *addr) { struct ifacaddr6 *aca, *prev_aca; write_lock_bh(&idev->lock); prev_aca = NULL; for (aca = ac_dereference(idev->ac_list, idev); aca; aca = ac_dereference(aca->aca_next, idev)) { if (ipv6_addr_equal(&aca->aca_addr, addr)) break; prev_aca = aca; } if (!aca) { write_unlock_bh(&idev->lock); return -ENOENT; } if (--aca->aca_users > 0) { write_unlock_bh(&idev->lock); return 0; } if (prev_aca) rcu_assign_pointer(prev_aca->aca_next, aca->aca_next); else rcu_assign_pointer(idev->ac_list, aca->aca_next); write_unlock_bh(&idev->lock); ipv6_del_acaddr_hash(aca); addrconf_leave_solict(idev, &aca->aca_addr); ip6_del_rt(dev_net(idev->dev), aca->aca_rt, false); inet6_ifacaddr_notify(idev->dev, aca, RTM_DELANYCAST); aca_put(aca); return 0; } static int ipv6_dev_ac_dec(struct net_device *dev, const struct in6_addr *addr) { struct inet6_dev *idev = in6_dev_get(dev); int err; if (!idev) return -ENODEV; err = __ipv6_dev_ac_dec(idev, addr); in6_dev_put(idev); return err; } void ipv6_ac_destroy_dev(struct inet6_dev *idev) { struct ifacaddr6 *aca; write_lock_bh(&idev->lock); while ((aca = ac_dereference(idev->ac_list, idev)) != NULL) { rcu_assign_pointer(idev->ac_list, aca->aca_next); write_unlock_bh(&idev->lock); ipv6_del_acaddr_hash(aca); addrconf_leave_solict(idev, &aca->aca_addr); ip6_del_rt(dev_net(idev->dev), aca->aca_rt, false); aca_put(aca); write_lock_bh(&idev->lock); } write_unlock_bh(&idev->lock); } /* * check if the interface has this anycast address * called with rcu_read_lock() */ static bool ipv6_chk_acast_dev(struct net_device *dev, const struct in6_addr *addr) { struct inet6_dev *idev; struct ifacaddr6 *aca; idev = __in6_dev_get(dev); if (idev) { for (aca = rcu_dereference(idev->ac_list); aca; aca = rcu_dereference(aca->aca_next)) if (ipv6_addr_equal(&aca->aca_addr, addr)) break; return aca != NULL; } return false; } /* * check if given interface (or any, if dev==0) has this anycast address */ bool ipv6_chk_acast_addr(struct net *net, struct net_device *dev, const struct in6_addr *addr) { struct net_device *nh_dev; struct ifacaddr6 *aca; bool found = false; rcu_read_lock(); if (dev) found = ipv6_chk_acast_dev(dev, addr); else { unsigned int hash = inet6_acaddr_hash(net, addr); hlist_for_each_entry_rcu(aca, &inet6_acaddr_lst[hash], aca_addr_lst) { nh_dev = fib6_info_nh_dev(aca->aca_rt); if (!nh_dev || !net_eq(dev_net(nh_dev), net)) continue; if (ipv6_addr_equal(&aca->aca_addr, addr)) { found = true; break; } } } rcu_read_unlock(); return found; } /* check if this anycast address is link-local on given interface or * is global */ bool ipv6_chk_acast_addr_src(struct net *net, struct net_device *dev, const struct in6_addr *addr) { return ipv6_chk_acast_addr(net, (ipv6_addr_type(addr) & IPV6_ADDR_LINKLOCAL ? dev : NULL), addr); } #ifdef CONFIG_PROC_FS struct ac6_iter_state { struct seq_net_private p; struct net_device *dev; }; #define ac6_seq_private(seq) ((struct ac6_iter_state *)(seq)->private) static inline struct ifacaddr6 *ac6_get_first(struct seq_file *seq) { struct ac6_iter_state *state = ac6_seq_private(seq); struct net *net = seq_file_net(seq); struct ifacaddr6 *im = NULL; for_each_netdev_rcu(net, state->dev) { struct inet6_dev *idev; idev = __in6_dev_get(state->dev); if (!idev) continue; im = rcu_dereference(idev->ac_list); if (im) break; } return im; } static struct ifacaddr6 *ac6_get_next(struct seq_file *seq, struct ifacaddr6 *im) { struct ac6_iter_state *state = ac6_seq_private(seq); struct inet6_dev *idev; im = rcu_dereference(im->aca_next); while (!im) { state->dev = next_net_device_rcu(state->dev); if (!state->dev) break; idev = __in6_dev_get(state->dev); if (!idev) continue; im = rcu_dereference(idev->ac_list); } return im; } static struct ifacaddr6 *ac6_get_idx(struct seq_file *seq, loff_t pos) { struct ifacaddr6 *im = ac6_get_first(seq); if (im) while (pos && (im = ac6_get_next(seq, im)) != NULL) --pos; return pos ? NULL : im; } static void *ac6_seq_start(struct seq_file *seq, loff_t *pos) __acquires(RCU) { rcu_read_lock(); return ac6_get_idx(seq, *pos); } static void *ac6_seq_next(struct seq_file *seq, void *v, loff_t *pos) { struct ifacaddr6 *im = ac6_get_next(seq, v); ++*pos; return im; } static void ac6_seq_stop(struct seq_file *seq, void *v) __releases(RCU) { rcu_read_unlock(); } static int ac6_seq_show(struct seq_file *seq, void *v) { struct ifacaddr6 *im = (struct ifacaddr6 *)v; struct ac6_iter_state *state = ac6_seq_private(seq); seq_printf(seq, "%-4d %-15s %pi6 %5d\n", state->dev->ifindex, state->dev->name, &im->aca_addr, im->aca_users); return 0; } static const struct seq_operations ac6_seq_ops = { .start = ac6_seq_start, .next = ac6_seq_next, .stop = ac6_seq_stop, .show = ac6_seq_show, }; int __net_init ac6_proc_init(struct net *net) { if (!proc_create_net("anycast6", 0444, net->proc_net, &ac6_seq_ops, sizeof(struct ac6_iter_state))) return -ENOMEM; return 0; } void ac6_proc_exit(struct net *net) { remove_proc_entry("anycast6", net->proc_net); } #endif /* Init / cleanup code */ int __init ipv6_anycast_init(void) { int i; for (i = 0; i < IN6_ADDR_HSIZE; i++) INIT_HLIST_HEAD(&inet6_acaddr_lst[i]); return 0; } void ipv6_anycast_cleanup(void) { int i; spin_lock(&acaddr_hash_lock); for (i = 0; i < IN6_ADDR_HSIZE; i++) WARN_ON(!hlist_empty(&inet6_acaddr_lst[i])); spin_unlock(&acaddr_hash_lock); }
15 15 15 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 // SPDX-License-Identifier: GPL-2.0-only /* * ebtable_broute * * Authors: * Bart De Schuymer <bdschuym@pandora.be> * * April, 2002 * * This table lets you choose between routing and bridging for frames * entering on a bridge enslaved nic. This table is traversed before any * other ebtables table. See net/bridge/br_input.c. */ #include <linux/netfilter_bridge/ebtables.h> #include <linux/module.h> #include <linux/if_bridge.h> #include "../br_private.h" /* EBT_ACCEPT means the frame will be bridged * EBT_DROP means the frame will be routed */ static struct ebt_entries initial_chain = { .name = "BROUTING", .policy = EBT_ACCEPT, }; static struct ebt_replace_kernel initial_table = { .name = "broute", .valid_hooks = 1 << NF_BR_BROUTING, .entries_size = sizeof(struct ebt_entries), .hook_entry = { [NF_BR_BROUTING] = &initial_chain, }, .entries = (char *)&initial_chain, }; static const struct ebt_table broute_table = { .name = "broute", .table = &initial_table, .valid_hooks = 1 << NF_BR_BROUTING, .me = THIS_MODULE, }; static unsigned int ebt_broute(void *priv, struct sk_buff *skb, const struct nf_hook_state *s) { struct net_bridge_port *p = br_port_get_rcu(skb->dev); struct nf_hook_state state; unsigned char *dest; int ret; if (!p || p->state != BR_STATE_FORWARDING) return NF_ACCEPT; nf_hook_state_init(&state, NF_BR_BROUTING, NFPROTO_BRIDGE, s->in, NULL, NULL, s->net, NULL); ret = ebt_do_table(priv, skb, &state); if (ret != NF_DROP) return ret; /* DROP in ebtables -t broute means that the * skb should be routed, not bridged. * This is awkward, but can't be changed for compatibility * reasons. * * We map DROP to ACCEPT and set the ->br_netfilter_broute flag. */ BR_INPUT_SKB_CB(skb)->br_netfilter_broute = 1; /* undo PACKET_HOST mangling done in br_input in case the dst * address matches the logical bridge but not the port. */ dest = eth_hdr(skb)->h_dest; if (skb->pkt_type == PACKET_HOST && !ether_addr_equal(skb->dev->dev_addr, dest) && ether_addr_equal(p->br->dev->dev_addr, dest)) skb->pkt_type = PACKET_OTHERHOST; return NF_ACCEPT; } static const struct nf_hook_ops ebt_ops_broute = { .hook = ebt_broute, .pf = NFPROTO_BRIDGE, .hooknum = NF_BR_PRE_ROUTING, .priority = NF_BR_PRI_FIRST, }; static int broute_table_init(struct net *net) { return ebt_register_table(net, &broute_table, &ebt_ops_broute); } static void __net_exit broute_net_pre_exit(struct net *net) { ebt_unregister_table_pre_exit(net, "broute"); } static void __net_exit broute_net_exit(struct net *net) { ebt_unregister_table(net, "broute"); } static struct pernet_operations broute_net_ops = { .exit = broute_net_exit, .pre_exit = broute_net_pre_exit, }; static int __init ebtable_broute_init(void) { int ret = ebt_register_template(&broute_table, broute_table_init); if (ret) return ret; ret = register_pernet_subsys(&broute_net_ops); if (ret) { ebt_unregister_template(&broute_table); return ret; } return 0; } static void __exit ebtable_broute_fini(void) { unregister_pernet_subsys(&broute_net_ops); ebt_unregister_template(&broute_table); } module_init(ebtable_broute_init); module_exit(ebtable_broute_fini); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("Force packets to be routed instead of bridged");
12 12 12 12 12 12 12 12 19 19 19 12 12 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 /* * Copyright (c) 2014 Chelsio, Inc. All rights reserved. * Copyright (c) 2014 Intel Corporation. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ #include "iwpm_util.h" #define IWPM_MAPINFO_HASH_SIZE 512 #define IWPM_MAPINFO_HASH_MASK (IWPM_MAPINFO_HASH_SIZE - 1) #define IWPM_REMINFO_HASH_SIZE 64 #define IWPM_REMINFO_HASH_MASK (IWPM_REMINFO_HASH_SIZE - 1) #define IWPM_MSG_SIZE 512 static LIST_HEAD(iwpm_nlmsg_req_list); static DEFINE_SPINLOCK(iwpm_nlmsg_req_lock); static struct hlist_head *iwpm_hash_bucket; static DEFINE_SPINLOCK(iwpm_mapinfo_lock); static struct hlist_head *iwpm_reminfo_bucket; static DEFINE_SPINLOCK(iwpm_reminfo_lock); static struct iwpm_admin_data iwpm_admin; /** * iwpm_init - Allocate resources for the iwarp port mapper * @nl_client: The index of the netlink client * * Should be called when network interface goes up. */ int iwpm_init(u8 nl_client) { iwpm_hash_bucket = kcalloc(IWPM_MAPINFO_HASH_SIZE, sizeof(struct hlist_head), GFP_KERNEL); if (!iwpm_hash_bucket) return -ENOMEM; iwpm_reminfo_bucket = kcalloc(IWPM_REMINFO_HASH_SIZE, sizeof(struct hlist_head), GFP_KERNEL); if (!iwpm_reminfo_bucket) { kfree(iwpm_hash_bucket); return -ENOMEM; } iwpm_set_registration(nl_client, IWPM_REG_UNDEF); pr_debug("%s: Mapinfo and reminfo tables are created\n", __func__); return 0; } static void free_hash_bucket(void); static void free_reminfo_bucket(void); /** * iwpm_exit - Deallocate resources for the iwarp port mapper * @nl_client: The index of the netlink client * * Should be called when network interface goes down. */ int iwpm_exit(u8 nl_client) { free_hash_bucket(); free_reminfo_bucket(); pr_debug("%s: Resources are destroyed\n", __func__); iwpm_set_registration(nl_client, IWPM_REG_UNDEF); return 0; } static struct hlist_head *get_mapinfo_hash_bucket(struct sockaddr_storage *, struct sockaddr_storage *); /** * iwpm_create_mapinfo - Store local and mapped IPv4/IPv6 address * info in a hash table * @local_sockaddr: Local ip/tcp address * @mapped_sockaddr: Mapped local ip/tcp address * @nl_client: The index of the netlink client * @map_flags: IWPM mapping flags */ int iwpm_create_mapinfo(struct sockaddr_storage *local_sockaddr, struct sockaddr_storage *mapped_sockaddr, u8 nl_client, u32 map_flags) { struct hlist_head *hash_bucket_head = NULL; struct iwpm_mapping_info *map_info; unsigned long flags; int ret = -EINVAL; map_info = kzalloc(sizeof(struct iwpm_mapping_info), GFP_KERNEL); if (!map_info) return -ENOMEM; memcpy(&map_info->local_sockaddr, local_sockaddr, sizeof(struct sockaddr_storage)); memcpy(&map_info->mapped_sockaddr, mapped_sockaddr, sizeof(struct sockaddr_storage)); map_info->nl_client = nl_client; map_info->map_flags = map_flags; spin_lock_irqsave(&iwpm_mapinfo_lock, flags); if (iwpm_hash_bucket) { hash_bucket_head = get_mapinfo_hash_bucket( &map_info->local_sockaddr, &map_info->mapped_sockaddr); if (hash_bucket_head) { hlist_add_head(&map_info->hlist_node, hash_bucket_head); ret = 0; } } spin_unlock_irqrestore(&iwpm_mapinfo_lock, flags); if (!hash_bucket_head) kfree(map_info); return ret; } /** * iwpm_remove_mapinfo - Remove local and mapped IPv4/IPv6 address * info from the hash table * @local_sockaddr: Local ip/tcp address * @mapped_local_addr: Mapped local ip/tcp address * * Returns err code if mapping info is not found in the hash table, * otherwise returns 0 */ int iwpm_remove_mapinfo(struct sockaddr_storage *local_sockaddr, struct sockaddr_storage *mapped_local_addr) { struct hlist_node *tmp_hlist_node; struct hlist_head *hash_bucket_head; struct iwpm_mapping_info *map_info = NULL; unsigned long flags; int ret = -EINVAL; spin_lock_irqsave(&iwpm_mapinfo_lock, flags); if (iwpm_hash_bucket) { hash_bucket_head = get_mapinfo_hash_bucket( local_sockaddr, mapped_local_addr); if (!hash_bucket_head) goto remove_mapinfo_exit; hlist_for_each_entry_safe(map_info, tmp_hlist_node, hash_bucket_head, hlist_node) { if (!iwpm_compare_sockaddr(&map_info->mapped_sockaddr, mapped_local_addr)) { hlist_del_init(&map_info->hlist_node); kfree(map_info); ret = 0; break; } } } remove_mapinfo_exit: spin_unlock_irqrestore(&iwpm_mapinfo_lock, flags); return ret; } static void free_hash_bucket(void) { struct hlist_node *tmp_hlist_node; struct iwpm_mapping_info *map_info; unsigned long flags; int i; /* remove all the mapinfo data from the list */ spin_lock_irqsave(&iwpm_mapinfo_lock, flags); for (i = 0; i < IWPM_MAPINFO_HASH_SIZE; i++) { hlist_for_each_entry_safe(map_info, tmp_hlist_node, &iwpm_hash_bucket[i], hlist_node) { hlist_del_init(&map_info->hlist_node); kfree(map_info); } } /* free the hash list */ kfree(iwpm_hash_bucket); iwpm_hash_bucket = NULL; spin_unlock_irqrestore(&iwpm_mapinfo_lock, flags); } static void free_reminfo_bucket(void) { struct hlist_node *tmp_hlist_node; struct iwpm_remote_info *rem_info; unsigned long flags; int i; /* remove all the remote info from the list */ spin_lock_irqsave(&iwpm_reminfo_lock, flags); for (i = 0; i < IWPM_REMINFO_HASH_SIZE; i++) { hlist_for_each_entry_safe(rem_info, tmp_hlist_node, &iwpm_reminfo_bucket[i], hlist_node) { hlist_del_init(&rem_info->hlist_node); kfree(rem_info); } } /* free the hash list */ kfree(iwpm_reminfo_bucket); iwpm_reminfo_bucket = NULL; spin_unlock_irqrestore(&iwpm_reminfo_lock, flags); } static struct hlist_head *get_reminfo_hash_bucket(struct sockaddr_storage *, struct sockaddr_storage *); void iwpm_add_remote_info(struct iwpm_remote_info *rem_info) { struct hlist_head *hash_bucket_head; unsigned long flags; spin_lock_irqsave(&iwpm_reminfo_lock, flags); if (iwpm_reminfo_bucket) { hash_bucket_head = get_reminfo_hash_bucket( &rem_info->mapped_loc_sockaddr, &rem_info->mapped_rem_sockaddr); if (hash_bucket_head) hlist_add_head(&rem_info->hlist_node, hash_bucket_head); } spin_unlock_irqrestore(&iwpm_reminfo_lock, flags); } /** * iwpm_get_remote_info - Get the remote connecting peer address info * * @mapped_loc_addr: Mapped local address of the listening peer * @mapped_rem_addr: Mapped remote address of the connecting peer * @remote_addr: To store the remote address of the connecting peer * @nl_client: The index of the netlink client * * The remote address info is retrieved and provided to the client in * the remote_addr. After that it is removed from the hash table */ int iwpm_get_remote_info(struct sockaddr_storage *mapped_loc_addr, struct sockaddr_storage *mapped_rem_addr, struct sockaddr_storage *remote_addr, u8 nl_client) { struct hlist_node *tmp_hlist_node; struct hlist_head *hash_bucket_head; struct iwpm_remote_info *rem_info = NULL; unsigned long flags; int ret = -EINVAL; spin_lock_irqsave(&iwpm_reminfo_lock, flags); if (iwpm_reminfo_bucket) { hash_bucket_head = get_reminfo_hash_bucket( mapped_loc_addr, mapped_rem_addr); if (!hash_bucket_head) goto get_remote_info_exit; hlist_for_each_entry_safe(rem_info, tmp_hlist_node, hash_bucket_head, hlist_node) { if (!iwpm_compare_sockaddr(&rem_info->mapped_loc_sockaddr, mapped_loc_addr) && !iwpm_compare_sockaddr(&rem_info->mapped_rem_sockaddr, mapped_rem_addr)) { memcpy(remote_addr, &rem_info->remote_sockaddr, sizeof(struct sockaddr_storage)); iwpm_print_sockaddr(remote_addr, "get_remote_info: Remote sockaddr:"); hlist_del_init(&rem_info->hlist_node); kfree(rem_info); ret = 0; break; } } } get_remote_info_exit: spin_unlock_irqrestore(&iwpm_reminfo_lock, flags); return ret; } struct iwpm_nlmsg_request *iwpm_get_nlmsg_request(__u32 nlmsg_seq, u8 nl_client, gfp_t gfp) { struct iwpm_nlmsg_request *nlmsg_request; unsigned long flags; nlmsg_request = kzalloc(sizeof(struct iwpm_nlmsg_request), gfp); if (!nlmsg_request) return NULL; spin_lock_irqsave(&iwpm_nlmsg_req_lock, flags); list_add_tail(&nlmsg_request->inprocess_list, &iwpm_nlmsg_req_list); spin_unlock_irqrestore(&iwpm_nlmsg_req_lock, flags); kref_init(&nlmsg_request->kref); kref_get(&nlmsg_request->kref); nlmsg_request->nlmsg_seq = nlmsg_seq; nlmsg_request->nl_client = nl_client; nlmsg_request->request_done = 0; nlmsg_request->err_code = 0; sema_init(&nlmsg_request->sem, 1); down(&nlmsg_request->sem); return nlmsg_request; } void iwpm_free_nlmsg_request(struct kref *kref) { struct iwpm_nlmsg_request *nlmsg_request; unsigned long flags; nlmsg_request = container_of(kref, struct iwpm_nlmsg_request, kref); spin_lock_irqsave(&iwpm_nlmsg_req_lock, flags); list_del_init(&nlmsg_request->inprocess_list); spin_unlock_irqrestore(&iwpm_nlmsg_req_lock, flags); if (!nlmsg_request->request_done) pr_debug("%s Freeing incomplete nlmsg request (seq = %u).\n", __func__, nlmsg_request->nlmsg_seq); kfree(nlmsg_request); } struct iwpm_nlmsg_request *iwpm_find_nlmsg_request(__u32 echo_seq) { struct iwpm_nlmsg_request *nlmsg_request; struct iwpm_nlmsg_request *found_request = NULL; unsigned long flags; spin_lock_irqsave(&iwpm_nlmsg_req_lock, flags); list_for_each_entry(nlmsg_request, &iwpm_nlmsg_req_list, inprocess_list) { if (nlmsg_request->nlmsg_seq == echo_seq) { found_request = nlmsg_request; kref_get(&nlmsg_request->kref); break; } } spin_unlock_irqrestore(&iwpm_nlmsg_req_lock, flags); return found_request; } int iwpm_wait_complete_req(struct iwpm_nlmsg_request *nlmsg_request) { int ret; ret = down_timeout(&nlmsg_request->sem, IWPM_NL_TIMEOUT); if (ret) { ret = -EINVAL; pr_info("%s: Timeout %d sec for netlink request (seq = %u)\n", __func__, (IWPM_NL_TIMEOUT/HZ), nlmsg_request->nlmsg_seq); } else { ret = nlmsg_request->err_code; } kref_put(&nlmsg_request->kref, iwpm_free_nlmsg_request); return ret; } int iwpm_get_nlmsg_seq(void) { return atomic_inc_return(&iwpm_admin.nlmsg_seq); } /* valid client */ u32 iwpm_get_registration(u8 nl_client) { return iwpm_admin.reg_list[nl_client]; } /* valid client */ void iwpm_set_registration(u8 nl_client, u32 reg) { iwpm_admin.reg_list[nl_client] = reg; } /* valid client */ u32 iwpm_check_registration(u8 nl_client, u32 reg) { return (iwpm_get_registration(nl_client) & reg); } int iwpm_compare_sockaddr(struct sockaddr_storage *a_sockaddr, struct sockaddr_storage *b_sockaddr) { if (a_sockaddr->ss_family != b_sockaddr->ss_family) return 1; if (a_sockaddr->ss_family == AF_INET) { struct sockaddr_in *a4_sockaddr = (struct sockaddr_in *)a_sockaddr; struct sockaddr_in *b4_sockaddr = (struct sockaddr_in *)b_sockaddr; if (!memcmp(&a4_sockaddr->sin_addr, &b4_sockaddr->sin_addr, sizeof(struct in_addr)) && a4_sockaddr->sin_port == b4_sockaddr->sin_port) return 0; } else if (a_sockaddr->ss_family == AF_INET6) { struct sockaddr_in6 *a6_sockaddr = (struct sockaddr_in6 *)a_sockaddr; struct sockaddr_in6 *b6_sockaddr = (struct sockaddr_in6 *)b_sockaddr; if (!memcmp(&a6_sockaddr->sin6_addr, &b6_sockaddr->sin6_addr, sizeof(struct in6_addr)) && a6_sockaddr->sin6_port == b6_sockaddr->sin6_port) return 0; } else { pr_err("%s: Invalid sockaddr family\n", __func__); } return 1; } struct sk_buff *iwpm_create_nlmsg(u32 nl_op, struct nlmsghdr **nlh, int nl_client) { struct sk_buff *skb = NULL; skb = dev_alloc_skb(IWPM_MSG_SIZE); if (!skb) goto create_nlmsg_exit; if (!(ibnl_put_msg(skb, nlh, 0, 0, nl_client, nl_op, NLM_F_REQUEST))) { pr_warn("%s: Unable to put the nlmsg header\n", __func__); dev_kfree_skb(skb); skb = NULL; } create_nlmsg_exit: return skb; } int iwpm_parse_nlmsg(struct netlink_callback *cb, int policy_max, const struct nla_policy *nlmsg_policy, struct nlattr *nltb[], const char *msg_type) { int nlh_len = 0; int ret; const char *err_str = ""; ret = nlmsg_validate_deprecated(cb->nlh, nlh_len, policy_max - 1, nlmsg_policy, NULL); if (ret) { err_str = "Invalid attribute"; goto parse_nlmsg_error; } ret = nlmsg_parse_deprecated(cb->nlh, nlh_len, nltb, policy_max - 1, nlmsg_policy, NULL); if (ret) { err_str = "Unable to parse the nlmsg"; goto parse_nlmsg_error; } ret = iwpm_validate_nlmsg_attr(nltb, policy_max); if (ret) { err_str = "Invalid NULL attribute"; goto parse_nlmsg_error; } return 0; parse_nlmsg_error: pr_warn("%s: %s (msg type %s ret = %d)\n", __func__, err_str, msg_type, ret); return ret; } void iwpm_print_sockaddr(struct sockaddr_storage *sockaddr, char *msg) { struct sockaddr_in6 *sockaddr_v6; struct sockaddr_in *sockaddr_v4; switch (sockaddr->ss_family) { case AF_INET: sockaddr_v4 = (struct sockaddr_in *)sockaddr; pr_debug("%s IPV4 %pI4: %u(0x%04X)\n", msg, &sockaddr_v4->sin_addr, ntohs(sockaddr_v4->sin_port), ntohs(sockaddr_v4->sin_port)); break; case AF_INET6: sockaddr_v6 = (struct sockaddr_in6 *)sockaddr; pr_debug("%s IPV6 %pI6: %u(0x%04X)\n", msg, &sockaddr_v6->sin6_addr, ntohs(sockaddr_v6->sin6_port), ntohs(sockaddr_v6->sin6_port)); break; default: break; } } static u32 iwpm_ipv6_jhash(struct sockaddr_in6 *ipv6_sockaddr) { u32 ipv6_hash = jhash(&ipv6_sockaddr->sin6_addr, sizeof(struct in6_addr), 0); u32 hash = jhash_2words(ipv6_hash, (__force u32) ipv6_sockaddr->sin6_port, 0); return hash; } static u32 iwpm_ipv4_jhash(struct sockaddr_in *ipv4_sockaddr) { u32 ipv4_hash = jhash(&ipv4_sockaddr->sin_addr, sizeof(struct in_addr), 0); u32 hash = jhash_2words(ipv4_hash, (__force u32) ipv4_sockaddr->sin_port, 0); return hash; } static int get_hash_bucket(struct sockaddr_storage *a_sockaddr, struct sockaddr_storage *b_sockaddr, u32 *hash) { u32 a_hash, b_hash; if (a_sockaddr->ss_family == AF_INET) { a_hash = iwpm_ipv4_jhash((struct sockaddr_in *) a_sockaddr); b_hash = iwpm_ipv4_jhash((struct sockaddr_in *) b_sockaddr); } else if (a_sockaddr->ss_family == AF_INET6) { a_hash = iwpm_ipv6_jhash((struct sockaddr_in6 *) a_sockaddr); b_hash = iwpm_ipv6_jhash((struct sockaddr_in6 *) b_sockaddr); } else { pr_err("%s: Invalid sockaddr family\n", __func__); return -EINVAL; } if (a_hash == b_hash) /* if port mapper isn't available */ *hash = a_hash; else *hash = jhash_2words(a_hash, b_hash, 0); return 0; } static struct hlist_head *get_mapinfo_hash_bucket(struct sockaddr_storage *local_sockaddr, struct sockaddr_storage *mapped_sockaddr) { u32 hash; int ret; ret = get_hash_bucket(local_sockaddr, mapped_sockaddr, &hash); if (ret) return NULL; return &iwpm_hash_bucket[hash & IWPM_MAPINFO_HASH_MASK]; } static struct hlist_head *get_reminfo_hash_bucket(struct sockaddr_storage *mapped_loc_sockaddr, struct sockaddr_storage *mapped_rem_sockaddr) { u32 hash; int ret; ret = get_hash_bucket(mapped_loc_sockaddr, mapped_rem_sockaddr, &hash); if (ret) return NULL; return &iwpm_reminfo_bucket[hash & IWPM_REMINFO_HASH_MASK]; } static int send_mapinfo_num(u32 mapping_num, u8 nl_client, int iwpm_pid) { struct sk_buff *skb = NULL; struct nlmsghdr *nlh; u32 msg_seq; const char *err_str = ""; int ret = -EINVAL; skb = iwpm_create_nlmsg(RDMA_NL_IWPM_MAPINFO_NUM, &nlh, nl_client); if (!skb) { err_str = "Unable to create a nlmsg"; goto mapinfo_num_error; } nlh->nlmsg_seq = iwpm_get_nlmsg_seq(); msg_seq = 0; err_str = "Unable to put attribute of mapinfo number nlmsg"; ret = ibnl_put_attr(skb, nlh, sizeof(u32), &msg_seq, IWPM_NLA_MAPINFO_SEQ); if (ret) goto mapinfo_num_error; ret = ibnl_put_attr(skb, nlh, sizeof(u32), &mapping_num, IWPM_NLA_MAPINFO_SEND_NUM); if (ret) goto mapinfo_num_error; nlmsg_end(skb, nlh); ret = rdma_nl_unicast(&init_net, skb, iwpm_pid); if (ret) { skb = NULL; err_str = "Unable to send a nlmsg"; goto mapinfo_num_error; } pr_debug("%s: Sent mapping number = %u\n", __func__, mapping_num); return 0; mapinfo_num_error: pr_info("%s: %s\n", __func__, err_str); dev_kfree_skb(skb); return ret; } static int send_nlmsg_done(struct sk_buff *skb, u8 nl_client, int iwpm_pid) { struct nlmsghdr *nlh = NULL; int ret = 0; if (!skb) return ret; if (!(ibnl_put_msg(skb, &nlh, 0, 0, nl_client, RDMA_NL_IWPM_MAPINFO, NLM_F_MULTI))) { pr_warn("%s Unable to put NLMSG_DONE\n", __func__); dev_kfree_skb(skb); return -ENOMEM; } nlh->nlmsg_type = NLMSG_DONE; ret = rdma_nl_unicast(&init_net, skb, iwpm_pid); if (ret) pr_warn("%s Unable to send a nlmsg\n", __func__); return ret; } int iwpm_send_mapinfo(u8 nl_client, int iwpm_pid) { struct iwpm_mapping_info *map_info; struct sk_buff *skb = NULL; struct nlmsghdr *nlh; int skb_num = 0, mapping_num = 0; int i = 0, nlmsg_bytes = 0; unsigned long flags; const char *err_str = ""; int ret; skb = dev_alloc_skb(NLMSG_GOODSIZE); if (!skb) { ret = -ENOMEM; err_str = "Unable to allocate skb"; goto send_mapping_info_exit; } skb_num++; spin_lock_irqsave(&iwpm_mapinfo_lock, flags); ret = -EINVAL; for (i = 0; i < IWPM_MAPINFO_HASH_SIZE; i++) { hlist_for_each_entry(map_info, &iwpm_hash_bucket[i], hlist_node) { if (map_info->nl_client != nl_client) continue; nlh = NULL; if (!(ibnl_put_msg(skb, &nlh, 0, 0, nl_client, RDMA_NL_IWPM_MAPINFO, NLM_F_MULTI))) { ret = -ENOMEM; err_str = "Unable to put the nlmsg header"; goto send_mapping_info_unlock; } err_str = "Unable to put attribute of the nlmsg"; ret = ibnl_put_attr(skb, nlh, sizeof(struct sockaddr_storage), &map_info->local_sockaddr, IWPM_NLA_MAPINFO_LOCAL_ADDR); if (ret) goto send_mapping_info_unlock; ret = ibnl_put_attr(skb, nlh, sizeof(struct sockaddr_storage), &map_info->mapped_sockaddr, IWPM_NLA_MAPINFO_MAPPED_ADDR); if (ret) goto send_mapping_info_unlock; if (iwpm_ulib_version > IWPM_UABI_VERSION_MIN) { ret = ibnl_put_attr(skb, nlh, sizeof(u32), &map_info->map_flags, IWPM_NLA_MAPINFO_FLAGS); if (ret) goto send_mapping_info_unlock; } nlmsg_end(skb, nlh); iwpm_print_sockaddr(&map_info->local_sockaddr, "send_mapping_info: Local sockaddr:"); iwpm_print_sockaddr(&map_info->mapped_sockaddr, "send_mapping_info: Mapped local sockaddr:"); mapping_num++; nlmsg_bytes += nlh->nlmsg_len; /* check if all mappings can fit in one skb */ if (NLMSG_GOODSIZE - nlmsg_bytes < nlh->nlmsg_len * 2) { /* and leave room for NLMSG_DONE */ nlmsg_bytes = 0; skb_num++; spin_unlock_irqrestore(&iwpm_mapinfo_lock, flags); /* send the skb */ ret = send_nlmsg_done(skb, nl_client, iwpm_pid); skb = NULL; if (ret) { err_str = "Unable to send map info"; goto send_mapping_info_exit; } if (skb_num == IWPM_MAPINFO_SKB_COUNT) { ret = -ENOMEM; err_str = "Insufficient skbs for map info"; goto send_mapping_info_exit; } skb = dev_alloc_skb(NLMSG_GOODSIZE); if (!skb) { ret = -ENOMEM; err_str = "Unable to allocate skb"; goto send_mapping_info_exit; } spin_lock_irqsave(&iwpm_mapinfo_lock, flags); } } } send_mapping_info_unlock: spin_unlock_irqrestore(&iwpm_mapinfo_lock, flags); send_mapping_info_exit: if (ret) { pr_warn("%s: %s (ret = %d)\n", __func__, err_str, ret); dev_kfree_skb(skb); return ret; } send_nlmsg_done(skb, nl_client, iwpm_pid); return send_mapinfo_num(mapping_num, nl_client, iwpm_pid); } int iwpm_mapinfo_available(void) { unsigned long flags; int full_bucket = 0, i = 0; spin_lock_irqsave(&iwpm_mapinfo_lock, flags); if (iwpm_hash_bucket) { for (i = 0; i < IWPM_MAPINFO_HASH_SIZE; i++) { if (!hlist_empty(&iwpm_hash_bucket[i])) { full_bucket = 1; break; } } } spin_unlock_irqrestore(&iwpm_mapinfo_lock, flags); return full_bucket; } int iwpm_send_hello(u8 nl_client, int iwpm_pid, u16 abi_version) { struct sk_buff *skb = NULL; struct nlmsghdr *nlh; const char *err_str; int ret = -EINVAL; skb = iwpm_create_nlmsg(RDMA_NL_IWPM_HELLO, &nlh, nl_client); if (!skb) { err_str = "Unable to create a nlmsg"; goto hello_num_error; } nlh->nlmsg_seq = iwpm_get_nlmsg_seq(); err_str = "Unable to put attribute of abi_version into nlmsg"; ret = ibnl_put_attr(skb, nlh, sizeof(u16), &abi_version, IWPM_NLA_HELLO_ABI_VERSION); if (ret) goto hello_num_error; nlmsg_end(skb, nlh); ret = rdma_nl_unicast(&init_net, skb, iwpm_pid); if (ret) { skb = NULL; err_str = "Unable to send a nlmsg"; goto hello_num_error; } pr_debug("%s: Sent hello abi_version = %u\n", __func__, abi_version); return 0; hello_num_error: pr_info("%s: %s\n", __func__, err_str); dev_kfree_skb(skb); return ret; }
4 2 2 2 1 1 1 1 6 6 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 // SPDX-License-Identifier: GPL-2.0-or-later /* * Copyright (C)2006 USAGI/WIDE Project * * Author: * Kazunori Miyazawa <miyazawa@linux-ipv6.org> */ #include <crypto/internal/cipher.h> #include <crypto/internal/hash.h> #include <crypto/utils.h> #include <linux/err.h> #include <linux/kernel.h> #include <linux/module.h> #include <linux/slab.h> #include <linux/string.h> static u_int32_t ks[12] = {0x01010101, 0x01010101, 0x01010101, 0x01010101, 0x02020202, 0x02020202, 0x02020202, 0x02020202, 0x03030303, 0x03030303, 0x03030303, 0x03030303}; /* * +------------------------ * | <parent tfm> * +------------------------ * | xcbc_tfm_ctx * +------------------------ * | consts (block size * 2) * +------------------------ */ struct xcbc_tfm_ctx { struct crypto_cipher *child; u8 consts[]; }; #define XCBC_BLOCKSIZE 16 static int crypto_xcbc_digest_setkey(struct crypto_shash *parent, const u8 *inkey, unsigned int keylen) { struct xcbc_tfm_ctx *ctx = crypto_shash_ctx(parent); u8 *consts = ctx->consts; int err = 0; u8 key1[XCBC_BLOCKSIZE]; int bs = sizeof(key1); if ((err = crypto_cipher_setkey(ctx->child, inkey, keylen))) return err; crypto_cipher_encrypt_one(ctx->child, consts, (u8 *)ks + bs); crypto_cipher_encrypt_one(ctx->child, consts + bs, (u8 *)ks + bs * 2); crypto_cipher_encrypt_one(ctx->child, key1, (u8 *)ks); return crypto_cipher_setkey(ctx->child, key1, bs); } static int crypto_xcbc_digest_init(struct shash_desc *pdesc) { int bs = crypto_shash_blocksize(pdesc->tfm); u8 *prev = shash_desc_ctx(pdesc); memset(prev, 0, bs); return 0; } static int crypto_xcbc_digest_update(struct shash_desc *pdesc, const u8 *p, unsigned int len) { struct crypto_shash *parent = pdesc->tfm; struct xcbc_tfm_ctx *tctx = crypto_shash_ctx(parent); struct crypto_cipher *tfm = tctx->child; int bs = crypto_shash_blocksize(parent); u8 *prev = shash_desc_ctx(pdesc); do { crypto_xor(prev, p, bs); crypto_cipher_encrypt_one(tfm, prev, prev); p += bs; len -= bs; } while (len >= bs); return len; } static int crypto_xcbc_digest_finup(struct shash_desc *pdesc, const u8 *src, unsigned int len, u8 *out) { struct crypto_shash *parent = pdesc->tfm; struct xcbc_tfm_ctx *tctx = crypto_shash_ctx(parent); struct crypto_cipher *tfm = tctx->child; int bs = crypto_shash_blocksize(parent); u8 *prev = shash_desc_ctx(pdesc); unsigned int offset = 0; crypto_xor(prev, src, len); if (len != bs) { prev[len] ^= 0x80; offset += bs; } crypto_xor(prev, &tctx->consts[offset], bs); crypto_cipher_encrypt_one(tfm, out, prev); return 0; } static int xcbc_init_tfm(struct crypto_tfm *tfm) { struct crypto_cipher *cipher; struct crypto_instance *inst = (void *)tfm->__crt_alg; struct crypto_cipher_spawn *spawn = crypto_instance_ctx(inst); struct xcbc_tfm_ctx *ctx = crypto_tfm_ctx(tfm); cipher = crypto_spawn_cipher(spawn); if (IS_ERR(cipher)) return PTR_ERR(cipher); ctx->child = cipher; return 0; }; static void xcbc_exit_tfm(struct crypto_tfm *tfm) { struct xcbc_tfm_ctx *ctx = crypto_tfm_ctx(tfm); crypto_free_cipher(ctx->child); } static int xcbc_create(struct crypto_template *tmpl, struct rtattr **tb) { struct shash_instance *inst; struct crypto_cipher_spawn *spawn; struct crypto_alg *alg; u32 mask; int err; err = crypto_check_attr_type(tb, CRYPTO_ALG_TYPE_SHASH, &mask); if (err) return err; inst = kzalloc(sizeof(*inst) + sizeof(*spawn), GFP_KERNEL); if (!inst) return -ENOMEM; spawn = shash_instance_ctx(inst); err = crypto_grab_cipher(spawn, shash_crypto_instance(inst), crypto_attr_alg_name(tb[1]), 0, mask); if (err) goto err_free_inst; alg = crypto_spawn_cipher_alg(spawn); err = -EINVAL; if (alg->cra_blocksize != XCBC_BLOCKSIZE) goto err_free_inst; err = crypto_inst_setname(shash_crypto_instance(inst), tmpl->name, alg); if (err) goto err_free_inst; inst->alg.base.cra_priority = alg->cra_priority; inst->alg.base.cra_blocksize = alg->cra_blocksize; inst->alg.base.cra_ctxsize = sizeof(struct xcbc_tfm_ctx) + alg->cra_blocksize * 2; inst->alg.base.cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY | CRYPTO_AHASH_ALG_FINAL_NONZERO; inst->alg.digestsize = alg->cra_blocksize; inst->alg.descsize = alg->cra_blocksize; inst->alg.base.cra_init = xcbc_init_tfm; inst->alg.base.cra_exit = xcbc_exit_tfm; inst->alg.init = crypto_xcbc_digest_init; inst->alg.update = crypto_xcbc_digest_update; inst->alg.finup = crypto_xcbc_digest_finup; inst->alg.setkey = crypto_xcbc_digest_setkey; inst->free = shash_free_singlespawn_instance; err = shash_register_instance(tmpl, inst); if (err) { err_free_inst: shash_free_singlespawn_instance(inst); } return err; } static struct crypto_template crypto_xcbc_tmpl = { .name = "xcbc", .create = xcbc_create, .module = THIS_MODULE, }; static int __init crypto_xcbc_module_init(void) { return crypto_register_template(&crypto_xcbc_tmpl); } static void __exit crypto_xcbc_module_exit(void) { crypto_unregister_template(&crypto_xcbc_tmpl); } module_init(crypto_xcbc_module_init); module_exit(crypto_xcbc_module_exit); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("XCBC keyed hash algorithm"); MODULE_ALIAS_CRYPTO("xcbc"); MODULE_IMPORT_NS("CRYPTO_INTERNAL");
80 80 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 // SPDX-License-Identifier: GPL-2.0-only /* (C) 1999-2001 Paul `Rusty' Russell * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org> */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/kernel.h> #include <linux/module.h> #include <linux/spinlock.h> #include <linux/skbuff.h> #include <linux/if_arp.h> #include <linux/ip.h> #include <net/ipv6.h> #include <net/icmp.h> #include <net/udp.h> #include <net/tcp.h> #include <net/route.h> #include <linux/netfilter.h> #include <linux/netfilter_bridge.h> #include <linux/netfilter_ipv6.h> #include <linux/netfilter/xt_LOG.h> #include <net/netfilter/nf_log.h> static const struct nf_loginfo default_loginfo = { .type = NF_LOG_TYPE_LOG, .u = { .log = { .level = LOGLEVEL_NOTICE, .logflags = NF_LOG_DEFAULT_MASK, }, }, }; struct arppayload { unsigned char mac_src[ETH_ALEN]; unsigned char ip_src[4]; unsigned char mac_dst[ETH_ALEN]; unsigned char ip_dst[4]; }; /* Guard against containers flooding syslog. */ static bool nf_log_allowed(const struct net *net) { return net_eq(net, &init_net) || sysctl_nf_log_all_netns; } static void nf_log_dump_vlan(struct nf_log_buf *m, const struct sk_buff *skb) { u16 vid; if (!skb_vlan_tag_present(skb)) return; vid = skb_vlan_tag_get(skb); nf_log_buf_add(m, "VPROTO=%04x VID=%u ", ntohs(skb->vlan_proto), vid); } static void noinline_for_stack dump_arp_packet(struct nf_log_buf *m, const struct nf_loginfo *info, const struct sk_buff *skb, unsigned int nhoff) { const struct arppayload *ap; struct arppayload _arpp; const struct arphdr *ah; unsigned int logflags; struct arphdr _arph; ah = skb_header_pointer(skb, nhoff, sizeof(_arph), &_arph); if (!ah) { nf_log_buf_add(m, "TRUNCATED"); return; } if (info->type == NF_LOG_TYPE_LOG) logflags = info->u.log.logflags; else logflags = NF_LOG_DEFAULT_MASK; if (logflags & NF_LOG_MACDECODE) { nf_log_buf_add(m, "MACSRC=%pM MACDST=%pM ", eth_hdr(skb)->h_source, eth_hdr(skb)->h_dest); nf_log_dump_vlan(m, skb); nf_log_buf_add(m, "MACPROTO=%04x ", ntohs(eth_hdr(skb)->h_proto)); } nf_log_buf_add(m, "ARP HTYPE=%d PTYPE=0x%04x OPCODE=%d", ntohs(ah->ar_hrd), ntohs(ah->ar_pro), ntohs(ah->ar_op)); /* If it's for Ethernet and the lengths are OK, then log the ARP * payload. */ if (ah->ar_hrd != htons(ARPHRD_ETHER) || ah->ar_hln != ETH_ALEN || ah->ar_pln != sizeof(__be32)) return; ap = skb_header_pointer(skb, nhoff + sizeof(_arph), sizeof(_arpp), &_arpp); if (!ap) { nf_log_buf_add(m, " INCOMPLETE [%zu bytes]", skb->len - sizeof(_arph)); return; } nf_log_buf_add(m, " MACSRC=%pM IPSRC=%pI4 MACDST=%pM IPDST=%pI4", ap->mac_src, ap->ip_src, ap->mac_dst, ap->ip_dst); } static void nf_log_dump_packet_common(struct nf_log_buf *m, u8 pf, unsigned int hooknum, const struct sk_buff *skb, const struct net_device *in, const struct net_device *out, const struct nf_loginfo *loginfo, const char *prefix, struct net *net) { const struct net_device *physoutdev __maybe_unused; const struct net_device *physindev __maybe_unused; nf_log_buf_add(m, KERN_SOH "%c%sIN=%s OUT=%s ", '0' + loginfo->u.log.level, prefix, in ? in->name : "", out ? out->name : ""); #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) physindev = nf_bridge_get_physindev(skb, net); if (physindev && in != physindev) nf_log_buf_add(m, "PHYSIN=%s ", physindev->name); physoutdev = nf_bridge_get_physoutdev(skb); if (physoutdev && out != physoutdev) nf_log_buf_add(m, "PHYSOUT=%s ", physoutdev->name); #endif } static void nf_log_arp_packet(struct net *net, u_int8_t pf, unsigned int hooknum, const struct sk_buff *skb, const struct net_device *in, const struct net_device *out, const struct nf_loginfo *loginfo, const char *prefix) { struct nf_log_buf *m; if (!nf_log_allowed(net)) return; m = nf_log_buf_open(); if (!loginfo) loginfo = &default_loginfo; nf_log_dump_packet_common(m, pf, hooknum, skb, in, out, loginfo, prefix, net); dump_arp_packet(m, loginfo, skb, skb_network_offset(skb)); nf_log_buf_close(m); } static struct nf_logger nf_arp_logger __read_mostly = { .name = "nf_log_arp", .type = NF_LOG_TYPE_LOG, .logfn = nf_log_arp_packet, .me = THIS_MODULE, }; static void nf_log_dump_sk_uid_gid(struct net *net, struct nf_log_buf *m, struct sock *sk) { if (!sk || !sk_fullsock(sk) || !net_eq(net, sock_net(sk))) return; read_lock_bh(&sk->sk_callback_lock); if (sk->sk_socket && sk->sk_socket->file) { const struct cred *cred = sk->sk_socket->file->f_cred; nf_log_buf_add(m, "UID=%u GID=%u ", from_kuid_munged(&init_user_ns, cred->fsuid), from_kgid_munged(&init_user_ns, cred->fsgid)); } read_unlock_bh(&sk->sk_callback_lock); } static noinline_for_stack int nf_log_dump_tcp_header(struct nf_log_buf *m, const struct sk_buff *skb, u8 proto, int fragment, unsigned int offset, unsigned int logflags) { struct tcphdr _tcph; const struct tcphdr *th; /* Max length: 10 "PROTO=TCP " */ nf_log_buf_add(m, "PROTO=TCP "); if (fragment) return 0; /* Max length: 25 "INCOMPLETE [65535 bytes] " */ th = skb_header_pointer(skb, offset, sizeof(_tcph), &_tcph); if (!th) { nf_log_buf_add(m, "INCOMPLETE [%u bytes] ", skb->len - offset); return 1; } /* Max length: 20 "SPT=65535 DPT=65535 " */ nf_log_buf_add(m, "SPT=%u DPT=%u ", ntohs(th->source), ntohs(th->dest)); /* Max length: 30 "SEQ=4294967295 ACK=4294967295 " */ if (logflags & NF_LOG_TCPSEQ) { nf_log_buf_add(m, "SEQ=%u ACK=%u ", ntohl(th->seq), ntohl(th->ack_seq)); } /* Max length: 13 "WINDOW=65535 " */ nf_log_buf_add(m, "WINDOW=%u ", ntohs(th->window)); /* Max length: 9 "RES=0x3C " */ nf_log_buf_add(m, "RES=0x%02x ", (u_int8_t)(ntohl(tcp_flag_word(th) & TCP_RESERVED_BITS) >> 22)); /* Max length: 35 "AE CWR ECE URG ACK PSH RST SYN FIN " */ if (th->ae) nf_log_buf_add(m, "AE "); if (th->cwr) nf_log_buf_add(m, "CWR "); if (th->ece) nf_log_buf_add(m, "ECE "); if (th->urg) nf_log_buf_add(m, "URG "); if (th->ack) nf_log_buf_add(m, "ACK "); if (th->psh) nf_log_buf_add(m, "PSH "); if (th->rst) nf_log_buf_add(m, "RST "); if (th->syn) nf_log_buf_add(m, "SYN "); if (th->fin) nf_log_buf_add(m, "FIN "); /* Max length: 11 "URGP=65535 " */ nf_log_buf_add(m, "URGP=%u ", ntohs(th->urg_ptr)); if ((logflags & NF_LOG_TCPOPT) && th->doff * 4 > sizeof(struct tcphdr)) { unsigned int optsize = th->doff * 4 - sizeof(struct tcphdr); u8 _opt[60 - sizeof(struct tcphdr)]; unsigned int i; const u8 *op; op = skb_header_pointer(skb, offset + sizeof(struct tcphdr), optsize, _opt); if (!op) { nf_log_buf_add(m, "OPT (TRUNCATED)"); return 1; } /* Max length: 127 "OPT (" 15*4*2chars ") " */ nf_log_buf_add(m, "OPT ("); for (i = 0; i < optsize; i++) nf_log_buf_add(m, "%02X", op[i]); nf_log_buf_add(m, ") "); } return 0; } static noinline_for_stack int nf_log_dump_udp_header(struct nf_log_buf *m, const struct sk_buff *skb, u8 proto, int fragment, unsigned int offset) { struct udphdr _udph; const struct udphdr *uh; if (proto == IPPROTO_UDP) /* Max length: 10 "PROTO=UDP " */ nf_log_buf_add(m, "PROTO=UDP "); else /* Max length: 14 "PROTO=UDPLITE " */ nf_log_buf_add(m, "PROTO=UDPLITE "); if (fragment) goto out; /* Max length: 25 "INCOMPLETE [65535 bytes] " */ uh = skb_header_pointer(skb, offset, sizeof(_udph), &_udph); if (!uh) { nf_log_buf_add(m, "INCOMPLETE [%u bytes] ", skb->len - offset); return 1; } /* Max length: 20 "SPT=65535 DPT=65535 " */ nf_log_buf_add(m, "SPT=%u DPT=%u LEN=%u ", ntohs(uh->source), ntohs(uh->dest), ntohs(uh->len)); out: return 0; } /* One level of recursion won't kill us */ static noinline_for_stack void dump_ipv4_packet(struct net *net, struct nf_log_buf *m, const struct nf_loginfo *info, const struct sk_buff *skb, unsigned int iphoff) { const struct iphdr *ih; unsigned int logflags; struct iphdr _iph; if (info->type == NF_LOG_TYPE_LOG) logflags = info->u.log.logflags; else logflags = NF_LOG_DEFAULT_MASK; ih = skb_header_pointer(skb, iphoff, sizeof(_iph), &_iph); if (!ih) { nf_log_buf_add(m, "TRUNCATED"); return; } /* Important fields: * TOS, len, DF/MF, fragment offset, TTL, src, dst, options. * Max length: 40 "SRC=255.255.255.255 DST=255.255.255.255 " */ nf_log_buf_add(m, "SRC=%pI4 DST=%pI4 ", &ih->saddr, &ih->daddr); /* Max length: 46 "LEN=65535 TOS=0xFF PREC=0xFF TTL=255 ID=65535 " */ nf_log_buf_add(m, "LEN=%u TOS=0x%02X PREC=0x%02X TTL=%u ID=%u ", iph_totlen(skb, ih), ih->tos & IPTOS_TOS_MASK, ih->tos & IPTOS_PREC_MASK, ih->ttl, ntohs(ih->id)); /* Max length: 6 "CE DF MF " */ if (ntohs(ih->frag_off) & IP_CE) nf_log_buf_add(m, "CE "); if (ntohs(ih->frag_off) & IP_DF) nf_log_buf_add(m, "DF "); if (ntohs(ih->frag_off) & IP_MF) nf_log_buf_add(m, "MF "); /* Max length: 11 "FRAG:65535 " */ if (ntohs(ih->frag_off) & IP_OFFSET) nf_log_buf_add(m, "FRAG:%u ", ntohs(ih->frag_off) & IP_OFFSET); if ((logflags & NF_LOG_IPOPT) && ih->ihl * 4 > sizeof(struct iphdr)) { unsigned char _opt[4 * 15 - sizeof(struct iphdr)]; const unsigned char *op; unsigned int i, optsize; optsize = ih->ihl * 4 - sizeof(struct iphdr); op = skb_header_pointer(skb, iphoff + sizeof(_iph), optsize, _opt); if (!op) { nf_log_buf_add(m, "TRUNCATED"); return; } /* Max length: 127 "OPT (" 15*4*2chars ") " */ nf_log_buf_add(m, "OPT ("); for (i = 0; i < optsize; i++) nf_log_buf_add(m, "%02X", op[i]); nf_log_buf_add(m, ") "); } switch (ih->protocol) { case IPPROTO_TCP: if (nf_log_dump_tcp_header(m, skb, ih->protocol, ntohs(ih->frag_off) & IP_OFFSET, iphoff + ih->ihl * 4, logflags)) return; break; case IPPROTO_UDP: case IPPROTO_UDPLITE: if (nf_log_dump_udp_header(m, skb, ih->protocol, ntohs(ih->frag_off) & IP_OFFSET, iphoff + ih->ihl * 4)) return; break; case IPPROTO_ICMP: { static const size_t required_len[NR_ICMP_TYPES + 1] = { [ICMP_ECHOREPLY] = 4, [ICMP_DEST_UNREACH] = 8 + sizeof(struct iphdr), [ICMP_SOURCE_QUENCH] = 8 + sizeof(struct iphdr), [ICMP_REDIRECT] = 8 + sizeof(struct iphdr), [ICMP_ECHO] = 4, [ICMP_TIME_EXCEEDED] = 8 + sizeof(struct iphdr), [ICMP_PARAMETERPROB] = 8 + sizeof(struct iphdr), [ICMP_TIMESTAMP] = 20, [ICMP_TIMESTAMPREPLY] = 20, [ICMP_ADDRESS] = 12, [ICMP_ADDRESSREPLY] = 12 }; const struct icmphdr *ich; struct icmphdr _icmph; /* Max length: 11 "PROTO=ICMP " */ nf_log_buf_add(m, "PROTO=ICMP "); if (ntohs(ih->frag_off) & IP_OFFSET) break; /* Max length: 25 "INCOMPLETE [65535 bytes] " */ ich = skb_header_pointer(skb, iphoff + ih->ihl * 4, sizeof(_icmph), &_icmph); if (!ich) { nf_log_buf_add(m, "INCOMPLETE [%u bytes] ", skb->len - iphoff - ih->ihl * 4); break; } /* Max length: 18 "TYPE=255 CODE=255 " */ nf_log_buf_add(m, "TYPE=%u CODE=%u ", ich->type, ich->code); /* Max length: 25 "INCOMPLETE [65535 bytes] " */ if (ich->type <= NR_ICMP_TYPES && required_len[ich->type] && skb->len - iphoff - ih->ihl * 4 < required_len[ich->type]) { nf_log_buf_add(m, "INCOMPLETE [%u bytes] ", skb->len - iphoff - ih->ihl * 4); break; } switch (ich->type) { case ICMP_ECHOREPLY: case ICMP_ECHO: /* Max length: 19 "ID=65535 SEQ=65535 " */ nf_log_buf_add(m, "ID=%u SEQ=%u ", ntohs(ich->un.echo.id), ntohs(ich->un.echo.sequence)); break; case ICMP_PARAMETERPROB: /* Max length: 14 "PARAMETER=255 " */ nf_log_buf_add(m, "PARAMETER=%u ", ntohl(ich->un.gateway) >> 24); break; case ICMP_REDIRECT: /* Max length: 24 "GATEWAY=255.255.255.255 " */ nf_log_buf_add(m, "GATEWAY=%pI4 ", &ich->un.gateway); fallthrough; case ICMP_DEST_UNREACH: case ICMP_SOURCE_QUENCH: case ICMP_TIME_EXCEEDED: /* Max length: 3+maxlen */ if (!iphoff) { /* Only recurse once. */ nf_log_buf_add(m, "["); dump_ipv4_packet(net, m, info, skb, iphoff + ih->ihl * 4 + sizeof(_icmph)); nf_log_buf_add(m, "] "); } /* Max length: 10 "MTU=65535 " */ if (ich->type == ICMP_DEST_UNREACH && ich->code == ICMP_FRAG_NEEDED) { nf_log_buf_add(m, "MTU=%u ", ntohs(ich->un.frag.mtu)); } } break; } /* Max Length */ case IPPROTO_AH: { const struct ip_auth_hdr *ah; struct ip_auth_hdr _ahdr; if (ntohs(ih->frag_off) & IP_OFFSET) break; /* Max length: 9 "PROTO=AH " */ nf_log_buf_add(m, "PROTO=AH "); /* Max length: 25 "INCOMPLETE [65535 bytes] " */ ah = skb_header_pointer(skb, iphoff + ih->ihl * 4, sizeof(_ahdr), &_ahdr); if (!ah) { nf_log_buf_add(m, "INCOMPLETE [%u bytes] ", skb->len - iphoff - ih->ihl * 4); break; } /* Length: 15 "SPI=0xF1234567 " */ nf_log_buf_add(m, "SPI=0x%x ", ntohl(ah->spi)); break; } case IPPROTO_ESP: { const struct ip_esp_hdr *eh; struct ip_esp_hdr _esph; /* Max length: 10 "PROTO=ESP " */ nf_log_buf_add(m, "PROTO=ESP "); if (ntohs(ih->frag_off) & IP_OFFSET) break; /* Max length: 25 "INCOMPLETE [65535 bytes] " */ eh = skb_header_pointer(skb, iphoff + ih->ihl * 4, sizeof(_esph), &_esph); if (!eh) { nf_log_buf_add(m, "INCOMPLETE [%u bytes] ", skb->len - iphoff - ih->ihl * 4); break; } /* Length: 15 "SPI=0xF1234567 " */ nf_log_buf_add(m, "SPI=0x%x ", ntohl(eh->spi)); break; } /* Max length: 10 "PROTO 255 " */ default: nf_log_buf_add(m, "PROTO=%u ", ih->protocol); } /* Max length: 15 "UID=4294967295 " */ if ((logflags & NF_LOG_UID) && !iphoff) nf_log_dump_sk_uid_gid(net, m, skb->sk); /* Max length: 16 "MARK=0xFFFFFFFF " */ if (!iphoff && skb->mark) nf_log_buf_add(m, "MARK=0x%x ", skb->mark); /* Proto Max log string length */ /* IP: 40+46+6+11+127 = 230 */ /* TCP: 10+max(25,20+30+13+9+35+11+127) = 255 */ /* UDP: 10+max(25,20) = 35 */ /* UDPLITE: 14+max(25,20) = 39 */ /* ICMP: 11+max(25, 18+25+max(19,14,24+3+n+10,3+n+10)) = 91+n */ /* ESP: 10+max(25)+15 = 50 */ /* AH: 9+max(25)+15 = 49 */ /* unknown: 10 */ /* (ICMP allows recursion one level deep) */ /* maxlen = IP + ICMP + IP + max(TCP,UDP,ICMP,unknown) */ /* maxlen = 230+ 91 + 230 + 255 = 806 */ } static noinline_for_stack void dump_ipv6_packet(struct net *net, struct nf_log_buf *m, const struct nf_loginfo *info, const struct sk_buff *skb, unsigned int ip6hoff, int recurse) { const struct ipv6hdr *ih; unsigned int hdrlen = 0; unsigned int logflags; struct ipv6hdr _ip6h; unsigned int ptr; u8 currenthdr; int fragment; if (info->type == NF_LOG_TYPE_LOG) logflags = info->u.log.logflags; else logflags = NF_LOG_DEFAULT_MASK; ih = skb_header_pointer(skb, ip6hoff, sizeof(_ip6h), &_ip6h); if (!ih) { nf_log_buf_add(m, "TRUNCATED"); return; } /* Max length: 88 "SRC=0000.0000.0000.0000.0000.0000.0000.0000 DST=0000.0000.0000.0000.0000.0000.0000.0000 " */ nf_log_buf_add(m, "SRC=%pI6 DST=%pI6 ", &ih->saddr, &ih->daddr); /* Max length: 44 "LEN=65535 TC=255 HOPLIMIT=255 FLOWLBL=FFFFF " */ nf_log_buf_add(m, "LEN=%zu TC=%u HOPLIMIT=%u FLOWLBL=%u ", ntohs(ih->payload_len) + sizeof(struct ipv6hdr), (ntohl(*(__be32 *)ih) & 0x0ff00000) >> 20, ih->hop_limit, (ntohl(*(__be32 *)ih) & 0x000fffff)); fragment = 0; ptr = ip6hoff + sizeof(struct ipv6hdr); currenthdr = ih->nexthdr; while (currenthdr != NEXTHDR_NONE && nf_ip6_ext_hdr(currenthdr)) { struct ipv6_opt_hdr _hdr; const struct ipv6_opt_hdr *hp; hp = skb_header_pointer(skb, ptr, sizeof(_hdr), &_hdr); if (!hp) { nf_log_buf_add(m, "TRUNCATED"); return; } /* Max length: 48 "OPT (...) " */ if (logflags & NF_LOG_IPOPT) nf_log_buf_add(m, "OPT ( "); switch (currenthdr) { case IPPROTO_FRAGMENT: { struct frag_hdr _fhdr; const struct frag_hdr *fh; nf_log_buf_add(m, "FRAG:"); fh = skb_header_pointer(skb, ptr, sizeof(_fhdr), &_fhdr); if (!fh) { nf_log_buf_add(m, "TRUNCATED "); return; } /* Max length: 6 "65535 " */ nf_log_buf_add(m, "%u ", ntohs(fh->frag_off) & 0xFFF8); /* Max length: 11 "INCOMPLETE " */ if (fh->frag_off & htons(0x0001)) nf_log_buf_add(m, "INCOMPLETE "); nf_log_buf_add(m, "ID:%08x ", ntohl(fh->identification)); if (ntohs(fh->frag_off) & 0xFFF8) fragment = 1; hdrlen = 8; break; } case IPPROTO_DSTOPTS: case IPPROTO_ROUTING: case IPPROTO_HOPOPTS: if (fragment) { if (logflags & NF_LOG_IPOPT) nf_log_buf_add(m, ")"); return; } hdrlen = ipv6_optlen(hp); break; /* Max Length */ case IPPROTO_AH: if (logflags & NF_LOG_IPOPT) { struct ip_auth_hdr _ahdr; const struct ip_auth_hdr *ah; /* Max length: 3 "AH " */ nf_log_buf_add(m, "AH "); if (fragment) { nf_log_buf_add(m, ")"); return; } ah = skb_header_pointer(skb, ptr, sizeof(_ahdr), &_ahdr); if (!ah) { /* Max length: 26 "INCOMPLETE [65535 bytes] )" */ nf_log_buf_add(m, "INCOMPLETE [%u bytes] )", skb->len - ptr); return; } /* Length: 15 "SPI=0xF1234567 */ nf_log_buf_add(m, "SPI=0x%x ", ntohl(ah->spi)); } hdrlen = ipv6_authlen(hp); break; case IPPROTO_ESP: if (logflags & NF_LOG_IPOPT) { struct ip_esp_hdr _esph; const struct ip_esp_hdr *eh; /* Max length: 4 "ESP " */ nf_log_buf_add(m, "ESP "); if (fragment) { nf_log_buf_add(m, ")"); return; } /* Max length: 26 "INCOMPLETE [65535 bytes] )" */ eh = skb_header_pointer(skb, ptr, sizeof(_esph), &_esph); if (!eh) { nf_log_buf_add(m, "INCOMPLETE [%u bytes] )", skb->len - ptr); return; } /* Length: 16 "SPI=0xF1234567 )" */ nf_log_buf_add(m, "SPI=0x%x )", ntohl(eh->spi)); } return; default: /* Max length: 20 "Unknown Ext Hdr 255" */ nf_log_buf_add(m, "Unknown Ext Hdr %u", currenthdr); return; } if (logflags & NF_LOG_IPOPT) nf_log_buf_add(m, ") "); currenthdr = hp->nexthdr; ptr += hdrlen; } switch (currenthdr) { case IPPROTO_TCP: if (nf_log_dump_tcp_header(m, skb, currenthdr, fragment, ptr, logflags)) return; break; case IPPROTO_UDP: case IPPROTO_UDPLITE: if (nf_log_dump_udp_header(m, skb, currenthdr, fragment, ptr)) return; break; case IPPROTO_ICMPV6: { struct icmp6hdr _icmp6h; const struct icmp6hdr *ic; /* Max length: 13 "PROTO=ICMPv6 " */ nf_log_buf_add(m, "PROTO=ICMPv6 "); if (fragment) break; /* Max length: 25 "INCOMPLETE [65535 bytes] " */ ic = skb_header_pointer(skb, ptr, sizeof(_icmp6h), &_icmp6h); if (!ic) { nf_log_buf_add(m, "INCOMPLETE [%u bytes] ", skb->len - ptr); return; } /* Max length: 18 "TYPE=255 CODE=255 " */ nf_log_buf_add(m, "TYPE=%u CODE=%u ", ic->icmp6_type, ic->icmp6_code); switch (ic->icmp6_type) { case ICMPV6_ECHO_REQUEST: case ICMPV6_ECHO_REPLY: /* Max length: 19 "ID=65535 SEQ=65535 " */ nf_log_buf_add(m, "ID=%u SEQ=%u ", ntohs(ic->icmp6_identifier), ntohs(ic->icmp6_sequence)); break; case ICMPV6_MGM_QUERY: case ICMPV6_MGM_REPORT: case ICMPV6_MGM_REDUCTION: break; case ICMPV6_PARAMPROB: /* Max length: 17 "POINTER=ffffffff " */ nf_log_buf_add(m, "POINTER=%08x ", ntohl(ic->icmp6_pointer)); fallthrough; case ICMPV6_DEST_UNREACH: case ICMPV6_PKT_TOOBIG: case ICMPV6_TIME_EXCEED: /* Max length: 3+maxlen */ if (recurse) { nf_log_buf_add(m, "["); dump_ipv6_packet(net, m, info, skb, ptr + sizeof(_icmp6h), 0); nf_log_buf_add(m, "] "); } /* Max length: 10 "MTU=65535 " */ if (ic->icmp6_type == ICMPV6_PKT_TOOBIG) { nf_log_buf_add(m, "MTU=%u ", ntohl(ic->icmp6_mtu)); } } break; } /* Max length: 10 "PROTO=255 " */ default: nf_log_buf_add(m, "PROTO=%u ", currenthdr); } /* Max length: 15 "UID=4294967295 " */ if ((logflags & NF_LOG_UID) && recurse) nf_log_dump_sk_uid_gid(net, m, skb->sk); /* Max length: 16 "MARK=0xFFFFFFFF " */ if (recurse && skb->mark) nf_log_buf_add(m, "MARK=0x%x ", skb->mark); } static void dump_mac_header(struct nf_log_buf *m, const struct nf_loginfo *info, const struct sk_buff *skb) { struct net_device *dev = skb->dev; unsigned int logflags = 0; if (info->type == NF_LOG_TYPE_LOG) logflags = info->u.log.logflags; if (!(logflags & NF_LOG_MACDECODE)) goto fallback; switch (dev->type) { case ARPHRD_ETHER: nf_log_buf_add(m, "MACSRC=%pM MACDST=%pM ", eth_hdr(skb)->h_source, eth_hdr(skb)->h_dest); nf_log_dump_vlan(m, skb); nf_log_buf_add(m, "MACPROTO=%04x ", ntohs(eth_hdr(skb)->h_proto)); return; default: break; } fallback: nf_log_buf_add(m, "MAC="); if (dev->hard_header_len && skb->mac_header != skb->network_header) { const unsigned char *p = skb_mac_header(skb); unsigned int i; if (dev->type == ARPHRD_SIT) { p -= ETH_HLEN; if (p < skb->head) p = NULL; } if (p) { nf_log_buf_add(m, "%02x", *p++); for (i = 1; i < dev->hard_header_len; i++) nf_log_buf_add(m, ":%02x", *p++); } if (dev->type == ARPHRD_SIT) { const struct iphdr *iph = (struct iphdr *)skb_mac_header(skb); nf_log_buf_add(m, " TUNNEL=%pI4->%pI4", &iph->saddr, &iph->daddr); } } nf_log_buf_add(m, " "); } static void nf_log_ip_packet(struct net *net, u_int8_t pf, unsigned int hooknum, const struct sk_buff *skb, const struct net_device *in, const struct net_device *out, const struct nf_loginfo *loginfo, const char *prefix) { struct nf_log_buf *m; if (!nf_log_allowed(net)) return; m = nf_log_buf_open(); if (!loginfo) loginfo = &default_loginfo; nf_log_dump_packet_common(m, pf, hooknum, skb, in, out, loginfo, prefix, net); if (in) dump_mac_header(m, loginfo, skb); dump_ipv4_packet(net, m, loginfo, skb, skb_network_offset(skb)); nf_log_buf_close(m); } static struct nf_logger nf_ip_logger __read_mostly = { .name = "nf_log_ipv4", .type = NF_LOG_TYPE_LOG, .logfn = nf_log_ip_packet, .me = THIS_MODULE, }; static void nf_log_ip6_packet(struct net *net, u_int8_t pf, unsigned int hooknum, const struct sk_buff *skb, const struct net_device *in, const struct net_device *out, const struct nf_loginfo *loginfo, const char *prefix) { struct nf_log_buf *m; if (!nf_log_allowed(net)) return; m = nf_log_buf_open(); if (!loginfo) loginfo = &default_loginfo; nf_log_dump_packet_common(m, pf, hooknum, skb, in, out, loginfo, prefix, net); if (in) dump_mac_header(m, loginfo, skb); dump_ipv6_packet(net, m, loginfo, skb, skb_network_offset(skb), 1); nf_log_buf_close(m); } static struct nf_logger nf_ip6_logger __read_mostly = { .name = "nf_log_ipv6", .type = NF_LOG_TYPE_LOG, .logfn = nf_log_ip6_packet, .me = THIS_MODULE, }; static void nf_log_unknown_packet(struct net *net, u_int8_t pf, unsigned int hooknum, const struct sk_buff *skb, const struct net_device *in, const struct net_device *out, const struct nf_loginfo *loginfo, const char *prefix) { struct nf_log_buf *m; if (!nf_log_allowed(net)) return; m = nf_log_buf_open(); if (!loginfo) loginfo = &default_loginfo; nf_log_dump_packet_common(m, pf, hooknum, skb, in, out, loginfo, prefix, net); dump_mac_header(m, loginfo, skb); nf_log_buf_close(m); } static void nf_log_netdev_packet(struct net *net, u_int8_t pf, unsigned int hooknum, const struct sk_buff *skb, const struct net_device *in, const struct net_device *out, const struct nf_loginfo *loginfo, const char *prefix) { switch (skb->protocol) { case htons(ETH_P_IP): nf_log_ip_packet(net, pf, hooknum, skb, in, out, loginfo, prefix); break; case htons(ETH_P_IPV6): nf_log_ip6_packet(net, pf, hooknum, skb, in, out, loginfo, prefix); break; case htons(ETH_P_ARP): case htons(ETH_P_RARP): nf_log_arp_packet(net, pf, hooknum, skb, in, out, loginfo, prefix); break; default: nf_log_unknown_packet(net, pf, hooknum, skb, in, out, loginfo, prefix); break; } } static struct nf_logger nf_netdev_logger __read_mostly = { .name = "nf_log_netdev", .type = NF_LOG_TYPE_LOG, .logfn = nf_log_netdev_packet, .me = THIS_MODULE, }; static struct nf_logger nf_bridge_logger __read_mostly = { .name = "nf_log_bridge", .type = NF_LOG_TYPE_LOG, .logfn = nf_log_netdev_packet, .me = THIS_MODULE, }; static int __net_init nf_log_syslog_net_init(struct net *net) { int ret = nf_log_set(net, NFPROTO_IPV4, &nf_ip_logger); if (ret) return ret; ret = nf_log_set(net, NFPROTO_ARP, &nf_arp_logger); if (ret) goto err1; ret = nf_log_set(net, NFPROTO_IPV6, &nf_ip6_logger); if (ret) goto err2; ret = nf_log_set(net, NFPROTO_NETDEV, &nf_netdev_logger); if (ret) goto err3; ret = nf_log_set(net, NFPROTO_BRIDGE, &nf_bridge_logger); if (ret) goto err4; return 0; err4: nf_log_unset(net, &nf_netdev_logger); err3: nf_log_unset(net, &nf_ip6_logger); err2: nf_log_unset(net, &nf_arp_logger); err1: nf_log_unset(net, &nf_ip_logger); return ret; } static void __net_exit nf_log_syslog_net_exit(struct net *net) { nf_log_unset(net, &nf_ip_logger); nf_log_unset(net, &nf_arp_logger); nf_log_unset(net, &nf_ip6_logger); nf_log_unset(net, &nf_netdev_logger); nf_log_unset(net, &nf_bridge_logger); } static struct pernet_operations nf_log_syslog_net_ops = { .init = nf_log_syslog_net_init, .exit = nf_log_syslog_net_exit, }; static int __init nf_log_syslog_init(void) { int ret; ret = register_pernet_subsys(&nf_log_syslog_net_ops); if (ret < 0) return ret; ret = nf_log_register(NFPROTO_IPV4, &nf_ip_logger); if (ret < 0) goto err1; ret = nf_log_register(NFPROTO_ARP, &nf_arp_logger); if (ret < 0) goto err2; ret = nf_log_register(NFPROTO_IPV6, &nf_ip6_logger); if (ret < 0) goto err3; ret = nf_log_register(NFPROTO_NETDEV, &nf_netdev_logger); if (ret < 0) goto err4; ret = nf_log_register(NFPROTO_BRIDGE, &nf_bridge_logger); if (ret < 0) goto err5; return 0; err5: nf_log_unregister(&nf_netdev_logger); err4: nf_log_unregister(&nf_ip6_logger); err3: nf_log_unregister(&nf_arp_logger); err2: nf_log_unregister(&nf_ip_logger); err1: pr_err("failed to register logger\n"); unregister_pernet_subsys(&nf_log_syslog_net_ops); return ret; } static void __exit nf_log_syslog_exit(void) { unregister_pernet_subsys(&nf_log_syslog_net_ops); nf_log_unregister(&nf_ip_logger); nf_log_unregister(&nf_arp_logger); nf_log_unregister(&nf_ip6_logger); nf_log_unregister(&nf_netdev_logger); nf_log_unregister(&nf_bridge_logger); } module_init(nf_log_syslog_init); module_exit(nf_log_syslog_exit); MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>"); MODULE_DESCRIPTION("Netfilter syslog packet logging"); MODULE_LICENSE("GPL"); MODULE_ALIAS("nf_log_arp"); MODULE_ALIAS("nf_log_bridge"); MODULE_ALIAS("nf_log_ipv4"); MODULE_ALIAS("nf_log_ipv6"); MODULE_ALIAS("nf_log_netdev"); MODULE_ALIAS_NF_LOGGER(AF_BRIDGE, 0); MODULE_ALIAS_NF_LOGGER(AF_INET, 0); MODULE_ALIAS_NF_LOGGER(3, 0); MODULE_ALIAS_NF_LOGGER(5, 0); /* NFPROTO_NETDEV */ MODULE_ALIAS_NF_LOGGER(AF_INET6, 0);
484 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 /* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (c) 2000-2001,2005 Silicon Graphics, Inc. * All Rights Reserved. */ #ifndef __XFS_BTREE_H__ #define __XFS_BTREE_H__ struct xfs_buf; struct xfs_inode; struct xfs_mount; struct xfs_trans; struct xfs_ifork; struct xfs_perag; /* * Generic key, ptr and record wrapper structures. * * These are disk format structures, and are converted where necessary * by the btree specific code that needs to interpret them. */ union xfs_btree_ptr { __be32 s; /* short form ptr */ __be64 l; /* long form ptr */ }; /* * The in-core btree key. Overlapping btrees actually store two keys * per pointer, so we reserve enough memory to hold both. The __*bigkey * items should never be accessed directly. */ union xfs_btree_key { struct xfs_bmbt_key bmbt; xfs_bmdr_key_t bmbr; /* bmbt root block */ xfs_alloc_key_t alloc; struct xfs_inobt_key inobt; struct xfs_rmap_key rmap; struct xfs_rmap_key __rmap_bigkey[2]; struct xfs_refcount_key refc; }; union xfs_btree_rec { struct xfs_bmbt_rec bmbt; xfs_bmdr_rec_t bmbr; /* bmbt root block */ struct xfs_alloc_rec alloc; struct xfs_inobt_rec inobt; struct xfs_rmap_rec rmap; struct xfs_refcount_rec refc; }; /* * This nonsense is to make -wlint happy. */ #define XFS_LOOKUP_EQ ((xfs_lookup_t)XFS_LOOKUP_EQi) #define XFS_LOOKUP_LE ((xfs_lookup_t)XFS_LOOKUP_LEi) #define XFS_LOOKUP_GE ((xfs_lookup_t)XFS_LOOKUP_GEi) struct xfs_btree_ops; uint32_t xfs_btree_magic(struct xfs_mount *mp, const struct xfs_btree_ops *ops); /* * For logging record fields. */ #define XFS_BB_MAGIC (1u << 0) #define XFS_BB_LEVEL (1u << 1) #define XFS_BB_NUMRECS (1u << 2) #define XFS_BB_LEFTSIB (1u << 3) #define XFS_BB_RIGHTSIB (1u << 4) #define XFS_BB_BLKNO (1u << 5) #define XFS_BB_LSN (1u << 6) #define XFS_BB_UUID (1u << 7) #define XFS_BB_OWNER (1u << 8) #define XFS_BB_NUM_BITS 5 #define XFS_BB_ALL_BITS ((1u << XFS_BB_NUM_BITS) - 1) #define XFS_BB_NUM_BITS_CRC 9 #define XFS_BB_ALL_BITS_CRC ((1u << XFS_BB_NUM_BITS_CRC) - 1) /* * Generic stats interface */ #define XFS_BTREE_STATS_INC(cur, stat) \ XFS_STATS_INC_OFF((cur)->bc_mp, \ (cur)->bc_ops->statoff + __XBTS_ ## stat) #define XFS_BTREE_STATS_ADD(cur, stat, val) \ XFS_STATS_ADD_OFF((cur)->bc_mp, \ (cur)->bc_ops->statoff + __XBTS_ ## stat, val) enum xbtree_key_contig { XBTREE_KEY_GAP = 0, XBTREE_KEY_CONTIGUOUS, XBTREE_KEY_OVERLAP, }; /* * Decide if these two numeric btree key fields are contiguous, overlapping, * or if there's a gap between them. @x should be the field from the high * key and @y should be the field from the low key. */ static inline enum xbtree_key_contig xbtree_key_contig(uint64_t x, uint64_t y) { x++; if (x < y) return XBTREE_KEY_GAP; if (x == y) return XBTREE_KEY_CONTIGUOUS; return XBTREE_KEY_OVERLAP; } #define XFS_BTREE_LONG_PTR_LEN (sizeof(__be64)) #define XFS_BTREE_SHORT_PTR_LEN (sizeof(__be32)) enum xfs_btree_type { XFS_BTREE_TYPE_AG, XFS_BTREE_TYPE_INODE, XFS_BTREE_TYPE_MEM, }; struct xfs_btree_ops { const char *name; /* Type of btree - AG-rooted or inode-rooted */ enum xfs_btree_type type; /* XFS_BTGEO_* flags that determine the geometry of the btree */ unsigned int geom_flags; /* size of the key, pointer, and record structures */ size_t key_len; size_t ptr_len; size_t rec_len; /* LRU refcount to set on each btree buffer created */ unsigned int lru_refs; /* offset of btree stats array */ unsigned int statoff; /* sick mask for health reporting (not for bmap btrees) */ unsigned int sick_mask; /* cursor operations */ struct xfs_btree_cur *(*dup_cursor)(struct xfs_btree_cur *); void (*update_cursor)(struct xfs_btree_cur *src, struct xfs_btree_cur *dst); /* update btree root pointer */ void (*set_root)(struct xfs_btree_cur *cur, const union xfs_btree_ptr *nptr, int level_change); /* block allocation / freeing */ int (*alloc_block)(struct xfs_btree_cur *cur, const union xfs_btree_ptr *start_bno, union xfs_btree_ptr *new_bno, int *stat); int (*free_block)(struct xfs_btree_cur *cur, struct xfs_buf *bp); /* records in block/level */ int (*get_minrecs)(struct xfs_btree_cur *cur, int level); int (*get_maxrecs)(struct xfs_btree_cur *cur, int level); /* records on disk. Matter for the root in inode case. */ int (*get_dmaxrecs)(struct xfs_btree_cur *cur, int level); /* init values of btree structures */ void (*init_key_from_rec)(union xfs_btree_key *key, const union xfs_btree_rec *rec); void (*init_rec_from_cur)(struct xfs_btree_cur *cur, union xfs_btree_rec *rec); void (*init_ptr_from_cur)(struct xfs_btree_cur *cur, union xfs_btree_ptr *ptr); void (*init_high_key_from_rec)(union xfs_btree_key *key, const union xfs_btree_rec *rec); /* * Compare key value and cursor value -- positive if key > cur, * negative if key < cur, and zero if equal. */ int (*cmp_key_with_cur)(struct xfs_btree_cur *cur, const union xfs_btree_key *key); /* * Compare key1 and key2 -- positive if key1 > key2, negative if * key1 < key2, and zero if equal. If the @mask parameter is non NULL, * each key field to be used in the comparison must contain a nonzero * value. */ int (*cmp_two_keys)(struct xfs_btree_cur *cur, const union xfs_btree_key *key1, const union xfs_btree_key *key2, const union xfs_btree_key *mask); const struct xfs_buf_ops *buf_ops; /* check that k1 is lower than k2 */ int (*keys_inorder)(struct xfs_btree_cur *cur, const union xfs_btree_key *k1, const union xfs_btree_key *k2); /* check that r1 is lower than r2 */ int (*recs_inorder)(struct xfs_btree_cur *cur, const union xfs_btree_rec *r1, const union xfs_btree_rec *r2); /* * Are these two btree keys immediately adjacent? * * Given two btree keys @key1 and @key2, decide if it is impossible for * there to be a third btree key K satisfying the relationship * @key1 < K < @key2. To determine if two btree records are * immediately adjacent, @key1 should be the high key of the first * record and @key2 should be the low key of the second record. * If the @mask parameter is non NULL, each key field to be used in the * comparison must contain a nonzero value. */ enum xbtree_key_contig (*keys_contiguous)(struct xfs_btree_cur *cur, const union xfs_btree_key *key1, const union xfs_btree_key *key2, const union xfs_btree_key *mask); /* * Reallocate the space for if_broot to fit the number of records. * Move the records and pointers in if_broot to fit the new size. When * shrinking this will eliminate holes between the records and pointers * created by the caller. When growing this will create holes to be * filled in by the caller. * * The caller must not request to add more records than would fit in * the on-disk inode root. If the if_broot is currently NULL, then if * we are adding records, one will be allocated. The caller must also * not request that the number of records go below zero, although it * can go to zero. */ struct xfs_btree_block *(*broot_realloc)(struct xfs_btree_cur *cur, unsigned int new_numrecs); }; /* btree geometry flags */ #define XFS_BTGEO_OVERLAPPING (1U << 0) /* overlapping intervals */ #define XFS_BTGEO_IROOT_RECORDS (1U << 1) /* iroot can store records */ union xfs_btree_irec { struct xfs_alloc_rec_incore a; struct xfs_bmbt_irec b; struct xfs_inobt_rec_incore i; struct xfs_rmap_irec r; struct xfs_refcount_irec rc; }; struct xfs_btree_level { /* buffer pointer */ struct xfs_buf *bp; /* key/record number */ uint16_t ptr; /* readahead info */ #define XFS_BTCUR_LEFTRA (1 << 0) /* left sibling has been read-ahead */ #define XFS_BTCUR_RIGHTRA (1 << 1) /* right sibling has been read-ahead */ uint16_t ra; }; /* * Btree cursor structure. * This collects all information needed by the btree code in one place. */ struct xfs_btree_cur { struct xfs_trans *bc_tp; /* transaction we're in, if any */ struct xfs_mount *bc_mp; /* file system mount struct */ const struct xfs_btree_ops *bc_ops; struct kmem_cache *bc_cache; /* cursor cache */ unsigned int bc_flags; /* btree features - below */ union xfs_btree_irec bc_rec; /* current insert/search record value */ uint8_t bc_nlevels; /* number of levels in the tree */ uint8_t bc_maxlevels; /* maximum levels for this btree type */ struct xfs_group *bc_group; /* per-type information */ union { struct { struct xfs_inode *ip; short forksize; char whichfork; struct xbtree_ifakeroot *ifake; /* for staging cursor */ } bc_ino; struct { struct xfs_buf *agbp; struct xbtree_afakeroot *afake; /* for staging cursor */ } bc_ag; struct { struct xfbtree *xfbtree; } bc_mem; }; /* per-format private data */ union { struct { int allocated; } bc_bmap; /* bmapbt */ struct { unsigned int nr_ops; /* # record updates */ unsigned int shape_changes; /* # of extent splits */ } bc_refc; /* refcountbt/rtrefcountbt */ }; /* Must be at the end of the struct! */ struct xfs_btree_level bc_levels[]; }; /* * Compute the size of a btree cursor that can handle a btree of a given * height. The bc_levels array handles node and leaf blocks, so its size * is exactly nlevels. */ static inline size_t xfs_btree_cur_sizeof(unsigned int nlevels) { return struct_size_t(struct xfs_btree_cur, bc_levels, nlevels); } /* cursor state flags */ /* * The root of this btree is a fakeroot structure so that we can stage a btree * rebuild without leaving it accessible via primary metadata. The ops struct * is dynamically allocated and must be freed when the cursor is deleted. */ #define XFS_BTREE_STAGING (1U << 0) /* We are converting a delalloc reservation (only for bmbt btrees) */ #define XFS_BTREE_BMBT_WASDEL (1U << 1) /* For extent swap, ignore owner check in verifier (only for bmbt btrees) */ #define XFS_BTREE_BMBT_INVALID_OWNER (1U << 2) /* Cursor is active (only for allocbt btrees) */ #define XFS_BTREE_ALLOCBT_ACTIVE (1U << 3) #define XFS_BTREE_NOERROR 0 #define XFS_BTREE_ERROR 1 /* * Convert from buffer to btree block header. */ #define XFS_BUF_TO_BLOCK(bp) ((struct xfs_btree_block *)((bp)->b_addr)) xfs_failaddr_t __xfs_btree_check_block(struct xfs_btree_cur *cur, struct xfs_btree_block *block, int level, struct xfs_buf *bp); int __xfs_btree_check_ptr(struct xfs_btree_cur *cur, const union xfs_btree_ptr *ptr, int index, int level); /* * Check that block header is ok. */ int xfs_btree_check_block( struct xfs_btree_cur *cur, /* btree cursor */ struct xfs_btree_block *block, /* generic btree block pointer */ int level, /* level of the btree block */ struct xfs_buf *bp); /* buffer containing block, if any */ /* * Delete the btree cursor. */ void xfs_btree_del_cursor( struct xfs_btree_cur *cur, /* btree cursor */ int error); /* del because of error */ /* * Duplicate the btree cursor. * Allocate a new one, copy the record, re-get the buffers. */ int /* error */ xfs_btree_dup_cursor( struct xfs_btree_cur *cur, /* input cursor */ struct xfs_btree_cur **ncur);/* output cursor */ /* * Compute first and last byte offsets for the fields given. * Interprets the offsets table, which contains struct field offsets. */ void xfs_btree_offsets( uint32_t fields, /* bitmask of fields */ const short *offsets,/* table of field offsets */ int nbits, /* number of bits to inspect */ int *first, /* output: first byte offset */ int *last); /* output: last byte offset */ /* * Initialise a new btree block header */ void xfs_btree_init_buf(struct xfs_mount *mp, struct xfs_buf *bp, const struct xfs_btree_ops *ops, __u16 level, __u16 numrecs, __u64 owner); void xfs_btree_init_block(struct xfs_mount *mp, struct xfs_btree_block *buf, const struct xfs_btree_ops *ops, __u16 level, __u16 numrecs, __u64 owner); /* * Common btree core entry points. */ int xfs_btree_increment(struct xfs_btree_cur *, int, int *); int xfs_btree_decrement(struct xfs_btree_cur *, int, int *); int xfs_btree_lookup(struct xfs_btree_cur *, xfs_lookup_t, int *); int xfs_btree_update(struct xfs_btree_cur *, union xfs_btree_rec *); int xfs_btree_new_iroot(struct xfs_btree_cur *, int *, int *); int xfs_btree_insert(struct xfs_btree_cur *, int *); int xfs_btree_delete(struct xfs_btree_cur *, int *); int xfs_btree_get_rec(struct xfs_btree_cur *, union xfs_btree_rec **, int *); int xfs_btree_change_owner(struct xfs_btree_cur *cur, uint64_t new_owner, struct list_head *buffer_list); /* * btree block CRC helpers */ void xfs_btree_fsblock_calc_crc(struct xfs_buf *); bool xfs_btree_fsblock_verify_crc(struct xfs_buf *); void xfs_btree_agblock_calc_crc(struct xfs_buf *); bool xfs_btree_agblock_verify_crc(struct xfs_buf *); /* * Internal btree helpers also used by xfs_bmap.c. */ void xfs_btree_log_block(struct xfs_btree_cur *, struct xfs_buf *, uint32_t); void xfs_btree_log_recs(struct xfs_btree_cur *, struct xfs_buf *, int, int); /* * Helpers. */ static inline int xfs_btree_get_numrecs(const struct xfs_btree_block *block) { return be16_to_cpu(block->bb_numrecs); } static inline void xfs_btree_set_numrecs(struct xfs_btree_block *block, uint16_t numrecs) { block->bb_numrecs = cpu_to_be16(numrecs); } static inline int xfs_btree_get_level(const struct xfs_btree_block *block) { return be16_to_cpu(block->bb_level); } /* * Min and max functions for extlen, agblock, fileoff, and filblks types. */ #define XFS_EXTLEN_MIN(a,b) min_t(xfs_extlen_t, (a), (b)) #define XFS_EXTLEN_MAX(a,b) max_t(xfs_extlen_t, (a), (b)) #define XFS_AGBLOCK_MIN(a,b) min_t(xfs_agblock_t, (a), (b)) #define XFS_AGBLOCK_MAX(a,b) max_t(xfs_agblock_t, (a), (b)) #define XFS_FILEOFF_MIN(a,b) min_t(xfs_fileoff_t, (a), (b)) #define XFS_FILEOFF_MAX(a,b) max_t(xfs_fileoff_t, (a), (b)) #define XFS_FILBLKS_MIN(a,b) min_t(xfs_filblks_t, (a), (b)) #define XFS_FILBLKS_MAX(a,b) max_t(xfs_filblks_t, (a), (b)) xfs_failaddr_t xfs_btree_agblock_v5hdr_verify(struct xfs_buf *bp); xfs_failaddr_t xfs_btree_agblock_verify(struct xfs_buf *bp, unsigned int max_recs); xfs_failaddr_t xfs_btree_fsblock_v5hdr_verify(struct xfs_buf *bp, uint64_t owner); xfs_failaddr_t xfs_btree_fsblock_verify(struct xfs_buf *bp, unsigned int max_recs); xfs_failaddr_t xfs_btree_memblock_verify(struct xfs_buf *bp, unsigned int max_recs); unsigned int xfs_btree_compute_maxlevels(const unsigned int *limits, unsigned long long records); unsigned long long xfs_btree_calc_size(const unsigned int *limits, unsigned long long records); unsigned int xfs_btree_space_to_height(const unsigned int *limits, unsigned long long blocks); /* * Return codes for the query range iterator function are 0 to continue * iterating, and non-zero to stop iterating. Any non-zero value will be * passed up to the _query_range caller. The special value -ECANCELED can be * used to stop iteration, because _query_range never generates that error * code on its own. */ typedef int (*xfs_btree_query_range_fn)(struct xfs_btree_cur *cur, const union xfs_btree_rec *rec, void *priv); int xfs_btree_query_range(struct xfs_btree_cur *cur, const union xfs_btree_irec *low_rec, const union xfs_btree_irec *high_rec, xfs_btree_query_range_fn fn, void *priv); int xfs_btree_query_all(struct xfs_btree_cur *cur, xfs_btree_query_range_fn fn, void *priv); typedef int (*xfs_btree_visit_blocks_fn)(struct xfs_btree_cur *cur, int level, void *data); /* Visit record blocks. */ #define XFS_BTREE_VISIT_RECORDS (1 << 0) /* Visit leaf blocks. */ #define XFS_BTREE_VISIT_LEAVES (1 << 1) /* Visit all blocks. */ #define XFS_BTREE_VISIT_ALL (XFS_BTREE_VISIT_RECORDS | \ XFS_BTREE_VISIT_LEAVES) int xfs_btree_visit_blocks(struct xfs_btree_cur *cur, xfs_btree_visit_blocks_fn fn, unsigned int flags, void *data); int xfs_btree_count_blocks(struct xfs_btree_cur *cur, xfs_filblks_t *blocks); union xfs_btree_rec *xfs_btree_rec_addr(struct xfs_btree_cur *cur, int n, struct xfs_btree_block *block); union xfs_btree_key *xfs_btree_key_addr(struct xfs_btree_cur *cur, int n, struct xfs_btree_block *block); union xfs_btree_key *xfs_btree_high_key_addr(struct xfs_btree_cur *cur, int n, struct xfs_btree_block *block); union xfs_btree_ptr *xfs_btree_ptr_addr(struct xfs_btree_cur *cur, int n, struct xfs_btree_block *block); int xfs_btree_lookup_get_block(struct xfs_btree_cur *cur, int level, const union xfs_btree_ptr *pp, struct xfs_btree_block **blkp); struct xfs_btree_block *xfs_btree_get_block(struct xfs_btree_cur *cur, int level, struct xfs_buf **bpp); bool xfs_btree_ptr_is_null(struct xfs_btree_cur *cur, const union xfs_btree_ptr *ptr); int xfs_btree_cmp_two_ptrs(struct xfs_btree_cur *cur, const union xfs_btree_ptr *a, const union xfs_btree_ptr *b); void xfs_btree_get_sibling(struct xfs_btree_cur *cur, struct xfs_btree_block *block, union xfs_btree_ptr *ptr, int lr); void xfs_btree_get_keys(struct xfs_btree_cur *cur, struct xfs_btree_block *block, union xfs_btree_key *key); union xfs_btree_key *xfs_btree_high_key_from_key(struct xfs_btree_cur *cur, union xfs_btree_key *key); typedef bool (*xfs_btree_key_gap_fn)(struct xfs_btree_cur *cur, const union xfs_btree_key *key1, const union xfs_btree_key *key2); int xfs_btree_has_records(struct xfs_btree_cur *cur, const union xfs_btree_irec *low, const union xfs_btree_irec *high, const union xfs_btree_key *mask, enum xbtree_recpacking *outcome); bool xfs_btree_has_more_records(struct xfs_btree_cur *cur); struct xfs_ifork *xfs_btree_ifork_ptr(struct xfs_btree_cur *cur); /* Key comparison helpers */ static inline bool xfs_btree_keycmp_lt( struct xfs_btree_cur *cur, const union xfs_btree_key *key1, const union xfs_btree_key *key2) { return cur->bc_ops->cmp_two_keys(cur, key1, key2, NULL) < 0; } static inline bool xfs_btree_keycmp_gt( struct xfs_btree_cur *cur, const union xfs_btree_key *key1, const union xfs_btree_key *key2) { return cur->bc_ops->cmp_two_keys(cur, key1, key2, NULL) > 0; } static inline bool xfs_btree_keycmp_eq( struct xfs_btree_cur *cur, const union xfs_btree_key *key1, const union xfs_btree_key *key2) { return cur->bc_ops->cmp_two_keys(cur, key1, key2, NULL) == 0; } static inline bool xfs_btree_keycmp_le( struct xfs_btree_cur *cur, const union xfs_btree_key *key1, const union xfs_btree_key *key2) { return !xfs_btree_keycmp_gt(cur, key1, key2); } static inline bool xfs_btree_keycmp_ge( struct xfs_btree_cur *cur, const union xfs_btree_key *key1, const union xfs_btree_key *key2) { return !xfs_btree_keycmp_lt(cur, key1, key2); } static inline bool xfs_btree_keycmp_ne( struct xfs_btree_cur *cur, const union xfs_btree_key *key1, const union xfs_btree_key *key2) { return !xfs_btree_keycmp_eq(cur, key1, key2); } /* Masked key comparison helpers */ static inline bool xfs_btree_masked_keycmp_lt( struct xfs_btree_cur *cur, const union xfs_btree_key *key1, const union xfs_btree_key *key2, const union xfs_btree_key *mask) { return cur->bc_ops->cmp_two_keys(cur, key1, key2, mask) < 0; } static inline bool xfs_btree_masked_keycmp_gt( struct xfs_btree_cur *cur, const union xfs_btree_key *key1, const union xfs_btree_key *key2, const union xfs_btree_key *mask) { return cur->bc_ops->cmp_two_keys(cur, key1, key2, mask) > 0; } static inline bool xfs_btree_masked_keycmp_ge( struct xfs_btree_cur *cur, const union xfs_btree_key *key1, const union xfs_btree_key *key2, const union xfs_btree_key *mask) { return !xfs_btree_masked_keycmp_lt(cur, key1, key2, mask); } /* Does this cursor point to the last block in the given level? */ static inline bool xfs_btree_islastblock( struct xfs_btree_cur *cur, int level) { struct xfs_btree_block *block; struct xfs_buf *bp; block = xfs_btree_get_block(cur, level, &bp); if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN) return block->bb_u.l.bb_rightsib == cpu_to_be64(NULLFSBLOCK); return block->bb_u.s.bb_rightsib == cpu_to_be32(NULLAGBLOCK); } void xfs_btree_set_ptr_null(struct xfs_btree_cur *cur, union xfs_btree_ptr *ptr); int xfs_btree_get_buf_block(struct xfs_btree_cur *cur, const union xfs_btree_ptr *ptr, struct xfs_btree_block **block, struct xfs_buf **bpp); int xfs_btree_read_buf_block(struct xfs_btree_cur *cur, const union xfs_btree_ptr *ptr, int flags, struct xfs_btree_block **block, struct xfs_buf **bpp); void xfs_btree_set_sibling(struct xfs_btree_cur *cur, struct xfs_btree_block *block, const union xfs_btree_ptr *ptr, int lr); void xfs_btree_init_block_cur(struct xfs_btree_cur *cur, struct xfs_buf *bp, int level, int numrecs); void xfs_btree_copy_ptrs(struct xfs_btree_cur *cur, union xfs_btree_ptr *dst_ptr, const union xfs_btree_ptr *src_ptr, int numptrs); void xfs_btree_copy_keys(struct xfs_btree_cur *cur, union xfs_btree_key *dst_key, const union xfs_btree_key *src_key, int numkeys); void xfs_btree_init_ptr_from_cur(struct xfs_btree_cur *cur, union xfs_btree_ptr *ptr); static inline struct xfs_btree_cur * xfs_btree_alloc_cursor( struct xfs_mount *mp, struct xfs_trans *tp, const struct xfs_btree_ops *ops, uint8_t maxlevels, struct kmem_cache *cache) { struct xfs_btree_cur *cur; ASSERT(ops->ptr_len == XFS_BTREE_LONG_PTR_LEN || ops->ptr_len == XFS_BTREE_SHORT_PTR_LEN); /* BMBT allocations can come through from non-transactional context. */ cur = kmem_cache_zalloc(cache, GFP_KERNEL | __GFP_NOLOCKDEP | __GFP_NOFAIL); cur->bc_ops = ops; cur->bc_tp = tp; cur->bc_mp = mp; cur->bc_maxlevels = maxlevels; cur->bc_cache = cache; return cur; } int __init xfs_btree_init_cur_caches(void); void xfs_btree_destroy_cur_caches(void); int xfs_btree_goto_left_edge(struct xfs_btree_cur *cur); /* Does this level of the cursor point to the inode root (and not a block)? */ static inline bool xfs_btree_at_iroot( const struct xfs_btree_cur *cur, int level) { return cur->bc_ops->type == XFS_BTREE_TYPE_INODE && level == cur->bc_nlevels - 1; } int xfs_btree_alloc_metafile_block(struct xfs_btree_cur *cur, const union xfs_btree_ptr *start, union xfs_btree_ptr *newp, int *stat); int xfs_btree_free_metafile_block(struct xfs_btree_cur *cur, struct xfs_buf *bp); #endif /* __XFS_BTREE_H__ */
4 10 4 11 11 156 2 152 8 169 6 6 6 6 169 174 5 5 1 1 175 168 8 23 26 20 41 34 24 26 23 41 37 203 22 184 12 193 203 215 209 209 14 122 83 134 84 2 2 137 2 181 107 14 177 7 182 111 70 162 183 111 79 171 213 22 4 4 4 3 1 4 1 3 2 1 2 2 2 1 2 1 2 2 1 1 1 1 13 3 2 1 2 1 1 1 1 1 2 184 66 189 51 203 13 184 65 192 50 214 214 1 15 15 1 1 15 15 1 14 160 21 21 20 10 1 11 11 11 214 215 123 123 91 90 1 215 200 16 214 215 16 201 213 215 178 38 215 215 215 215 41 5 185 1 185 5 182 72 213 1 13 13 13 213 203 13 205 14 3 211 103 182 27 2 2 183 66 205 11 259 259 243 177 178 4 87 86 193 183 174 13 160 83 74 80 21 21 21 175 174 50 42 2 40 40 47 156 1 75 75 74 2 2 132 121 24 49 175 175 175 175 6 1 170 146 94 174 4 40 40 40 39 20 57 4 4 4 4 4 6 6 6 6 39 39 39 57 4 4 4 6 39 57 49 4 35 18 55 1 52 2 34 20 36 18 49 2 2 50 4 52 2 51 3 52 2 52 2 52 2 50 4 49 3 2 48 3 3 50 4 50 4 47 4 3 49 5 54 2 2 29 58 3 55 57 57 57 57 61 2 59 60 60 61 1 1 3 1 3 3 1 1 1 1 1 1 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 // SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause /* COMMON Applications Kept Enhanced (CAKE) discipline * * Copyright (C) 2014-2018 Jonathan Morton <chromatix99@gmail.com> * Copyright (C) 2015-2018 Toke Høiland-Jørgensen <toke@toke.dk> * Copyright (C) 2014-2018 Dave Täht <dave.taht@gmail.com> * Copyright (C) 2015-2018 Sebastian Moeller <moeller0@gmx.de> * (C) 2015-2018 Kevin Darbyshire-Bryant <kevin@darbyshire-bryant.me.uk> * Copyright (C) 2017-2018 Ryan Mounce <ryan@mounce.com.au> * * The CAKE Principles: * (or, how to have your cake and eat it too) * * This is a combination of several shaping, AQM and FQ techniques into one * easy-to-use package: * * - An overall bandwidth shaper, to move the bottleneck away from dumb CPE * equipment and bloated MACs. This operates in deficit mode (as in sch_fq), * eliminating the need for any sort of burst parameter (eg. token bucket * depth). Burst support is limited to that necessary to overcome scheduling * latency. * * - A Diffserv-aware priority queue, giving more priority to certain classes, * up to a specified fraction of bandwidth. Above that bandwidth threshold, * the priority is reduced to avoid starving other tins. * * - Each priority tin has a separate Flow Queue system, to isolate traffic * flows from each other. This prevents a burst on one flow from increasing * the delay to another. Flows are distributed to queues using a * set-associative hash function. * * - Each queue is actively managed by Cobalt, which is a combination of the * Codel and Blue AQM algorithms. This serves flows fairly, and signals * congestion early via ECN (if available) and/or packet drops, to keep * latency low. The codel parameters are auto-tuned based on the bandwidth * setting, as is necessary at low bandwidths. * * The configuration parameters are kept deliberately simple for ease of use. * Everything has sane defaults. Complete generality of configuration is *not* * a goal. * * The priority queue operates according to a weighted DRR scheme, combined with * a bandwidth tracker which reuses the shaper logic to detect which side of the * bandwidth sharing threshold the tin is operating. This determines whether a * priority-based weight (high) or a bandwidth-based weight (low) is used for * that tin in the current pass. * * This qdisc was inspired by Eric Dumazet's fq_codel code, which he kindly * granted us permission to leverage. */ #include <linux/module.h> #include <linux/types.h> #include <linux/kernel.h> #include <linux/jiffies.h> #include <linux/string.h> #include <linux/in.h> #include <linux/errno.h> #include <linux/init.h> #include <linux/skbuff.h> #include <linux/jhash.h> #include <linux/slab.h> #include <linux/vmalloc.h> #include <linux/reciprocal_div.h> #include <net/netlink.h> #include <linux/if_vlan.h> #include <net/gso.h> #include <net/pkt_sched.h> #include <net/pkt_cls.h> #include <net/tcp.h> #include <net/flow_dissector.h> #if IS_ENABLED(CONFIG_NF_CONNTRACK) #include <net/netfilter/nf_conntrack_core.h> #endif #define CAKE_SET_WAYS (8) #define CAKE_MAX_TINS (8) #define CAKE_QUEUES (1024) #define CAKE_FLOW_MASK 63 #define CAKE_FLOW_NAT_FLAG 64 /* struct cobalt_params - contains codel and blue parameters * @interval: codel initial drop rate * @target: maximum persistent sojourn time & blue update rate * @mtu_time: serialisation delay of maximum-size packet * @p_inc: increment of blue drop probability (0.32 fxp) * @p_dec: decrement of blue drop probability (0.32 fxp) */ struct cobalt_params { u64 interval; u64 target; u64 mtu_time; u32 p_inc; u32 p_dec; }; /* struct cobalt_vars - contains codel and blue variables * @count: codel dropping frequency * @rec_inv_sqrt: reciprocal value of sqrt(count) >> 1 * @drop_next: time to drop next packet, or when we dropped last * @blue_timer: Blue time to next drop * @p_drop: BLUE drop probability (0.32 fxp) * @dropping: set if in dropping state * @ecn_marked: set if marked */ struct cobalt_vars { u32 count; u32 rec_inv_sqrt; ktime_t drop_next; ktime_t blue_timer; u32 p_drop; bool dropping; bool ecn_marked; }; enum { CAKE_SET_NONE = 0, CAKE_SET_SPARSE, CAKE_SET_SPARSE_WAIT, /* counted in SPARSE, actually in BULK */ CAKE_SET_BULK, CAKE_SET_DECAYING }; struct cake_flow { /* this stuff is all needed per-flow at dequeue time */ struct sk_buff *head; struct sk_buff *tail; struct list_head flowchain; s32 deficit; u32 dropped; struct cobalt_vars cvars; u16 srchost; /* index into cake_host table */ u16 dsthost; u8 set; }; /* please try to keep this structure <= 64 bytes */ struct cake_host { u32 srchost_tag; u32 dsthost_tag; u16 srchost_bulk_flow_count; u16 dsthost_bulk_flow_count; }; struct cake_heap_entry { u16 t:3, b:10; }; struct cake_tin_data { struct cake_flow flows[CAKE_QUEUES]; u32 backlogs[CAKE_QUEUES]; u32 tags[CAKE_QUEUES]; /* for set association */ u16 overflow_idx[CAKE_QUEUES]; struct cake_host hosts[CAKE_QUEUES]; /* for triple isolation */ u16 flow_quantum; struct cobalt_params cparams; u32 drop_overlimit; u16 bulk_flow_count; u16 sparse_flow_count; u16 decaying_flow_count; u16 unresponsive_flow_count; u32 max_skblen; struct list_head new_flows; struct list_head old_flows; struct list_head decaying_flows; /* time_next = time_this + ((len * rate_ns) >> rate_shft) */ ktime_t time_next_packet; u64 tin_rate_ns; u64 tin_rate_bps; u16 tin_rate_shft; u16 tin_quantum; s32 tin_deficit; u32 tin_backlog; u32 tin_dropped; u32 tin_ecn_mark; u32 packets; u64 bytes; u32 ack_drops; /* moving averages */ u64 avge_delay; u64 peak_delay; u64 base_delay; /* hash function stats */ u32 way_directs; u32 way_hits; u32 way_misses; u32 way_collisions; }; /* number of tins is small, so size of this struct doesn't matter much */ struct cake_sched_data { struct tcf_proto __rcu *filter_list; /* optional external classifier */ struct tcf_block *block; struct cake_tin_data *tins; struct cake_heap_entry overflow_heap[CAKE_QUEUES * CAKE_MAX_TINS]; u16 overflow_timeout; u16 tin_cnt; u8 tin_mode; u8 flow_mode; u8 ack_filter; u8 atm_mode; u32 fwmark_mask; u16 fwmark_shft; /* time_next = time_this + ((len * rate_ns) >> rate_shft) */ u16 rate_shft; ktime_t time_next_packet; ktime_t failsafe_next_packet; u64 rate_ns; u64 rate_bps; u16 rate_flags; s16 rate_overhead; u16 rate_mpu; u64 interval; u64 target; /* resource tracking */ u32 buffer_used; u32 buffer_max_used; u32 buffer_limit; u32 buffer_config_limit; /* indices for dequeue */ u16 cur_tin; u16 cur_flow; struct qdisc_watchdog watchdog; const u8 *tin_index; const u8 *tin_order; /* bandwidth capacity estimate */ ktime_t last_packet_time; ktime_t avg_window_begin; u64 avg_packet_interval; u64 avg_window_bytes; u64 avg_peak_bandwidth; ktime_t last_reconfig_time; /* packet length stats */ u32 avg_netoff; u16 max_netlen; u16 max_adjlen; u16 min_netlen; u16 min_adjlen; }; enum { CAKE_FLAG_OVERHEAD = BIT(0), CAKE_FLAG_AUTORATE_INGRESS = BIT(1), CAKE_FLAG_INGRESS = BIT(2), CAKE_FLAG_WASH = BIT(3), CAKE_FLAG_SPLIT_GSO = BIT(4) }; /* COBALT operates the Codel and BLUE algorithms in parallel, in order to * obtain the best features of each. Codel is excellent on flows which * respond to congestion signals in a TCP-like way. BLUE is more effective on * unresponsive flows. */ struct cobalt_skb_cb { ktime_t enqueue_time; u32 adjusted_len; }; static u64 us_to_ns(u64 us) { return us * NSEC_PER_USEC; } static struct cobalt_skb_cb *get_cobalt_cb(const struct sk_buff *skb) { qdisc_cb_private_validate(skb, sizeof(struct cobalt_skb_cb)); return (struct cobalt_skb_cb *)qdisc_skb_cb(skb)->data; } static ktime_t cobalt_get_enqueue_time(const struct sk_buff *skb) { return get_cobalt_cb(skb)->enqueue_time; } static void cobalt_set_enqueue_time(struct sk_buff *skb, ktime_t now) { get_cobalt_cb(skb)->enqueue_time = now; } static u16 quantum_div[CAKE_QUEUES + 1] = {0}; /* Diffserv lookup tables */ static const u8 precedence[] = { 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7, }; static const u8 diffserv8[] = { 2, 0, 1, 2, 4, 2, 2, 2, 1, 2, 1, 2, 1, 2, 1, 2, 5, 2, 4, 2, 4, 2, 4, 2, 3, 2, 3, 2, 3, 2, 3, 2, 6, 2, 3, 2, 3, 2, 3, 2, 6, 2, 2, 2, 6, 2, 6, 2, 7, 2, 2, 2, 2, 2, 2, 2, 7, 2, 2, 2, 2, 2, 2, 2, }; static const u8 diffserv4[] = { 0, 1, 0, 0, 2, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 3, 0, 2, 0, 2, 0, 2, 0, 3, 0, 0, 0, 3, 0, 3, 0, 3, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, }; static const u8 diffserv3[] = { 0, 1, 0, 0, 2, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 2, 0, 2, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, }; static const u8 besteffort[] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }; /* tin priority order for stats dumping */ static const u8 normal_order[] = {0, 1, 2, 3, 4, 5, 6, 7}; static const u8 bulk_order[] = {1, 0, 2, 3}; /* There is a big difference in timing between the accurate values placed in the * cache and the approximations given by a single Newton step for small count * values, particularly when stepping from count 1 to 2 or vice versa. Hence, * these values are calculated using eight Newton steps, using the * implementation below. Above 16, a single Newton step gives sufficient * accuracy in either direction, given the precision stored. * * The magnitude of the error when stepping up to count 2 is such as to give the * value that *should* have been produced at count 4. */ #define REC_INV_SQRT_CACHE (16) static const u32 inv_sqrt_cache[REC_INV_SQRT_CACHE] = { ~0, ~0, 3037000500, 2479700525, 2147483647, 1920767767, 1753413056, 1623345051, 1518500250, 1431655765, 1358187914, 1294981364, 1239850263, 1191209601, 1147878294, 1108955788 }; /* http://en.wikipedia.org/wiki/Methods_of_computing_square_roots * new_invsqrt = (invsqrt / 2) * (3 - count * invsqrt^2) * * Here, invsqrt is a fixed point number (< 1.0), 32bit mantissa, aka Q0.32 */ static void cobalt_newton_step(struct cobalt_vars *vars) { u32 invsqrt, invsqrt2; u64 val; invsqrt = vars->rec_inv_sqrt; invsqrt2 = ((u64)invsqrt * invsqrt) >> 32; val = (3LL << 32) - ((u64)vars->count * invsqrt2); val >>= 2; /* avoid overflow in following multiply */ val = (val * invsqrt) >> (32 - 2 + 1); vars->rec_inv_sqrt = val; } static void cobalt_invsqrt(struct cobalt_vars *vars) { if (vars->count < REC_INV_SQRT_CACHE) vars->rec_inv_sqrt = inv_sqrt_cache[vars->count]; else cobalt_newton_step(vars); } static void cobalt_vars_init(struct cobalt_vars *vars) { memset(vars, 0, sizeof(*vars)); } /* CoDel control_law is t + interval/sqrt(count) * We maintain in rec_inv_sqrt the reciprocal value of sqrt(count) to avoid * both sqrt() and divide operation. */ static ktime_t cobalt_control(ktime_t t, u64 interval, u32 rec_inv_sqrt) { return ktime_add_ns(t, reciprocal_scale(interval, rec_inv_sqrt)); } /* Call this when a packet had to be dropped due to queue overflow. Returns * true if the BLUE state was quiescent before but active after this call. */ static bool cobalt_queue_full(struct cobalt_vars *vars, struct cobalt_params *p, ktime_t now) { bool up = false; if (ktime_to_ns(ktime_sub(now, vars->blue_timer)) > p->target) { up = !vars->p_drop; vars->p_drop += p->p_inc; if (vars->p_drop < p->p_inc) vars->p_drop = ~0; vars->blue_timer = now; } vars->dropping = true; vars->drop_next = now; if (!vars->count) vars->count = 1; return up; } /* Call this when the queue was serviced but turned out to be empty. Returns * true if the BLUE state was active before but quiescent after this call. */ static bool cobalt_queue_empty(struct cobalt_vars *vars, struct cobalt_params *p, ktime_t now) { bool down = false; if (vars->p_drop && ktime_to_ns(ktime_sub(now, vars->blue_timer)) > p->target) { if (vars->p_drop < p->p_dec) vars->p_drop = 0; else vars->p_drop -= p->p_dec; vars->blue_timer = now; down = !vars->p_drop; } vars->dropping = false; if (vars->count && ktime_to_ns(ktime_sub(now, vars->drop_next)) >= 0) { vars->count--; cobalt_invsqrt(vars); vars->drop_next = cobalt_control(vars->drop_next, p->interval, vars->rec_inv_sqrt); } return down; } /* Call this with a freshly dequeued packet for possible congestion marking. * Returns true as an instruction to drop the packet, false for delivery. */ static enum skb_drop_reason cobalt_should_drop(struct cobalt_vars *vars, struct cobalt_params *p, ktime_t now, struct sk_buff *skb, u32 bulk_flows) { enum skb_drop_reason reason = SKB_NOT_DROPPED_YET; bool next_due, over_target; ktime_t schedule; u64 sojourn; /* The 'schedule' variable records, in its sign, whether 'now' is before or * after 'drop_next'. This allows 'drop_next' to be updated before the next * scheduling decision is actually branched, without destroying that * information. Similarly, the first 'schedule' value calculated is preserved * in the boolean 'next_due'. * * As for 'drop_next', we take advantage of the fact that 'interval' is both * the delay between first exceeding 'target' and the first signalling event, * *and* the scaling factor for the signalling frequency. It's therefore very * natural to use a single mechanism for both purposes, and eliminates a * significant amount of reference Codel's spaghetti code. To help with this, * both the '0' and '1' entries in the invsqrt cache are 0xFFFFFFFF, as close * as possible to 1.0 in fixed-point. */ sojourn = ktime_to_ns(ktime_sub(now, cobalt_get_enqueue_time(skb))); schedule = ktime_sub(now, vars->drop_next); over_target = sojourn > p->target && sojourn > p->mtu_time * bulk_flows * 2 && sojourn > p->mtu_time * 4; next_due = vars->count && ktime_to_ns(schedule) >= 0; vars->ecn_marked = false; if (over_target) { if (!vars->dropping) { vars->dropping = true; vars->drop_next = cobalt_control(now, p->interval, vars->rec_inv_sqrt); } if (!vars->count) vars->count = 1; } else if (vars->dropping) { vars->dropping = false; } if (next_due && vars->dropping) { /* Use ECN mark if possible, otherwise drop */ if (!(vars->ecn_marked = INET_ECN_set_ce(skb))) reason = SKB_DROP_REASON_QDISC_CONGESTED; vars->count++; if (!vars->count) vars->count--; cobalt_invsqrt(vars); vars->drop_next = cobalt_control(vars->drop_next, p->interval, vars->rec_inv_sqrt); schedule = ktime_sub(now, vars->drop_next); } else { while (next_due) { vars->count--; cobalt_invsqrt(vars); vars->drop_next = cobalt_control(vars->drop_next, p->interval, vars->rec_inv_sqrt); schedule = ktime_sub(now, vars->drop_next); next_due = vars->count && ktime_to_ns(schedule) >= 0; } } /* Simple BLUE implementation. Lack of ECN is deliberate. */ if (vars->p_drop && reason == SKB_NOT_DROPPED_YET && get_random_u32() < vars->p_drop) reason = SKB_DROP_REASON_CAKE_FLOOD; /* Overload the drop_next field as an activity timeout */ if (!vars->count) vars->drop_next = ktime_add_ns(now, p->interval); else if (ktime_to_ns(schedule) > 0 && reason == SKB_NOT_DROPPED_YET) vars->drop_next = now; return reason; } static bool cake_update_flowkeys(struct flow_keys *keys, const struct sk_buff *skb) { #if IS_ENABLED(CONFIG_NF_CONNTRACK) struct nf_conntrack_tuple tuple = {}; bool rev = !skb->_nfct, upd = false; __be32 ip; if (skb_protocol(skb, true) != htons(ETH_P_IP)) return false; if (!nf_ct_get_tuple_skb(&tuple, skb)) return false; ip = rev ? tuple.dst.u3.ip : tuple.src.u3.ip; if (ip != keys->addrs.v4addrs.src) { keys->addrs.v4addrs.src = ip; upd = true; } ip = rev ? tuple.src.u3.ip : tuple.dst.u3.ip; if (ip != keys->addrs.v4addrs.dst) { keys->addrs.v4addrs.dst = ip; upd = true; } if (keys->ports.ports) { __be16 port; port = rev ? tuple.dst.u.all : tuple.src.u.all; if (port != keys->ports.src) { keys->ports.src = port; upd = true; } port = rev ? tuple.src.u.all : tuple.dst.u.all; if (port != keys->ports.dst) { port = keys->ports.dst; upd = true; } } return upd; #else return false; #endif } /* Cake has several subtle multiple bit settings. In these cases you * would be matching triple isolate mode as well. */ static bool cake_dsrc(int flow_mode) { return (flow_mode & CAKE_FLOW_DUAL_SRC) == CAKE_FLOW_DUAL_SRC; } static bool cake_ddst(int flow_mode) { return (flow_mode & CAKE_FLOW_DUAL_DST) == CAKE_FLOW_DUAL_DST; } static void cake_dec_srchost_bulk_flow_count(struct cake_tin_data *q, struct cake_flow *flow, int flow_mode) { if (likely(cake_dsrc(flow_mode) && q->hosts[flow->srchost].srchost_bulk_flow_count)) q->hosts[flow->srchost].srchost_bulk_flow_count--; } static void cake_inc_srchost_bulk_flow_count(struct cake_tin_data *q, struct cake_flow *flow, int flow_mode) { if (likely(cake_dsrc(flow_mode) && q->hosts[flow->srchost].srchost_bulk_flow_count < CAKE_QUEUES)) q->hosts[flow->srchost].srchost_bulk_flow_count++; } static void cake_dec_dsthost_bulk_flow_count(struct cake_tin_data *q, struct cake_flow *flow, int flow_mode) { if (likely(cake_ddst(flow_mode) && q->hosts[flow->dsthost].dsthost_bulk_flow_count)) q->hosts[flow->dsthost].dsthost_bulk_flow_count--; } static void cake_inc_dsthost_bulk_flow_count(struct cake_tin_data *q, struct cake_flow *flow, int flow_mode) { if (likely(cake_ddst(flow_mode) && q->hosts[flow->dsthost].dsthost_bulk_flow_count < CAKE_QUEUES)) q->hosts[flow->dsthost].dsthost_bulk_flow_count++; } static u16 cake_get_flow_quantum(struct cake_tin_data *q, struct cake_flow *flow, int flow_mode) { u16 host_load = 1; if (cake_dsrc(flow_mode)) host_load = max(host_load, q->hosts[flow->srchost].srchost_bulk_flow_count); if (cake_ddst(flow_mode)) host_load = max(host_load, q->hosts[flow->dsthost].dsthost_bulk_flow_count); /* The get_random_u16() is a way to apply dithering to avoid * accumulating roundoff errors */ return (q->flow_quantum * quantum_div[host_load] + get_random_u16()) >> 16; } static u32 cake_hash(struct cake_tin_data *q, const struct sk_buff *skb, int flow_mode, u16 flow_override, u16 host_override) { bool hash_flows = (!flow_override && !!(flow_mode & CAKE_FLOW_FLOWS)); bool hash_hosts = (!host_override && !!(flow_mode & CAKE_FLOW_HOSTS)); bool nat_enabled = !!(flow_mode & CAKE_FLOW_NAT_FLAG); u32 flow_hash = 0, srchost_hash = 0, dsthost_hash = 0; u16 reduced_hash, srchost_idx, dsthost_idx; struct flow_keys keys, host_keys; bool use_skbhash = skb->l4_hash; if (unlikely(flow_mode == CAKE_FLOW_NONE)) return 0; /* If both overrides are set, or we can use the SKB hash and nat mode is * disabled, we can skip packet dissection entirely. If nat mode is * enabled there's another check below after doing the conntrack lookup. */ if ((!hash_flows || (use_skbhash && !nat_enabled)) && !hash_hosts) goto skip_hash; skb_flow_dissect_flow_keys(skb, &keys, FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL); /* Don't use the SKB hash if we change the lookup keys from conntrack */ if (nat_enabled && cake_update_flowkeys(&keys, skb)) use_skbhash = false; /* If we can still use the SKB hash and don't need the host hash, we can * skip the rest of the hashing procedure */ if (use_skbhash && !hash_hosts) goto skip_hash; /* flow_hash_from_keys() sorts the addresses by value, so we have * to preserve their order in a separate data structure to treat * src and dst host addresses as independently selectable. */ host_keys = keys; host_keys.ports.ports = 0; host_keys.basic.ip_proto = 0; host_keys.keyid.keyid = 0; host_keys.tags.flow_label = 0; switch (host_keys.control.addr_type) { case FLOW_DISSECTOR_KEY_IPV4_ADDRS: host_keys.addrs.v4addrs.src = 0; dsthost_hash = flow_hash_from_keys(&host_keys); host_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src; host_keys.addrs.v4addrs.dst = 0; srchost_hash = flow_hash_from_keys(&host_keys); break; case FLOW_DISSECTOR_KEY_IPV6_ADDRS: memset(&host_keys.addrs.v6addrs.src, 0, sizeof(host_keys.addrs.v6addrs.src)); dsthost_hash = flow_hash_from_keys(&host_keys); host_keys.addrs.v6addrs.src = keys.addrs.v6addrs.src; memset(&host_keys.addrs.v6addrs.dst, 0, sizeof(host_keys.addrs.v6addrs.dst)); srchost_hash = flow_hash_from_keys(&host_keys); break; default: dsthost_hash = 0; srchost_hash = 0; } /* This *must* be after the above switch, since as a * side-effect it sorts the src and dst addresses. */ if (hash_flows && !use_skbhash) flow_hash = flow_hash_from_keys(&keys); skip_hash: if (flow_override) flow_hash = flow_override - 1; else if (use_skbhash && (flow_mode & CAKE_FLOW_FLOWS)) flow_hash = skb->hash; if (host_override) { dsthost_hash = host_override - 1; srchost_hash = host_override - 1; } if (!(flow_mode & CAKE_FLOW_FLOWS)) { if (flow_mode & CAKE_FLOW_SRC_IP) flow_hash ^= srchost_hash; if (flow_mode & CAKE_FLOW_DST_IP) flow_hash ^= dsthost_hash; } reduced_hash = flow_hash % CAKE_QUEUES; /* set-associative hashing */ /* fast path if no hash collision (direct lookup succeeds) */ if (likely(q->tags[reduced_hash] == flow_hash && q->flows[reduced_hash].set)) { q->way_directs++; } else { u32 inner_hash = reduced_hash % CAKE_SET_WAYS; u32 outer_hash = reduced_hash - inner_hash; bool allocate_src = false; bool allocate_dst = false; u32 i, k; /* check if any active queue in the set is reserved for * this flow. */ for (i = 0, k = inner_hash; i < CAKE_SET_WAYS; i++, k = (k + 1) % CAKE_SET_WAYS) { if (q->tags[outer_hash + k] == flow_hash) { if (i) q->way_hits++; if (!q->flows[outer_hash + k].set) { /* need to increment host refcnts */ allocate_src = cake_dsrc(flow_mode); allocate_dst = cake_ddst(flow_mode); } goto found; } } /* no queue is reserved for this flow, look for an * empty one. */ for (i = 0; i < CAKE_SET_WAYS; i++, k = (k + 1) % CAKE_SET_WAYS) { if (!q->flows[outer_hash + k].set) { q->way_misses++; allocate_src = cake_dsrc(flow_mode); allocate_dst = cake_ddst(flow_mode); goto found; } } /* With no empty queues, default to the original * queue, accept the collision, update the host tags. */ q->way_collisions++; allocate_src = cake_dsrc(flow_mode); allocate_dst = cake_ddst(flow_mode); if (q->flows[outer_hash + k].set == CAKE_SET_BULK) { cake_dec_srchost_bulk_flow_count(q, &q->flows[outer_hash + k], flow_mode); cake_dec_dsthost_bulk_flow_count(q, &q->flows[outer_hash + k], flow_mode); } found: /* reserve queue for future packets in same flow */ reduced_hash = outer_hash + k; q->tags[reduced_hash] = flow_hash; if (allocate_src) { srchost_idx = srchost_hash % CAKE_QUEUES; inner_hash = srchost_idx % CAKE_SET_WAYS; outer_hash = srchost_idx - inner_hash; for (i = 0, k = inner_hash; i < CAKE_SET_WAYS; i++, k = (k + 1) % CAKE_SET_WAYS) { if (q->hosts[outer_hash + k].srchost_tag == srchost_hash) goto found_src; } for (i = 0; i < CAKE_SET_WAYS; i++, k = (k + 1) % CAKE_SET_WAYS) { if (!q->hosts[outer_hash + k].srchost_bulk_flow_count) break; } q->hosts[outer_hash + k].srchost_tag = srchost_hash; found_src: srchost_idx = outer_hash + k; q->flows[reduced_hash].srchost = srchost_idx; if (q->flows[reduced_hash].set == CAKE_SET_BULK) cake_inc_srchost_bulk_flow_count(q, &q->flows[reduced_hash], flow_mode); } if (allocate_dst) { dsthost_idx = dsthost_hash % CAKE_QUEUES; inner_hash = dsthost_idx % CAKE_SET_WAYS; outer_hash = dsthost_idx - inner_hash; for (i = 0, k = inner_hash; i < CAKE_SET_WAYS; i++, k = (k + 1) % CAKE_SET_WAYS) { if (q->hosts[outer_hash + k].dsthost_tag == dsthost_hash) goto found_dst; } for (i = 0; i < CAKE_SET_WAYS; i++, k = (k + 1) % CAKE_SET_WAYS) { if (!q->hosts[outer_hash + k].dsthost_bulk_flow_count) break; } q->hosts[outer_hash + k].dsthost_tag = dsthost_hash; found_dst: dsthost_idx = outer_hash + k; q->flows[reduced_hash].dsthost = dsthost_idx; if (q->flows[reduced_hash].set == CAKE_SET_BULK) cake_inc_dsthost_bulk_flow_count(q, &q->flows[reduced_hash], flow_mode); } } return reduced_hash; } /* helper functions : might be changed when/if skb use a standard list_head */ /* remove one skb from head of slot queue */ static struct sk_buff *dequeue_head(struct cake_flow *flow) { struct sk_buff *skb = flow->head; if (skb) { flow->head = skb->next; skb_mark_not_on_list(skb); } return skb; } /* add skb to flow queue (tail add) */ static void flow_queue_add(struct cake_flow *flow, struct sk_buff *skb) { if (!flow->head) flow->head = skb; else flow->tail->next = skb; flow->tail = skb; skb->next = NULL; } static struct iphdr *cake_get_iphdr(const struct sk_buff *skb, struct ipv6hdr *buf) { unsigned int offset = skb_network_offset(skb); struct iphdr *iph; iph = skb_header_pointer(skb, offset, sizeof(struct iphdr), buf); if (!iph) return NULL; if (iph->version == 4 && iph->protocol == IPPROTO_IPV6) return skb_header_pointer(skb, offset + iph->ihl * 4, sizeof(struct ipv6hdr), buf); else if (iph->version == 4) return iph; else if (iph->version == 6) return skb_header_pointer(skb, offset, sizeof(struct ipv6hdr), buf); return NULL; } static struct tcphdr *cake_get_tcphdr(const struct sk_buff *skb, void *buf, unsigned int bufsize) { unsigned int offset = skb_network_offset(skb); const struct ipv6hdr *ipv6h; const struct tcphdr *tcph; const struct iphdr *iph; struct ipv6hdr _ipv6h; struct tcphdr _tcph; ipv6h = skb_header_pointer(skb, offset, sizeof(_ipv6h), &_ipv6h); if (!ipv6h) return NULL; if (ipv6h->version == 4) { iph = (struct iphdr *)ipv6h; offset += iph->ihl * 4; /* special-case 6in4 tunnelling, as that is a common way to get * v6 connectivity in the home */ if (iph->protocol == IPPROTO_IPV6) { ipv6h = skb_header_pointer(skb, offset, sizeof(_ipv6h), &_ipv6h); if (!ipv6h || ipv6h->nexthdr != IPPROTO_TCP) return NULL; offset += sizeof(struct ipv6hdr); } else if (iph->protocol != IPPROTO_TCP) { return NULL; } } else if (ipv6h->version == 6) { if (ipv6h->nexthdr != IPPROTO_TCP) return NULL; offset += sizeof(struct ipv6hdr); } else { return NULL; } tcph = skb_header_pointer(skb, offset, sizeof(_tcph), &_tcph); if (!tcph || tcph->doff < 5) return NULL; return skb_header_pointer(skb, offset, min(__tcp_hdrlen(tcph), bufsize), buf); } static const void *cake_get_tcpopt(const struct tcphdr *tcph, int code, int *oplen) { /* inspired by tcp_parse_options in tcp_input.c */ int length = __tcp_hdrlen(tcph) - sizeof(struct tcphdr); const u8 *ptr = (const u8 *)(tcph + 1); while (length > 0) { int opcode = *ptr++; int opsize; if (opcode == TCPOPT_EOL) break; if (opcode == TCPOPT_NOP) { length--; continue; } if (length < 2) break; opsize = *ptr++; if (opsize < 2 || opsize > length) break; if (opcode == code) { *oplen = opsize; return ptr; } ptr += opsize - 2; length -= opsize; } return NULL; } /* Compare two SACK sequences. A sequence is considered greater if it SACKs more * bytes than the other. In the case where both sequences ACKs bytes that the * other doesn't, A is considered greater. DSACKs in A also makes A be * considered greater. * * @return -1, 0 or 1 as normal compare functions */ static int cake_tcph_sack_compare(const struct tcphdr *tcph_a, const struct tcphdr *tcph_b) { const struct tcp_sack_block_wire *sack_a, *sack_b; u32 ack_seq_a = ntohl(tcph_a->ack_seq); u32 bytes_a = 0, bytes_b = 0; int oplen_a, oplen_b; bool first = true; sack_a = cake_get_tcpopt(tcph_a, TCPOPT_SACK, &oplen_a); sack_b = cake_get_tcpopt(tcph_b, TCPOPT_SACK, &oplen_b); /* pointers point to option contents */ oplen_a -= TCPOLEN_SACK_BASE; oplen_b -= TCPOLEN_SACK_BASE; if (sack_a && oplen_a >= sizeof(*sack_a) && (!sack_b || oplen_b < sizeof(*sack_b))) return -1; else if (sack_b && oplen_b >= sizeof(*sack_b) && (!sack_a || oplen_a < sizeof(*sack_a))) return 1; else if ((!sack_a || oplen_a < sizeof(*sack_a)) && (!sack_b || oplen_b < sizeof(*sack_b))) return 0; while (oplen_a >= sizeof(*sack_a)) { const struct tcp_sack_block_wire *sack_tmp = sack_b; u32 start_a = get_unaligned_be32(&sack_a->start_seq); u32 end_a = get_unaligned_be32(&sack_a->end_seq); int oplen_tmp = oplen_b; bool found = false; /* DSACK; always considered greater to prevent dropping */ if (before(start_a, ack_seq_a)) return -1; bytes_a += end_a - start_a; while (oplen_tmp >= sizeof(*sack_tmp)) { u32 start_b = get_unaligned_be32(&sack_tmp->start_seq); u32 end_b = get_unaligned_be32(&sack_tmp->end_seq); /* first time through we count the total size */ if (first) bytes_b += end_b - start_b; if (!after(start_b, start_a) && !before(end_b, end_a)) { found = true; if (!first) break; } oplen_tmp -= sizeof(*sack_tmp); sack_tmp++; } if (!found) return -1; oplen_a -= sizeof(*sack_a); sack_a++; first = false; } /* If we made it this far, all ranges SACKed by A are covered by B, so * either the SACKs are equal, or B SACKs more bytes. */ return bytes_b > bytes_a ? 1 : 0; } static void cake_tcph_get_tstamp(const struct tcphdr *tcph, u32 *tsval, u32 *tsecr) { const u8 *ptr; int opsize; ptr = cake_get_tcpopt(tcph, TCPOPT_TIMESTAMP, &opsize); if (ptr && opsize == TCPOLEN_TIMESTAMP) { *tsval = get_unaligned_be32(ptr); *tsecr = get_unaligned_be32(ptr + 4); } } static bool cake_tcph_may_drop(const struct tcphdr *tcph, u32 tstamp_new, u32 tsecr_new) { /* inspired by tcp_parse_options in tcp_input.c */ int length = __tcp_hdrlen(tcph) - sizeof(struct tcphdr); const u8 *ptr = (const u8 *)(tcph + 1); u32 tstamp, tsecr; /* 3 reserved flags must be unset to avoid future breakage * ACK must be set * ECE/CWR are handled separately * All other flags URG/PSH/RST/SYN/FIN must be unset * 0x0FFF0000 = all TCP flags (confirm ACK=1, others zero) * 0x00C00000 = CWR/ECE (handled separately) * 0x0F3F0000 = 0x0FFF0000 & ~0x00C00000 */ if (((tcp_flag_word(tcph) & cpu_to_be32(0x0F3F0000)) != TCP_FLAG_ACK)) return false; while (length > 0) { int opcode = *ptr++; int opsize; if (opcode == TCPOPT_EOL) break; if (opcode == TCPOPT_NOP) { length--; continue; } if (length < 2) break; opsize = *ptr++; if (opsize < 2 || opsize > length) break; switch (opcode) { case TCPOPT_MD5SIG: /* doesn't influence state */ break; case TCPOPT_SACK: /* stricter checking performed later */ if (opsize % 8 != 2) return false; break; case TCPOPT_TIMESTAMP: /* only drop timestamps lower than new */ if (opsize != TCPOLEN_TIMESTAMP) return false; tstamp = get_unaligned_be32(ptr); tsecr = get_unaligned_be32(ptr + 4); if (after(tstamp, tstamp_new) || after(tsecr, tsecr_new)) return false; break; case TCPOPT_MSS: /* these should only be set on SYN */ case TCPOPT_WINDOW: case TCPOPT_SACK_PERM: case TCPOPT_FASTOPEN: case TCPOPT_EXP: default: /* don't drop if any unknown options are present */ return false; } ptr += opsize - 2; length -= opsize; } return true; } static struct sk_buff *cake_ack_filter(struct cake_sched_data *q, struct cake_flow *flow) { bool aggressive = q->ack_filter == CAKE_ACK_AGGRESSIVE; struct sk_buff *elig_ack = NULL, *elig_ack_prev = NULL; struct sk_buff *skb_check, *skb_prev = NULL; const struct ipv6hdr *ipv6h, *ipv6h_check; unsigned char _tcph[64], _tcph_check[64]; const struct tcphdr *tcph, *tcph_check; const struct iphdr *iph, *iph_check; struct ipv6hdr _iph, _iph_check; const struct sk_buff *skb; int seglen, num_found = 0; u32 tstamp = 0, tsecr = 0; __be32 elig_flags = 0; int sack_comp; /* no other possible ACKs to filter */ if (flow->head == flow->tail) return NULL; skb = flow->tail; tcph = cake_get_tcphdr(skb, _tcph, sizeof(_tcph)); iph = cake_get_iphdr(skb, &_iph); if (!tcph) return NULL; cake_tcph_get_tstamp(tcph, &tstamp, &tsecr); /* the 'triggering' packet need only have the ACK flag set. * also check that SYN is not set, as there won't be any previous ACKs. */ if ((tcp_flag_word(tcph) & (TCP_FLAG_ACK | TCP_FLAG_SYN)) != TCP_FLAG_ACK) return NULL; /* the 'triggering' ACK is at the tail of the queue, we have already * returned if it is the only packet in the flow. loop through the rest * of the queue looking for pure ACKs with the same 5-tuple as the * triggering one. */ for (skb_check = flow->head; skb_check && skb_check != skb; skb_prev = skb_check, skb_check = skb_check->next) { iph_check = cake_get_iphdr(skb_check, &_iph_check); tcph_check = cake_get_tcphdr(skb_check, &_tcph_check, sizeof(_tcph_check)); /* only TCP packets with matching 5-tuple are eligible, and only * drop safe headers */ if (!tcph_check || iph->version != iph_check->version || tcph_check->source != tcph->source || tcph_check->dest != tcph->dest) continue; if (iph_check->version == 4) { if (iph_check->saddr != iph->saddr || iph_check->daddr != iph->daddr) continue; seglen = iph_totlen(skb, iph_check) - (4 * iph_check->ihl); } else if (iph_check->version == 6) { ipv6h = (struct ipv6hdr *)iph; ipv6h_check = (struct ipv6hdr *)iph_check; if (ipv6_addr_cmp(&ipv6h_check->saddr, &ipv6h->saddr) || ipv6_addr_cmp(&ipv6h_check->daddr, &ipv6h->daddr)) continue; seglen = ntohs(ipv6h_check->payload_len); } else { WARN_ON(1); /* shouldn't happen */ continue; } /* If the ECE/CWR flags changed from the previous eligible * packet in the same flow, we should no longer be dropping that * previous packet as this would lose information. */ if (elig_ack && (tcp_flag_word(tcph_check) & (TCP_FLAG_ECE | TCP_FLAG_CWR)) != elig_flags) { elig_ack = NULL; elig_ack_prev = NULL; num_found--; } /* Check TCP options and flags, don't drop ACKs with segment * data, and don't drop ACKs with a higher cumulative ACK * counter than the triggering packet. Check ACK seqno here to * avoid parsing SACK options of packets we are going to exclude * anyway. */ if (!cake_tcph_may_drop(tcph_check, tstamp, tsecr) || (seglen - __tcp_hdrlen(tcph_check)) != 0 || after(ntohl(tcph_check->ack_seq), ntohl(tcph->ack_seq))) continue; /* Check SACK options. The triggering packet must SACK more data * than the ACK under consideration, or SACK the same range but * have a larger cumulative ACK counter. The latter is a * pathological case, but is contained in the following check * anyway, just to be safe. */ sack_comp = cake_tcph_sack_compare(tcph_check, tcph); if (sack_comp < 0 || (ntohl(tcph_check->ack_seq) == ntohl(tcph->ack_seq) && sack_comp == 0)) continue; /* At this point we have found an eligible pure ACK to drop; if * we are in aggressive mode, we are done. Otherwise, keep * searching unless this is the second eligible ACK we * found. * * Since we want to drop ACK closest to the head of the queue, * save the first eligible ACK we find, even if we need to loop * again. */ if (!elig_ack) { elig_ack = skb_check; elig_ack_prev = skb_prev; elig_flags = (tcp_flag_word(tcph_check) & (TCP_FLAG_ECE | TCP_FLAG_CWR)); } if (num_found++ > 0) goto found; } /* We made it through the queue without finding two eligible ACKs . If * we found a single eligible ACK we can drop it in aggressive mode if * we can guarantee that this does not interfere with ECN flag * information. We ensure this by dropping it only if the enqueued * packet is consecutive with the eligible ACK, and their flags match. */ if (elig_ack && aggressive && elig_ack->next == skb && (elig_flags == (tcp_flag_word(tcph) & (TCP_FLAG_ECE | TCP_FLAG_CWR)))) goto found; return NULL; found: if (elig_ack_prev) elig_ack_prev->next = elig_ack->next; else flow->head = elig_ack->next; skb_mark_not_on_list(elig_ack); return elig_ack; } static u64 cake_ewma(u64 avg, u64 sample, u32 shift) { avg -= avg >> shift; avg += sample >> shift; return avg; } static u32 cake_calc_overhead(struct cake_sched_data *q, u32 len, u32 off) { if (q->rate_flags & CAKE_FLAG_OVERHEAD) len -= off; if (q->max_netlen < len) q->max_netlen = len; if (q->min_netlen > len) q->min_netlen = len; len += q->rate_overhead; if (len < q->rate_mpu) len = q->rate_mpu; if (q->atm_mode == CAKE_ATM_ATM) { len += 47; len /= 48; len *= 53; } else if (q->atm_mode == CAKE_ATM_PTM) { /* Add one byte per 64 bytes or part thereof. * This is conservative and easier to calculate than the * precise value. */ len += (len + 63) / 64; } if (q->max_adjlen < len) q->max_adjlen = len; if (q->min_adjlen > len) q->min_adjlen = len; return len; } static u32 cake_overhead(struct cake_sched_data *q, const struct sk_buff *skb) { const struct skb_shared_info *shinfo = skb_shinfo(skb); unsigned int hdr_len, last_len = 0; u32 off = skb_network_offset(skb); u32 len = qdisc_pkt_len(skb); u16 segs = 1; q->avg_netoff = cake_ewma(q->avg_netoff, off << 16, 8); if (!shinfo->gso_size) return cake_calc_overhead(q, len, off); /* borrowed from qdisc_pkt_len_init() */ if (!skb->encapsulation) hdr_len = skb_transport_offset(skb); else hdr_len = skb_inner_transport_offset(skb); /* + transport layer */ if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))) { const struct tcphdr *th; struct tcphdr _tcphdr; th = skb_header_pointer(skb, hdr_len, sizeof(_tcphdr), &_tcphdr); if (likely(th)) hdr_len += __tcp_hdrlen(th); } else { struct udphdr _udphdr; if (skb_header_pointer(skb, hdr_len, sizeof(_udphdr), &_udphdr)) hdr_len += sizeof(struct udphdr); } if (unlikely(shinfo->gso_type & SKB_GSO_DODGY)) segs = DIV_ROUND_UP(skb->len - hdr_len, shinfo->gso_size); else segs = shinfo->gso_segs; len = shinfo->gso_size + hdr_len; last_len = skb->len - shinfo->gso_size * (segs - 1); return (cake_calc_overhead(q, len, off) * (segs - 1) + cake_calc_overhead(q, last_len, off)); } static void cake_heap_swap(struct cake_sched_data *q, u16 i, u16 j) { struct cake_heap_entry ii = q->overflow_heap[i]; struct cake_heap_entry jj = q->overflow_heap[j]; q->overflow_heap[i] = jj; q->overflow_heap[j] = ii; q->tins[ii.t].overflow_idx[ii.b] = j; q->tins[jj.t].overflow_idx[jj.b] = i; } static u32 cake_heap_get_backlog(const struct cake_sched_data *q, u16 i) { struct cake_heap_entry ii = q->overflow_heap[i]; return q->tins[ii.t].backlogs[ii.b]; } static void cake_heapify(struct cake_sched_data *q, u16 i) { static const u32 a = CAKE_MAX_TINS * CAKE_QUEUES; u32 mb = cake_heap_get_backlog(q, i); u32 m = i; while (m < a) { u32 l = m + m + 1; u32 r = l + 1; if (l < a) { u32 lb = cake_heap_get_backlog(q, l); if (lb > mb) { m = l; mb = lb; } } if (r < a) { u32 rb = cake_heap_get_backlog(q, r); if (rb > mb) { m = r; mb = rb; } } if (m != i) { cake_heap_swap(q, i, m); i = m; } else { break; } } } static void cake_heapify_up(struct cake_sched_data *q, u16 i) { while (i > 0 && i < CAKE_MAX_TINS * CAKE_QUEUES) { u16 p = (i - 1) >> 1; u32 ib = cake_heap_get_backlog(q, i); u32 pb = cake_heap_get_backlog(q, p); if (ib > pb) { cake_heap_swap(q, i, p); i = p; } else { break; } } } static int cake_advance_shaper(struct cake_sched_data *q, struct cake_tin_data *b, struct sk_buff *skb, ktime_t now, bool drop) { u32 len = get_cobalt_cb(skb)->adjusted_len; /* charge packet bandwidth to this tin * and to the global shaper. */ if (q->rate_ns) { u64 tin_dur = (len * b->tin_rate_ns) >> b->tin_rate_shft; u64 global_dur = (len * q->rate_ns) >> q->rate_shft; u64 failsafe_dur = global_dur + (global_dur >> 1); if (ktime_before(b->time_next_packet, now)) b->time_next_packet = ktime_add_ns(b->time_next_packet, tin_dur); else if (ktime_before(b->time_next_packet, ktime_add_ns(now, tin_dur))) b->time_next_packet = ktime_add_ns(now, tin_dur); q->time_next_packet = ktime_add_ns(q->time_next_packet, global_dur); if (!drop) q->failsafe_next_packet = \ ktime_add_ns(q->failsafe_next_packet, failsafe_dur); } return len; } static unsigned int cake_drop(struct Qdisc *sch, struct sk_buff **to_free) { struct cake_sched_data *q = qdisc_priv(sch); ktime_t now = ktime_get(); u32 idx = 0, tin = 0, len; struct cake_heap_entry qq; struct cake_tin_data *b; struct cake_flow *flow; struct sk_buff *skb; if (!q->overflow_timeout) { int i; /* Build fresh max-heap */ for (i = CAKE_MAX_TINS * CAKE_QUEUES / 2 - 1; i >= 0; i--) cake_heapify(q, i); } q->overflow_timeout = 65535; /* select longest queue for pruning */ qq = q->overflow_heap[0]; tin = qq.t; idx = qq.b; b = &q->tins[tin]; flow = &b->flows[idx]; skb = dequeue_head(flow); if (unlikely(!skb)) { /* heap has gone wrong, rebuild it next time */ q->overflow_timeout = 0; return idx + (tin << 16); } if (cobalt_queue_full(&flow->cvars, &b->cparams, now)) b->unresponsive_flow_count++; len = qdisc_pkt_len(skb); q->buffer_used -= skb->truesize; b->backlogs[idx] -= len; b->tin_backlog -= len; sch->qstats.backlog -= len; flow->dropped++; b->tin_dropped++; if (q->rate_flags & CAKE_FLAG_INGRESS) cake_advance_shaper(q, b, skb, now, true); qdisc_drop_reason(skb, sch, to_free, SKB_DROP_REASON_QDISC_OVERLIMIT); sch->q.qlen--; qdisc_tree_reduce_backlog(sch, 1, len); cake_heapify(q, 0); return idx + (tin << 16); } static u8 cake_handle_diffserv(struct sk_buff *skb, bool wash) { const int offset = skb_network_offset(skb); u16 *buf, buf_; u8 dscp; switch (skb_protocol(skb, true)) { case htons(ETH_P_IP): buf = skb_header_pointer(skb, offset, sizeof(buf_), &buf_); if (unlikely(!buf)) return 0; /* ToS is in the second byte of iphdr */ dscp = ipv4_get_dsfield((struct iphdr *)buf) >> 2; if (wash && dscp) { const int wlen = offset + sizeof(struct iphdr); if (!pskb_may_pull(skb, wlen) || skb_try_make_writable(skb, wlen)) return 0; ipv4_change_dsfield(ip_hdr(skb), INET_ECN_MASK, 0); } return dscp; case htons(ETH_P_IPV6): buf = skb_header_pointer(skb, offset, sizeof(buf_), &buf_); if (unlikely(!buf)) return 0; /* Traffic class is in the first and second bytes of ipv6hdr */ dscp = ipv6_get_dsfield((struct ipv6hdr *)buf) >> 2; if (wash && dscp) { const int wlen = offset + sizeof(struct ipv6hdr); if (!pskb_may_pull(skb, wlen) || skb_try_make_writable(skb, wlen)) return 0; ipv6_change_dsfield(ipv6_hdr(skb), INET_ECN_MASK, 0); } return dscp; case htons(ETH_P_ARP): return 0x38; /* CS7 - Net Control */ default: /* If there is no Diffserv field, treat as best-effort */ return 0; } } static struct cake_tin_data *cake_select_tin(struct Qdisc *sch, struct sk_buff *skb) { struct cake_sched_data *q = qdisc_priv(sch); u32 tin, mark; bool wash; u8 dscp; /* Tin selection: Default to diffserv-based selection, allow overriding * using firewall marks or skb->priority. Call DSCP parsing early if * wash is enabled, otherwise defer to below to skip unneeded parsing. */ mark = (skb->mark & q->fwmark_mask) >> q->fwmark_shft; wash = !!(q->rate_flags & CAKE_FLAG_WASH); if (wash) dscp = cake_handle_diffserv(skb, wash); if (q->tin_mode == CAKE_DIFFSERV_BESTEFFORT) tin = 0; else if (mark && mark <= q->tin_cnt) tin = q->tin_order[mark - 1]; else if (TC_H_MAJ(skb->priority) == sch->handle && TC_H_MIN(skb->priority) > 0 && TC_H_MIN(skb->priority) <= q->tin_cnt) tin = q->tin_order[TC_H_MIN(skb->priority) - 1]; else { if (!wash) dscp = cake_handle_diffserv(skb, wash); tin = q->tin_index[dscp]; if (unlikely(tin >= q->tin_cnt)) tin = 0; } return &q->tins[tin]; } static u32 cake_classify(struct Qdisc *sch, struct cake_tin_data **t, struct sk_buff *skb, int flow_mode, int *qerr) { struct cake_sched_data *q = qdisc_priv(sch); struct tcf_proto *filter; struct tcf_result res; u16 flow = 0, host = 0; int result; filter = rcu_dereference_bh(q->filter_list); if (!filter) goto hash; *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; result = tcf_classify(skb, NULL, filter, &res, false); if (result >= 0) { #ifdef CONFIG_NET_CLS_ACT switch (result) { case TC_ACT_STOLEN: case TC_ACT_QUEUED: case TC_ACT_TRAP: *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN; fallthrough; case TC_ACT_SHOT: return 0; } #endif if (TC_H_MIN(res.classid) <= CAKE_QUEUES) flow = TC_H_MIN(res.classid); if (TC_H_MAJ(res.classid) <= (CAKE_QUEUES << 16)) host = TC_H_MAJ(res.classid) >> 16; } hash: *t = cake_select_tin(sch, skb); return cake_hash(*t, skb, flow_mode, flow, host) + 1; } static void cake_reconfigure(struct Qdisc *sch); static s32 cake_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free) { struct cake_sched_data *q = qdisc_priv(sch); int len = qdisc_pkt_len(skb); int ret; struct sk_buff *ack = NULL; ktime_t now = ktime_get(); struct cake_tin_data *b; struct cake_flow *flow; u32 idx, tin; /* choose flow to insert into */ idx = cake_classify(sch, &b, skb, q->flow_mode, &ret); if (idx == 0) { if (ret & __NET_XMIT_BYPASS) qdisc_qstats_drop(sch); __qdisc_drop(skb, to_free); return ret; } tin = (u32)(b - q->tins); idx--; flow = &b->flows[idx]; /* ensure shaper state isn't stale */ if (!b->tin_backlog) { if (ktime_before(b->time_next_packet, now)) b->time_next_packet = now; if (!sch->q.qlen) { if (ktime_before(q->time_next_packet, now)) { q->failsafe_next_packet = now; q->time_next_packet = now; } else if (ktime_after(q->time_next_packet, now) && ktime_after(q->failsafe_next_packet, now)) { u64 next = \ min(ktime_to_ns(q->time_next_packet), ktime_to_ns( q->failsafe_next_packet)); sch->qstats.overlimits++; qdisc_watchdog_schedule_ns(&q->watchdog, next); } } } if (unlikely(len > b->max_skblen)) b->max_skblen = len; if (skb_is_gso(skb) && q->rate_flags & CAKE_FLAG_SPLIT_GSO) { struct sk_buff *segs, *nskb; netdev_features_t features = netif_skb_features(skb); unsigned int slen = 0, numsegs = 0; segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK); if (IS_ERR_OR_NULL(segs)) return qdisc_drop(skb, sch, to_free); skb_list_walk_safe(segs, segs, nskb) { skb_mark_not_on_list(segs); qdisc_skb_cb(segs)->pkt_len = segs->len; cobalt_set_enqueue_time(segs, now); get_cobalt_cb(segs)->adjusted_len = cake_overhead(q, segs); flow_queue_add(flow, segs); sch->q.qlen++; numsegs++; slen += segs->len; q->buffer_used += segs->truesize; b->packets++; } /* stats */ b->bytes += slen; b->backlogs[idx] += slen; b->tin_backlog += slen; sch->qstats.backlog += slen; q->avg_window_bytes += slen; qdisc_tree_reduce_backlog(sch, 1-numsegs, len-slen); consume_skb(skb); } else { /* not splitting */ cobalt_set_enqueue_time(skb, now); get_cobalt_cb(skb)->adjusted_len = cake_overhead(q, skb); flow_queue_add(flow, skb); if (q->ack_filter) ack = cake_ack_filter(q, flow); if (ack) { b->ack_drops++; sch->qstats.drops++; b->bytes += qdisc_pkt_len(ack); len -= qdisc_pkt_len(ack); q->buffer_used += skb->truesize - ack->truesize; if (q->rate_flags & CAKE_FLAG_INGRESS) cake_advance_shaper(q, b, ack, now, true); qdisc_tree_reduce_backlog(sch, 1, qdisc_pkt_len(ack)); consume_skb(ack); } else { sch->q.qlen++; q->buffer_used += skb->truesize; } /* stats */ b->packets++; b->bytes += len; b->backlogs[idx] += len; b->tin_backlog += len; sch->qstats.backlog += len; q->avg_window_bytes += len; } if (q->overflow_timeout) cake_heapify_up(q, b->overflow_idx[idx]); /* incoming bandwidth capacity estimate */ if (q->rate_flags & CAKE_FLAG_AUTORATE_INGRESS) { u64 packet_interval = \ ktime_to_ns(ktime_sub(now, q->last_packet_time)); if (packet_interval > NSEC_PER_SEC) packet_interval = NSEC_PER_SEC; /* filter out short-term bursts, eg. wifi aggregation */ q->avg_packet_interval = \ cake_ewma(q->avg_packet_interval, packet_interval, (packet_interval > q->avg_packet_interval ? 2 : 8)); q->last_packet_time = now; if (packet_interval > q->avg_packet_interval) { u64 window_interval = \ ktime_to_ns(ktime_sub(now, q->avg_window_begin)); u64 b = q->avg_window_bytes * (u64)NSEC_PER_SEC; b = div64_u64(b, window_interval); q->avg_peak_bandwidth = cake_ewma(q->avg_peak_bandwidth, b, b > q->avg_peak_bandwidth ? 2 : 8); q->avg_window_bytes = 0; q->avg_window_begin = now; if (ktime_after(now, ktime_add_ms(q->last_reconfig_time, 250))) { q->rate_bps = (q->avg_peak_bandwidth * 15) >> 4; cake_reconfigure(sch); } } } else { q->avg_window_bytes = 0; q->last_packet_time = now; } /* flowchain */ if (!flow->set || flow->set == CAKE_SET_DECAYING) { if (!flow->set) { list_add_tail(&flow->flowchain, &b->new_flows); } else { b->decaying_flow_count--; list_move_tail(&flow->flowchain, &b->new_flows); } flow->set = CAKE_SET_SPARSE; b->sparse_flow_count++; flow->deficit = cake_get_flow_quantum(b, flow, q->flow_mode); } else if (flow->set == CAKE_SET_SPARSE_WAIT) { /* this flow was empty, accounted as a sparse flow, but actually * in the bulk rotation. */ flow->set = CAKE_SET_BULK; b->sparse_flow_count--; b->bulk_flow_count++; cake_inc_srchost_bulk_flow_count(b, flow, q->flow_mode); cake_inc_dsthost_bulk_flow_count(b, flow, q->flow_mode); } if (q->buffer_used > q->buffer_max_used) q->buffer_max_used = q->buffer_used; if (q->buffer_used > q->buffer_limit) { bool same_flow = false; u32 dropped = 0; u32 drop_id; while (q->buffer_used > q->buffer_limit) { dropped++; drop_id = cake_drop(sch, to_free); if ((drop_id >> 16) == tin && (drop_id & 0xFFFF) == idx) same_flow = true; } b->drop_overlimit += dropped; if (same_flow) return NET_XMIT_CN; } return NET_XMIT_SUCCESS; } static struct sk_buff *cake_dequeue_one(struct Qdisc *sch) { struct cake_sched_data *q = qdisc_priv(sch); struct cake_tin_data *b = &q->tins[q->cur_tin]; struct cake_flow *flow = &b->flows[q->cur_flow]; struct sk_buff *skb = NULL; u32 len; if (flow->head) { skb = dequeue_head(flow); len = qdisc_pkt_len(skb); b->backlogs[q->cur_flow] -= len; b->tin_backlog -= len; sch->qstats.backlog -= len; q->buffer_used -= skb->truesize; sch->q.qlen--; if (q->overflow_timeout) cake_heapify(q, b->overflow_idx[q->cur_flow]); } return skb; } /* Discard leftover packets from a tin no longer in use. */ static void cake_clear_tin(struct Qdisc *sch, u16 tin) { struct cake_sched_data *q = qdisc_priv(sch); struct sk_buff *skb; q->cur_tin = tin; for (q->cur_flow = 0; q->cur_flow < CAKE_QUEUES; q->cur_flow++) while (!!(skb = cake_dequeue_one(sch))) kfree_skb_reason(skb, SKB_DROP_REASON_QUEUE_PURGE); } static struct sk_buff *cake_dequeue(struct Qdisc *sch) { struct cake_sched_data *q = qdisc_priv(sch); struct cake_tin_data *b = &q->tins[q->cur_tin]; enum skb_drop_reason reason; ktime_t now = ktime_get(); struct cake_flow *flow; struct list_head *head; bool first_flow = true; struct sk_buff *skb; u64 delay; u32 len; begin: if (!sch->q.qlen) return NULL; /* global hard shaper */ if (ktime_after(q->time_next_packet, now) && ktime_after(q->failsafe_next_packet, now)) { u64 next = min(ktime_to_ns(q->time_next_packet), ktime_to_ns(q->failsafe_next_packet)); sch->qstats.overlimits++; qdisc_watchdog_schedule_ns(&q->watchdog, next); return NULL; } /* Choose a class to work on. */ if (!q->rate_ns) { /* In unlimited mode, can't rely on shaper timings, just balance * with DRR */ bool wrapped = false, empty = true; while (b->tin_deficit < 0 || !(b->sparse_flow_count + b->bulk_flow_count)) { if (b->tin_deficit <= 0) b->tin_deficit += b->tin_quantum; if (b->sparse_flow_count + b->bulk_flow_count) empty = false; q->cur_tin++; b++; if (q->cur_tin >= q->tin_cnt) { q->cur_tin = 0; b = q->tins; if (wrapped) { /* It's possible for q->qlen to be * nonzero when we actually have no * packets anywhere. */ if (empty) return NULL; } else { wrapped = true; } } } } else { /* In shaped mode, choose: * - Highest-priority tin with queue and meeting schedule, or * - The earliest-scheduled tin with queue. */ ktime_t best_time = KTIME_MAX; int tin, best_tin = 0; for (tin = 0; tin < q->tin_cnt; tin++) { b = q->tins + tin; if ((b->sparse_flow_count + b->bulk_flow_count) > 0) { ktime_t time_to_pkt = \ ktime_sub(b->time_next_packet, now); if (ktime_to_ns(time_to_pkt) <= 0 || ktime_compare(time_to_pkt, best_time) <= 0) { best_time = time_to_pkt; best_tin = tin; } } } q->cur_tin = best_tin; b = q->tins + best_tin; /* No point in going further if no packets to deliver. */ if (unlikely(!(b->sparse_flow_count + b->bulk_flow_count))) return NULL; } retry: /* service this class */ head = &b->decaying_flows; if (!first_flow || list_empty(head)) { head = &b->new_flows; if (list_empty(head)) { head = &b->old_flows; if (unlikely(list_empty(head))) { head = &b->decaying_flows; if (unlikely(list_empty(head))) goto begin; } } } flow = list_first_entry(head, struct cake_flow, flowchain); q->cur_flow = flow - b->flows; first_flow = false; /* flow isolation (DRR++) */ if (flow->deficit <= 0) { /* Keep all flows with deficits out of the sparse and decaying * rotations. No non-empty flow can go into the decaying * rotation, so they can't get deficits */ if (flow->set == CAKE_SET_SPARSE) { if (flow->head) { b->sparse_flow_count--; b->bulk_flow_count++; cake_inc_srchost_bulk_flow_count(b, flow, q->flow_mode); cake_inc_dsthost_bulk_flow_count(b, flow, q->flow_mode); flow->set = CAKE_SET_BULK; } else { /* we've moved it to the bulk rotation for * correct deficit accounting but we still want * to count it as a sparse flow, not a bulk one. */ flow->set = CAKE_SET_SPARSE_WAIT; } } flow->deficit += cake_get_flow_quantum(b, flow, q->flow_mode); list_move_tail(&flow->flowchain, &b->old_flows); goto retry; } /* Retrieve a packet via the AQM */ while (1) { skb = cake_dequeue_one(sch); if (!skb) { /* this queue was actually empty */ if (cobalt_queue_empty(&flow->cvars, &b->cparams, now)) b->unresponsive_flow_count--; if (flow->cvars.p_drop || flow->cvars.count || ktime_before(now, flow->cvars.drop_next)) { /* keep in the flowchain until the state has * decayed to rest */ list_move_tail(&flow->flowchain, &b->decaying_flows); if (flow->set == CAKE_SET_BULK) { b->bulk_flow_count--; cake_dec_srchost_bulk_flow_count(b, flow, q->flow_mode); cake_dec_dsthost_bulk_flow_count(b, flow, q->flow_mode); b->decaying_flow_count++; } else if (flow->set == CAKE_SET_SPARSE || flow->set == CAKE_SET_SPARSE_WAIT) { b->sparse_flow_count--; b->decaying_flow_count++; } flow->set = CAKE_SET_DECAYING; } else { /* remove empty queue from the flowchain */ list_del_init(&flow->flowchain); if (flow->set == CAKE_SET_SPARSE || flow->set == CAKE_SET_SPARSE_WAIT) b->sparse_flow_count--; else if (flow->set == CAKE_SET_BULK) { b->bulk_flow_count--; cake_dec_srchost_bulk_flow_count(b, flow, q->flow_mode); cake_dec_dsthost_bulk_flow_count(b, flow, q->flow_mode); } else b->decaying_flow_count--; flow->set = CAKE_SET_NONE; } goto begin; } reason = cobalt_should_drop(&flow->cvars, &b->cparams, now, skb, (b->bulk_flow_count * !!(q->rate_flags & CAKE_FLAG_INGRESS))); /* Last packet in queue may be marked, shouldn't be dropped */ if (reason == SKB_NOT_DROPPED_YET || !flow->head) break; /* drop this packet, get another one */ if (q->rate_flags & CAKE_FLAG_INGRESS) { len = cake_advance_shaper(q, b, skb, now, true); flow->deficit -= len; b->tin_deficit -= len; } flow->dropped++; b->tin_dropped++; qdisc_tree_reduce_backlog(sch, 1, qdisc_pkt_len(skb)); qdisc_qstats_drop(sch); kfree_skb_reason(skb, reason); if (q->rate_flags & CAKE_FLAG_INGRESS) goto retry; } b->tin_ecn_mark += !!flow->cvars.ecn_marked; qdisc_bstats_update(sch, skb); /* collect delay stats */ delay = ktime_to_ns(ktime_sub(now, cobalt_get_enqueue_time(skb))); b->avge_delay = cake_ewma(b->avge_delay, delay, 8); b->peak_delay = cake_ewma(b->peak_delay, delay, delay > b->peak_delay ? 2 : 8); b->base_delay = cake_ewma(b->base_delay, delay, delay < b->base_delay ? 2 : 8); len = cake_advance_shaper(q, b, skb, now, false); flow->deficit -= len; b->tin_deficit -= len; if (ktime_after(q->time_next_packet, now) && sch->q.qlen) { u64 next = min(ktime_to_ns(q->time_next_packet), ktime_to_ns(q->failsafe_next_packet)); qdisc_watchdog_schedule_ns(&q->watchdog, next); } else if (!sch->q.qlen) { int i; for (i = 0; i < q->tin_cnt; i++) { if (q->tins[i].decaying_flow_count) { ktime_t next = \ ktime_add_ns(now, q->tins[i].cparams.target); qdisc_watchdog_schedule_ns(&q->watchdog, ktime_to_ns(next)); break; } } } if (q->overflow_timeout) q->overflow_timeout--; return skb; } static void cake_reset(struct Qdisc *sch) { struct cake_sched_data *q = qdisc_priv(sch); u32 c; if (!q->tins) return; for (c = 0; c < CAKE_MAX_TINS; c++) cake_clear_tin(sch, c); } static const struct nla_policy cake_policy[TCA_CAKE_MAX + 1] = { [TCA_CAKE_BASE_RATE64] = { .type = NLA_U64 }, [TCA_CAKE_DIFFSERV_MODE] = { .type = NLA_U32 }, [TCA_CAKE_ATM] = { .type = NLA_U32 }, [TCA_CAKE_FLOW_MODE] = { .type = NLA_U32 }, [TCA_CAKE_OVERHEAD] = { .type = NLA_S32 }, [TCA_CAKE_RTT] = { .type = NLA_U32 }, [TCA_CAKE_TARGET] = { .type = NLA_U32 }, [TCA_CAKE_AUTORATE] = { .type = NLA_U32 }, [TCA_CAKE_MEMORY] = { .type = NLA_U32 }, [TCA_CAKE_NAT] = { .type = NLA_U32 }, [TCA_CAKE_RAW] = { .type = NLA_U32 }, [TCA_CAKE_WASH] = { .type = NLA_U32 }, [TCA_CAKE_MPU] = { .type = NLA_U32 }, [TCA_CAKE_INGRESS] = { .type = NLA_U32 }, [TCA_CAKE_ACK_FILTER] = { .type = NLA_U32 }, [TCA_CAKE_SPLIT_GSO] = { .type = NLA_U32 }, [TCA_CAKE_FWMARK] = { .type = NLA_U32 }, }; static void cake_set_rate(struct cake_tin_data *b, u64 rate, u32 mtu, u64 target_ns, u64 rtt_est_ns) { /* convert byte-rate into time-per-byte * so it will always unwedge in reasonable time. */ static const u64 MIN_RATE = 64; u32 byte_target = mtu; u64 byte_target_ns; u8 rate_shft = 0; u64 rate_ns = 0; b->flow_quantum = 1514; if (rate) { b->flow_quantum = max(min(rate >> 12, 1514ULL), 300ULL); rate_shft = 34; rate_ns = ((u64)NSEC_PER_SEC) << rate_shft; rate_ns = div64_u64(rate_ns, max(MIN_RATE, rate)); while (!!(rate_ns >> 34)) { rate_ns >>= 1; rate_shft--; } } /* else unlimited, ie. zero delay */ b->tin_rate_bps = rate; b->tin_rate_ns = rate_ns; b->tin_rate_shft = rate_shft; byte_target_ns = (byte_target * rate_ns) >> rate_shft; b->cparams.target = max((byte_target_ns * 3) / 2, target_ns); b->cparams.interval = max(rtt_est_ns + b->cparams.target - target_ns, b->cparams.target * 2); b->cparams.mtu_time = byte_target_ns; b->cparams.p_inc = 1 << 24; /* 1/256 */ b->cparams.p_dec = 1 << 20; /* 1/4096 */ } static int cake_config_besteffort(struct Qdisc *sch) { struct cake_sched_data *q = qdisc_priv(sch); struct cake_tin_data *b = &q->tins[0]; u32 mtu = psched_mtu(qdisc_dev(sch)); u64 rate = q->rate_bps; q->tin_cnt = 1; q->tin_index = besteffort; q->tin_order = normal_order; cake_set_rate(b, rate, mtu, us_to_ns(q->target), us_to_ns(q->interval)); b->tin_quantum = 65535; return 0; } static int cake_config_precedence(struct Qdisc *sch) { /* convert high-level (user visible) parameters into internal format */ struct cake_sched_data *q = qdisc_priv(sch); u32 mtu = psched_mtu(qdisc_dev(sch)); u64 rate = q->rate_bps; u32 quantum = 256; u32 i; q->tin_cnt = 8; q->tin_index = precedence; q->tin_order = normal_order; for (i = 0; i < q->tin_cnt; i++) { struct cake_tin_data *b = &q->tins[i]; cake_set_rate(b, rate, mtu, us_to_ns(q->target), us_to_ns(q->interval)); b->tin_quantum = max_t(u16, 1U, quantum); /* calculate next class's parameters */ rate *= 7; rate >>= 3; quantum *= 7; quantum >>= 3; } return 0; } /* List of known Diffserv codepoints: * * Default Forwarding (DF/CS0) - Best Effort * Max Throughput (TOS2) * Min Delay (TOS4) * LLT "La" (TOS5) * Assured Forwarding 1 (AF1x) - x3 * Assured Forwarding 2 (AF2x) - x3 * Assured Forwarding 3 (AF3x) - x3 * Assured Forwarding 4 (AF4x) - x3 * Precedence Class 1 (CS1) * Precedence Class 2 (CS2) * Precedence Class 3 (CS3) * Precedence Class 4 (CS4) * Precedence Class 5 (CS5) * Precedence Class 6 (CS6) * Precedence Class 7 (CS7) * Voice Admit (VA) * Expedited Forwarding (EF) * Lower Effort (LE) * * Total 26 codepoints. */ /* List of traffic classes in RFC 4594, updated by RFC 8622: * (roughly descending order of contended priority) * (roughly ascending order of uncontended throughput) * * Network Control (CS6,CS7) - routing traffic * Telephony (EF,VA) - aka. VoIP streams * Signalling (CS5) - VoIP setup * Multimedia Conferencing (AF4x) - aka. video calls * Realtime Interactive (CS4) - eg. games * Multimedia Streaming (AF3x) - eg. YouTube, NetFlix, Twitch * Broadcast Video (CS3) * Low-Latency Data (AF2x,TOS4) - eg. database * Ops, Admin, Management (CS2) - eg. ssh * Standard Service (DF & unrecognised codepoints) * High-Throughput Data (AF1x,TOS2) - eg. web traffic * Low-Priority Data (LE,CS1) - eg. BitTorrent * * Total 12 traffic classes. */ static int cake_config_diffserv8(struct Qdisc *sch) { /* Pruned list of traffic classes for typical applications: * * Network Control (CS6, CS7) * Minimum Latency (EF, VA, CS5, CS4) * Interactive Shell (CS2) * Low Latency Transactions (AF2x, TOS4) * Video Streaming (AF4x, AF3x, CS3) * Bog Standard (DF etc.) * High Throughput (AF1x, TOS2, CS1) * Background Traffic (LE) * * Total 8 traffic classes. */ struct cake_sched_data *q = qdisc_priv(sch); u32 mtu = psched_mtu(qdisc_dev(sch)); u64 rate = q->rate_bps; u32 quantum = 256; u32 i; q->tin_cnt = 8; /* codepoint to class mapping */ q->tin_index = diffserv8; q->tin_order = normal_order; /* class characteristics */ for (i = 0; i < q->tin_cnt; i++) { struct cake_tin_data *b = &q->tins[i]; cake_set_rate(b, rate, mtu, us_to_ns(q->target), us_to_ns(q->interval)); b->tin_quantum = max_t(u16, 1U, quantum); /* calculate next class's parameters */ rate *= 7; rate >>= 3; quantum *= 7; quantum >>= 3; } return 0; } static int cake_config_diffserv4(struct Qdisc *sch) { /* Further pruned list of traffic classes for four-class system: * * Latency Sensitive (CS7, CS6, EF, VA, CS5, CS4) * Streaming Media (AF4x, AF3x, CS3, AF2x, TOS4, CS2) * Best Effort (DF, AF1x, TOS2, and those not specified) * Background Traffic (LE, CS1) * * Total 4 traffic classes. */ struct cake_sched_data *q = qdisc_priv(sch); u32 mtu = psched_mtu(qdisc_dev(sch)); u64 rate = q->rate_bps; u32 quantum = 1024; q->tin_cnt = 4; /* codepoint to class mapping */ q->tin_index = diffserv4; q->tin_order = bulk_order; /* class characteristics */ cake_set_rate(&q->tins[0], rate, mtu, us_to_ns(q->target), us_to_ns(q->interval)); cake_set_rate(&q->tins[1], rate >> 4, mtu, us_to_ns(q->target), us_to_ns(q->interval)); cake_set_rate(&q->tins[2], rate >> 1, mtu, us_to_ns(q->target), us_to_ns(q->interval)); cake_set_rate(&q->tins[3], rate >> 2, mtu, us_to_ns(q->target), us_to_ns(q->interval)); /* bandwidth-sharing weights */ q->tins[0].tin_quantum = quantum; q->tins[1].tin_quantum = quantum >> 4; q->tins[2].tin_quantum = quantum >> 1; q->tins[3].tin_quantum = quantum >> 2; return 0; } static int cake_config_diffserv3(struct Qdisc *sch) { /* Simplified Diffserv structure with 3 tins. * Latency Sensitive (CS7, CS6, EF, VA, TOS4) * Best Effort * Low Priority (LE, CS1) */ struct cake_sched_data *q = qdisc_priv(sch); u32 mtu = psched_mtu(qdisc_dev(sch)); u64 rate = q->rate_bps; u32 quantum = 1024; q->tin_cnt = 3; /* codepoint to class mapping */ q->tin_index = diffserv3; q->tin_order = bulk_order; /* class characteristics */ cake_set_rate(&q->tins[0], rate, mtu, us_to_ns(q->target), us_to_ns(q->interval)); cake_set_rate(&q->tins[1], rate >> 4, mtu, us_to_ns(q->target), us_to_ns(q->interval)); cake_set_rate(&q->tins[2], rate >> 2, mtu, us_to_ns(q->target), us_to_ns(q->interval)); /* bandwidth-sharing weights */ q->tins[0].tin_quantum = quantum; q->tins[1].tin_quantum = quantum >> 4; q->tins[2].tin_quantum = quantum >> 2; return 0; } static void cake_reconfigure(struct Qdisc *sch) { struct cake_sched_data *q = qdisc_priv(sch); int c, ft; switch (q->tin_mode) { case CAKE_DIFFSERV_BESTEFFORT: ft = cake_config_besteffort(sch); break; case CAKE_DIFFSERV_PRECEDENCE: ft = cake_config_precedence(sch); break; case CAKE_DIFFSERV_DIFFSERV8: ft = cake_config_diffserv8(sch); break; case CAKE_DIFFSERV_DIFFSERV4: ft = cake_config_diffserv4(sch); break; case CAKE_DIFFSERV_DIFFSERV3: default: ft = cake_config_diffserv3(sch); break; } for (c = q->tin_cnt; c < CAKE_MAX_TINS; c++) { cake_clear_tin(sch, c); q->tins[c].cparams.mtu_time = q->tins[ft].cparams.mtu_time; } q->rate_ns = q->tins[ft].tin_rate_ns; q->rate_shft = q->tins[ft].tin_rate_shft; if (q->buffer_config_limit) { q->buffer_limit = q->buffer_config_limit; } else if (q->rate_bps) { u64 t = q->rate_bps * q->interval; do_div(t, USEC_PER_SEC / 4); q->buffer_limit = max_t(u32, t, 4U << 20); } else { q->buffer_limit = ~0; } sch->flags &= ~TCQ_F_CAN_BYPASS; q->buffer_limit = min(q->buffer_limit, max(sch->limit * psched_mtu(qdisc_dev(sch)), q->buffer_config_limit)); } static int cake_change(struct Qdisc *sch, struct nlattr *opt, struct netlink_ext_ack *extack) { struct cake_sched_data *q = qdisc_priv(sch); struct nlattr *tb[TCA_CAKE_MAX + 1]; u16 rate_flags; u8 flow_mode; int err; err = nla_parse_nested_deprecated(tb, TCA_CAKE_MAX, opt, cake_policy, extack); if (err < 0) return err; flow_mode = q->flow_mode; if (tb[TCA_CAKE_NAT]) { #if IS_ENABLED(CONFIG_NF_CONNTRACK) flow_mode &= ~CAKE_FLOW_NAT_FLAG; flow_mode |= CAKE_FLOW_NAT_FLAG * !!nla_get_u32(tb[TCA_CAKE_NAT]); #else NL_SET_ERR_MSG_ATTR(extack, tb[TCA_CAKE_NAT], "No conntrack support in kernel"); return -EOPNOTSUPP; #endif } if (tb[TCA_CAKE_BASE_RATE64]) WRITE_ONCE(q->rate_bps, nla_get_u64(tb[TCA_CAKE_BASE_RATE64])); if (tb[TCA_CAKE_DIFFSERV_MODE]) WRITE_ONCE(q->tin_mode, nla_get_u32(tb[TCA_CAKE_DIFFSERV_MODE])); rate_flags = q->rate_flags; if (tb[TCA_CAKE_WASH]) { if (!!nla_get_u32(tb[TCA_CAKE_WASH])) rate_flags |= CAKE_FLAG_WASH; else rate_flags &= ~CAKE_FLAG_WASH; } if (tb[TCA_CAKE_FLOW_MODE]) flow_mode = ((flow_mode & CAKE_FLOW_NAT_FLAG) | (nla_get_u32(tb[TCA_CAKE_FLOW_MODE]) & CAKE_FLOW_MASK)); if (tb[TCA_CAKE_ATM]) WRITE_ONCE(q->atm_mode, nla_get_u32(tb[TCA_CAKE_ATM])); if (tb[TCA_CAKE_OVERHEAD]) { WRITE_ONCE(q->rate_overhead, nla_get_s32(tb[TCA_CAKE_OVERHEAD])); rate_flags |= CAKE_FLAG_OVERHEAD; q->max_netlen = 0; q->max_adjlen = 0; q->min_netlen = ~0; q->min_adjlen = ~0; } if (tb[TCA_CAKE_RAW]) { rate_flags &= ~CAKE_FLAG_OVERHEAD; q->max_netlen = 0; q->max_adjlen = 0; q->min_netlen = ~0; q->min_adjlen = ~0; } if (tb[TCA_CAKE_MPU]) WRITE_ONCE(q->rate_mpu, nla_get_u32(tb[TCA_CAKE_MPU])); if (tb[TCA_CAKE_RTT]) { u32 interval = nla_get_u32(tb[TCA_CAKE_RTT]); WRITE_ONCE(q->interval, max(interval, 1U)); } if (tb[TCA_CAKE_TARGET]) { u32 target = nla_get_u32(tb[TCA_CAKE_TARGET]); WRITE_ONCE(q->target, max(target, 1U)); } if (tb[TCA_CAKE_AUTORATE]) { if (!!nla_get_u32(tb[TCA_CAKE_AUTORATE])) rate_flags |= CAKE_FLAG_AUTORATE_INGRESS; else rate_flags &= ~CAKE_FLAG_AUTORATE_INGRESS; } if (tb[TCA_CAKE_INGRESS]) { if (!!nla_get_u32(tb[TCA_CAKE_INGRESS])) rate_flags |= CAKE_FLAG_INGRESS; else rate_flags &= ~CAKE_FLAG_INGRESS; } if (tb[TCA_CAKE_ACK_FILTER]) WRITE_ONCE(q->ack_filter, nla_get_u32(tb[TCA_CAKE_ACK_FILTER])); if (tb[TCA_CAKE_MEMORY]) WRITE_ONCE(q->buffer_config_limit, nla_get_u32(tb[TCA_CAKE_MEMORY])); if (tb[TCA_CAKE_SPLIT_GSO]) { if (!!nla_get_u32(tb[TCA_CAKE_SPLIT_GSO])) rate_flags |= CAKE_FLAG_SPLIT_GSO; else rate_flags &= ~CAKE_FLAG_SPLIT_GSO; } if (tb[TCA_CAKE_FWMARK]) { WRITE_ONCE(q->fwmark_mask, nla_get_u32(tb[TCA_CAKE_FWMARK])); WRITE_ONCE(q->fwmark_shft, q->fwmark_mask ? __ffs(q->fwmark_mask) : 0); } WRITE_ONCE(q->rate_flags, rate_flags); WRITE_ONCE(q->flow_mode, flow_mode); if (q->tins) { sch_tree_lock(sch); cake_reconfigure(sch); sch_tree_unlock(sch); } return 0; } static void cake_destroy(struct Qdisc *sch) { struct cake_sched_data *q = qdisc_priv(sch); qdisc_watchdog_cancel(&q->watchdog); tcf_block_put(q->block); kvfree(q->tins); } static int cake_init(struct Qdisc *sch, struct nlattr *opt, struct netlink_ext_ack *extack) { struct cake_sched_data *q = qdisc_priv(sch); int i, j, err; sch->limit = 10240; q->tin_mode = CAKE_DIFFSERV_DIFFSERV3; q->flow_mode = CAKE_FLOW_TRIPLE; q->rate_bps = 0; /* unlimited by default */ q->interval = 100000; /* 100ms default */ q->target = 5000; /* 5ms: codel RFC argues * for 5 to 10% of interval */ q->rate_flags |= CAKE_FLAG_SPLIT_GSO; q->cur_tin = 0; q->cur_flow = 0; qdisc_watchdog_init(&q->watchdog, sch); if (opt) { err = cake_change(sch, opt, extack); if (err) return err; } err = tcf_block_get(&q->block, &q->filter_list, sch, extack); if (err) return err; quantum_div[0] = ~0; for (i = 1; i <= CAKE_QUEUES; i++) quantum_div[i] = 65535 / i; q->tins = kvcalloc(CAKE_MAX_TINS, sizeof(struct cake_tin_data), GFP_KERNEL); if (!q->tins) return -ENOMEM; for (i = 0; i < CAKE_MAX_TINS; i++) { struct cake_tin_data *b = q->tins + i; INIT_LIST_HEAD(&b->new_flows); INIT_LIST_HEAD(&b->old_flows); INIT_LIST_HEAD(&b->decaying_flows); b->sparse_flow_count = 0; b->bulk_flow_count = 0; b->decaying_flow_count = 0; for (j = 0; j < CAKE_QUEUES; j++) { struct cake_flow *flow = b->flows + j; u32 k = j * CAKE_MAX_TINS + i; INIT_LIST_HEAD(&flow->flowchain); cobalt_vars_init(&flow->cvars); q->overflow_heap[k].t = i; q->overflow_heap[k].b = j; b->overflow_idx[j] = k; } } cake_reconfigure(sch); q->avg_peak_bandwidth = q->rate_bps; q->min_netlen = ~0; q->min_adjlen = ~0; return 0; } static int cake_dump(struct Qdisc *sch, struct sk_buff *skb) { struct cake_sched_data *q = qdisc_priv(sch); struct nlattr *opts; u16 rate_flags; u8 flow_mode; opts = nla_nest_start_noflag(skb, TCA_OPTIONS); if (!opts) goto nla_put_failure; if (nla_put_u64_64bit(skb, TCA_CAKE_BASE_RATE64, READ_ONCE(q->rate_bps), TCA_CAKE_PAD)) goto nla_put_failure; flow_mode = READ_ONCE(q->flow_mode); if (nla_put_u32(skb, TCA_CAKE_FLOW_MODE, flow_mode & CAKE_FLOW_MASK)) goto nla_put_failure; if (nla_put_u32(skb, TCA_CAKE_RTT, READ_ONCE(q->interval))) goto nla_put_failure; if (nla_put_u32(skb, TCA_CAKE_TARGET, READ_ONCE(q->target))) goto nla_put_failure; if (nla_put_u32(skb, TCA_CAKE_MEMORY, READ_ONCE(q->buffer_config_limit))) goto nla_put_failure; rate_flags = READ_ONCE(q->rate_flags); if (nla_put_u32(skb, TCA_CAKE_AUTORATE, !!(rate_flags & CAKE_FLAG_AUTORATE_INGRESS))) goto nla_put_failure; if (nla_put_u32(skb, TCA_CAKE_INGRESS, !!(rate_flags & CAKE_FLAG_INGRESS))) goto nla_put_failure; if (nla_put_u32(skb, TCA_CAKE_ACK_FILTER, READ_ONCE(q->ack_filter))) goto nla_put_failure; if (nla_put_u32(skb, TCA_CAKE_NAT, !!(flow_mode & CAKE_FLOW_NAT_FLAG))) goto nla_put_failure; if (nla_put_u32(skb, TCA_CAKE_DIFFSERV_MODE, READ_ONCE(q->tin_mode))) goto nla_put_failure; if (nla_put_u32(skb, TCA_CAKE_WASH, !!(rate_flags & CAKE_FLAG_WASH))) goto nla_put_failure; if (nla_put_u32(skb, TCA_CAKE_OVERHEAD, READ_ONCE(q->rate_overhead))) goto nla_put_failure; if (!(rate_flags & CAKE_FLAG_OVERHEAD)) if (nla_put_u32(skb, TCA_CAKE_RAW, 0)) goto nla_put_failure; if (nla_put_u32(skb, TCA_CAKE_ATM, READ_ONCE(q->atm_mode))) goto nla_put_failure; if (nla_put_u32(skb, TCA_CAKE_MPU, READ_ONCE(q->rate_mpu))) goto nla_put_failure; if (nla_put_u32(skb, TCA_CAKE_SPLIT_GSO, !!(rate_flags & CAKE_FLAG_SPLIT_GSO))) goto nla_put_failure; if (nla_put_u32(skb, TCA_CAKE_FWMARK, READ_ONCE(q->fwmark_mask))) goto nla_put_failure; return nla_nest_end(skb, opts); nla_put_failure: return -1; } static int cake_dump_stats(struct Qdisc *sch, struct gnet_dump *d) { struct nlattr *stats = nla_nest_start_noflag(d->skb, TCA_STATS_APP); struct cake_sched_data *q = qdisc_priv(sch); struct nlattr *tstats, *ts; int i; if (!stats) return -1; #define PUT_STAT_U32(attr, data) do { \ if (nla_put_u32(d->skb, TCA_CAKE_STATS_ ## attr, data)) \ goto nla_put_failure; \ } while (0) #define PUT_STAT_U64(attr, data) do { \ if (nla_put_u64_64bit(d->skb, TCA_CAKE_STATS_ ## attr, \ data, TCA_CAKE_STATS_PAD)) \ goto nla_put_failure; \ } while (0) PUT_STAT_U64(CAPACITY_ESTIMATE64, q->avg_peak_bandwidth); PUT_STAT_U32(MEMORY_LIMIT, q->buffer_limit); PUT_STAT_U32(MEMORY_USED, q->buffer_max_used); PUT_STAT_U32(AVG_NETOFF, ((q->avg_netoff + 0x8000) >> 16)); PUT_STAT_U32(MAX_NETLEN, q->max_netlen); PUT_STAT_U32(MAX_ADJLEN, q->max_adjlen); PUT_STAT_U32(MIN_NETLEN, q->min_netlen); PUT_STAT_U32(MIN_ADJLEN, q->min_adjlen); #undef PUT_STAT_U32 #undef PUT_STAT_U64 tstats = nla_nest_start_noflag(d->skb, TCA_CAKE_STATS_TIN_STATS); if (!tstats) goto nla_put_failure; #define PUT_TSTAT_U32(attr, data) do { \ if (nla_put_u32(d->skb, TCA_CAKE_TIN_STATS_ ## attr, data)) \ goto nla_put_failure; \ } while (0) #define PUT_TSTAT_U64(attr, data) do { \ if (nla_put_u64_64bit(d->skb, TCA_CAKE_TIN_STATS_ ## attr, \ data, TCA_CAKE_TIN_STATS_PAD)) \ goto nla_put_failure; \ } while (0) for (i = 0; i < q->tin_cnt; i++) { struct cake_tin_data *b = &q->tins[q->tin_order[i]]; ts = nla_nest_start_noflag(d->skb, i + 1); if (!ts) goto nla_put_failure; PUT_TSTAT_U64(THRESHOLD_RATE64, b->tin_rate_bps); PUT_TSTAT_U64(SENT_BYTES64, b->bytes); PUT_TSTAT_U32(BACKLOG_BYTES, b->tin_backlog); PUT_TSTAT_U32(TARGET_US, ktime_to_us(ns_to_ktime(b->cparams.target))); PUT_TSTAT_U32(INTERVAL_US, ktime_to_us(ns_to_ktime(b->cparams.interval))); PUT_TSTAT_U32(SENT_PACKETS, b->packets); PUT_TSTAT_U32(DROPPED_PACKETS, b->tin_dropped); PUT_TSTAT_U32(ECN_MARKED_PACKETS, b->tin_ecn_mark); PUT_TSTAT_U32(ACKS_DROPPED_PACKETS, b->ack_drops); PUT_TSTAT_U32(PEAK_DELAY_US, ktime_to_us(ns_to_ktime(b->peak_delay))); PUT_TSTAT_U32(AVG_DELAY_US, ktime_to_us(ns_to_ktime(b->avge_delay))); PUT_TSTAT_U32(BASE_DELAY_US, ktime_to_us(ns_to_ktime(b->base_delay))); PUT_TSTAT_U32(WAY_INDIRECT_HITS, b->way_hits); PUT_TSTAT_U32(WAY_MISSES, b->way_misses); PUT_TSTAT_U32(WAY_COLLISIONS, b->way_collisions); PUT_TSTAT_U32(SPARSE_FLOWS, b->sparse_flow_count + b->decaying_flow_count); PUT_TSTAT_U32(BULK_FLOWS, b->bulk_flow_count); PUT_TSTAT_U32(UNRESPONSIVE_FLOWS, b->unresponsive_flow_count); PUT_TSTAT_U32(MAX_SKBLEN, b->max_skblen); PUT_TSTAT_U32(FLOW_QUANTUM, b->flow_quantum); nla_nest_end(d->skb, ts); } #undef PUT_TSTAT_U32 #undef PUT_TSTAT_U64 nla_nest_end(d->skb, tstats); return nla_nest_end(d->skb, stats); nla_put_failure: nla_nest_cancel(d->skb, stats); return -1; } static struct Qdisc *cake_leaf(struct Qdisc *sch, unsigned long arg) { return NULL; } static unsigned long cake_find(struct Qdisc *sch, u32 classid) { return 0; } static unsigned long cake_bind(struct Qdisc *sch, unsigned long parent, u32 classid) { return 0; } static void cake_unbind(struct Qdisc *q, unsigned long cl) { } static struct tcf_block *cake_tcf_block(struct Qdisc *sch, unsigned long cl, struct netlink_ext_ack *extack) { struct cake_sched_data *q = qdisc_priv(sch); if (cl) return NULL; return q->block; } static int cake_dump_class(struct Qdisc *sch, unsigned long cl, struct sk_buff *skb, struct tcmsg *tcm) { tcm->tcm_handle |= TC_H_MIN(cl); return 0; } static int cake_dump_class_stats(struct Qdisc *sch, unsigned long cl, struct gnet_dump *d) { struct cake_sched_data *q = qdisc_priv(sch); const struct cake_flow *flow = NULL; struct gnet_stats_queue qs = { 0 }; struct nlattr *stats; u32 idx = cl - 1; if (idx < CAKE_QUEUES * q->tin_cnt) { const struct cake_tin_data *b = \ &q->tins[q->tin_order[idx / CAKE_QUEUES]]; const struct sk_buff *skb; flow = &b->flows[idx % CAKE_QUEUES]; if (flow->head) { sch_tree_lock(sch); skb = flow->head; while (skb) { qs.qlen++; skb = skb->next; } sch_tree_unlock(sch); } qs.backlog = b->backlogs[idx % CAKE_QUEUES]; qs.drops = flow->dropped; } if (gnet_stats_copy_queue(d, NULL, &qs, qs.qlen) < 0) return -1; if (flow) { ktime_t now = ktime_get(); stats = nla_nest_start_noflag(d->skb, TCA_STATS_APP); if (!stats) return -1; #define PUT_STAT_U32(attr, data) do { \ if (nla_put_u32(d->skb, TCA_CAKE_STATS_ ## attr, data)) \ goto nla_put_failure; \ } while (0) #define PUT_STAT_S32(attr, data) do { \ if (nla_put_s32(d->skb, TCA_CAKE_STATS_ ## attr, data)) \ goto nla_put_failure; \ } while (0) PUT_STAT_S32(DEFICIT, flow->deficit); PUT_STAT_U32(DROPPING, flow->cvars.dropping); PUT_STAT_U32(COBALT_COUNT, flow->cvars.count); PUT_STAT_U32(P_DROP, flow->cvars.p_drop); if (flow->cvars.p_drop) { PUT_STAT_S32(BLUE_TIMER_US, ktime_to_us( ktime_sub(now, flow->cvars.blue_timer))); } if (flow->cvars.dropping) { PUT_STAT_S32(DROP_NEXT_US, ktime_to_us( ktime_sub(now, flow->cvars.drop_next))); } if (nla_nest_end(d->skb, stats) < 0) return -1; } return 0; nla_put_failure: nla_nest_cancel(d->skb, stats); return -1; } static void cake_walk(struct Qdisc *sch, struct qdisc_walker *arg) { struct cake_sched_data *q = qdisc_priv(sch); unsigned int i, j; if (arg->stop) return; for (i = 0; i < q->tin_cnt; i++) { struct cake_tin_data *b = &q->tins[q->tin_order[i]]; for (j = 0; j < CAKE_QUEUES; j++) { if (list_empty(&b->flows[j].flowchain)) { arg->count++; continue; } if (!tc_qdisc_stats_dump(sch, i * CAKE_QUEUES + j + 1, arg)) break; } } } static const struct Qdisc_class_ops cake_class_ops = { .leaf = cake_leaf, .find = cake_find, .tcf_block = cake_tcf_block, .bind_tcf = cake_bind, .unbind_tcf = cake_unbind, .dump = cake_dump_class, .dump_stats = cake_dump_class_stats, .walk = cake_walk, }; static struct Qdisc_ops cake_qdisc_ops __read_mostly = { .cl_ops = &cake_class_ops, .id = "cake", .priv_size = sizeof(struct cake_sched_data), .enqueue = cake_enqueue, .dequeue = cake_dequeue, .peek = qdisc_peek_dequeued, .init = cake_init, .reset = cake_reset, .destroy = cake_destroy, .change = cake_change, .dump = cake_dump, .dump_stats = cake_dump_stats, .owner = THIS_MODULE, }; MODULE_ALIAS_NET_SCH("cake"); static int __init cake_module_init(void) { return register_qdisc(&cake_qdisc_ops); } static void __exit cake_module_exit(void) { unregister_qdisc(&cake_qdisc_ops); } module_init(cake_module_init) module_exit(cake_module_exit) MODULE_AUTHOR("Jonathan Morton"); MODULE_LICENSE("Dual BSD/GPL"); MODULE_DESCRIPTION("The CAKE shaper.");
29 29 122 1 1 1 7 1 6 5 5 5 2 5 5 5 5 1 1 1 1 1 16 16 16 16 5 15 16 16 2 2 3 3 3 3 3 3 3 3 3 6 3 4 3 3 3 3 4 4 4 4 4 1 1 8 8 8 8 6 6 6 5 5 5 4 1 2 1 2 3 6 123 1 122 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 // SPDX-License-Identifier: GPL-2.0+ /* * NILFS segment usage file. * * Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation. * * Written by Koji Sato. * Revised by Ryusuke Konishi. */ #include <linux/kernel.h> #include <linux/fs.h> #include <linux/string.h> #include <linux/buffer_head.h> #include <linux/errno.h> #include "mdt.h" #include "sufile.h" #include <trace/events/nilfs2.h> /** * struct nilfs_sufile_info - on-memory private data of sufile * @mi: on-memory private data of metadata file * @ncleansegs: number of clean segments * @allocmin: lower limit of allocatable segment range * @allocmax: upper limit of allocatable segment range */ struct nilfs_sufile_info { struct nilfs_mdt_info mi; unsigned long ncleansegs;/* number of clean segments */ __u64 allocmin; /* lower limit of allocatable segment range */ __u64 allocmax; /* upper limit of allocatable segment range */ }; static inline struct nilfs_sufile_info *NILFS_SUI(struct inode *sufile) { return (struct nilfs_sufile_info *)NILFS_MDT(sufile); } static inline unsigned long nilfs_sufile_segment_usages_per_block(const struct inode *sufile) { return NILFS_MDT(sufile)->mi_entries_per_block; } static unsigned long nilfs_sufile_get_blkoff(const struct inode *sufile, __u64 segnum) { __u64 t = segnum + NILFS_MDT(sufile)->mi_first_entry_offset; t = div64_ul(t, nilfs_sufile_segment_usages_per_block(sufile)); return (unsigned long)t; } static unsigned long nilfs_sufile_get_offset(const struct inode *sufile, __u64 segnum) { __u64 t = segnum + NILFS_MDT(sufile)->mi_first_entry_offset; return do_div(t, nilfs_sufile_segment_usages_per_block(sufile)); } static unsigned long nilfs_sufile_segment_usages_in_block(const struct inode *sufile, __u64 curr, __u64 max) { return min_t(unsigned long, nilfs_sufile_segment_usages_per_block(sufile) - nilfs_sufile_get_offset(sufile, curr), max - curr + 1); } /** * nilfs_sufile_segment_usage_offset - calculate the byte offset of a segment * usage entry in the folio containing it * @sufile: segment usage file inode * @segnum: number of segment usage * @bh: buffer head of block containing segment usage indexed by @segnum * * Return: Byte offset in the folio of the segment usage entry. */ static size_t nilfs_sufile_segment_usage_offset(const struct inode *sufile, __u64 segnum, struct buffer_head *bh) { return offset_in_folio(bh->b_folio, bh->b_data) + nilfs_sufile_get_offset(sufile, segnum) * NILFS_MDT(sufile)->mi_entry_size; } static int nilfs_sufile_get_header_block(struct inode *sufile, struct buffer_head **bhp) { int err = nilfs_mdt_get_block(sufile, 0, 0, NULL, bhp); if (unlikely(err == -ENOENT)) { nilfs_error(sufile->i_sb, "missing header block in segment usage metadata"); err = -EIO; } return err; } static inline int nilfs_sufile_get_segment_usage_block(struct inode *sufile, __u64 segnum, int create, struct buffer_head **bhp) { return nilfs_mdt_get_block(sufile, nilfs_sufile_get_blkoff(sufile, segnum), create, NULL, bhp); } static int nilfs_sufile_delete_segment_usage_block(struct inode *sufile, __u64 segnum) { return nilfs_mdt_delete_block(sufile, nilfs_sufile_get_blkoff(sufile, segnum)); } static void nilfs_sufile_mod_counter(struct buffer_head *header_bh, u64 ncleanadd, u64 ndirtyadd) { struct nilfs_sufile_header *header; header = kmap_local_folio(header_bh->b_folio, 0); le64_add_cpu(&header->sh_ncleansegs, ncleanadd); le64_add_cpu(&header->sh_ndirtysegs, ndirtyadd); kunmap_local(header); mark_buffer_dirty(header_bh); } /** * nilfs_sufile_get_ncleansegs - return the number of clean segments * @sufile: inode of segment usage file * * Return: Number of clean segments. */ unsigned long nilfs_sufile_get_ncleansegs(struct inode *sufile) { return NILFS_SUI(sufile)->ncleansegs; } /** * nilfs_sufile_updatev - modify multiple segment usages at a time * @sufile: inode of segment usage file * @segnumv: array of segment numbers * @nsegs: size of @segnumv array * @create: creation flag * @ndone: place to store number of modified segments on @segnumv * @dofunc: primitive operation for the update * * Description: nilfs_sufile_updatev() repeatedly calls @dofunc * against the given array of segments. The @dofunc is called with * buffers of a header block and the sufile block in which the target * segment usage entry is contained. If @ndone is given, the number * of successfully modified segments from the head is stored in the * place @ndone points to. * * Return: 0 on success, or one of the following negative error codes on * failure: * * %-EINVAL - Invalid segment usage number * * %-EIO - I/O error (including metadata corruption). * * %-ENOENT - Given segment usage is in hole block (may be returned if * @create is zero) * * %-ENOMEM - Insufficient memory available. */ int nilfs_sufile_updatev(struct inode *sufile, __u64 *segnumv, size_t nsegs, int create, size_t *ndone, void (*dofunc)(struct inode *, __u64, struct buffer_head *, struct buffer_head *)) { struct buffer_head *header_bh, *bh; unsigned long blkoff, prev_blkoff; __u64 *seg; size_t nerr = 0, n = 0; int ret = 0; if (unlikely(nsegs == 0)) goto out; down_write(&NILFS_MDT(sufile)->mi_sem); for (seg = segnumv; seg < segnumv + nsegs; seg++) { if (unlikely(*seg >= nilfs_sufile_get_nsegments(sufile))) { nilfs_warn(sufile->i_sb, "%s: invalid segment number: %llu", __func__, (unsigned long long)*seg); nerr++; } } if (nerr > 0) { ret = -EINVAL; goto out_sem; } ret = nilfs_sufile_get_header_block(sufile, &header_bh); if (ret < 0) goto out_sem; seg = segnumv; blkoff = nilfs_sufile_get_blkoff(sufile, *seg); ret = nilfs_mdt_get_block(sufile, blkoff, create, NULL, &bh); if (ret < 0) goto out_header; for (;;) { dofunc(sufile, *seg, header_bh, bh); if (++seg >= segnumv + nsegs) break; prev_blkoff = blkoff; blkoff = nilfs_sufile_get_blkoff(sufile, *seg); if (blkoff == prev_blkoff) continue; /* get different block */ brelse(bh); ret = nilfs_mdt_get_block(sufile, blkoff, create, NULL, &bh); if (unlikely(ret < 0)) goto out_header; } brelse(bh); out_header: n = seg - segnumv; brelse(header_bh); out_sem: up_write(&NILFS_MDT(sufile)->mi_sem); out: if (ndone) *ndone = n; return ret; } int nilfs_sufile_update(struct inode *sufile, __u64 segnum, int create, void (*dofunc)(struct inode *, __u64, struct buffer_head *, struct buffer_head *)) { struct buffer_head *header_bh, *bh; int ret; if (unlikely(segnum >= nilfs_sufile_get_nsegments(sufile))) { nilfs_warn(sufile->i_sb, "%s: invalid segment number: %llu", __func__, (unsigned long long)segnum); return -EINVAL; } down_write(&NILFS_MDT(sufile)->mi_sem); ret = nilfs_sufile_get_header_block(sufile, &header_bh); if (ret < 0) goto out_sem; ret = nilfs_sufile_get_segment_usage_block(sufile, segnum, create, &bh); if (!ret) { dofunc(sufile, segnum, header_bh, bh); brelse(bh); } brelse(header_bh); out_sem: up_write(&NILFS_MDT(sufile)->mi_sem); return ret; } /** * nilfs_sufile_set_alloc_range - limit range of segment to be allocated * @sufile: inode of segment usage file * @start: minimum segment number of allocatable region (inclusive) * @end: maximum segment number of allocatable region (inclusive) * * Return: 0 on success, or %-ERANGE if segment range is invalid. */ int nilfs_sufile_set_alloc_range(struct inode *sufile, __u64 start, __u64 end) { struct nilfs_sufile_info *sui = NILFS_SUI(sufile); __u64 nsegs; int ret = -ERANGE; down_write(&NILFS_MDT(sufile)->mi_sem); nsegs = nilfs_sufile_get_nsegments(sufile); if (start <= end && end < nsegs) { sui->allocmin = start; sui->allocmax = end; ret = 0; } up_write(&NILFS_MDT(sufile)->mi_sem); return ret; } /** * nilfs_sufile_alloc - allocate a segment * @sufile: inode of segment usage file * @segnump: pointer to segment number * * Description: nilfs_sufile_alloc() allocates a clean segment, and stores * its segment number in the place pointed to by @segnump. * * Return: 0 on success, or one of the following negative error codes on * failure: * * %-EIO - I/O error (including metadata corruption). * * %-ENOMEM - Insufficient memory available. * * %-ENOSPC - No clean segment left. */ int nilfs_sufile_alloc(struct inode *sufile, __u64 *segnump) { struct buffer_head *header_bh, *su_bh; struct nilfs_sufile_header *header; struct nilfs_segment_usage *su; struct nilfs_sufile_info *sui = NILFS_SUI(sufile); size_t susz = NILFS_MDT(sufile)->mi_entry_size; __u64 segnum, maxsegnum, last_alloc; size_t offset; void *kaddr; unsigned long nsegments, nsus, cnt; int ret, j; down_write(&NILFS_MDT(sufile)->mi_sem); ret = nilfs_sufile_get_header_block(sufile, &header_bh); if (ret < 0) goto out_sem; header = kmap_local_folio(header_bh->b_folio, 0); last_alloc = le64_to_cpu(header->sh_last_alloc); kunmap_local(header); nsegments = nilfs_sufile_get_nsegments(sufile); maxsegnum = sui->allocmax; segnum = last_alloc + 1; if (segnum < sui->allocmin || segnum > sui->allocmax) segnum = sui->allocmin; for (cnt = 0; cnt < nsegments; cnt += nsus) { if (segnum > maxsegnum) { if (cnt < sui->allocmax - sui->allocmin + 1) { /* * wrap around in the limited region. * if allocation started from * sui->allocmin, this never happens. */ segnum = sui->allocmin; maxsegnum = last_alloc; } else if (segnum > sui->allocmin && sui->allocmax + 1 < nsegments) { segnum = sui->allocmax + 1; maxsegnum = nsegments - 1; } else if (sui->allocmin > 0) { segnum = 0; maxsegnum = sui->allocmin - 1; } else { break; /* never happens */ } } trace_nilfs2_segment_usage_check(sufile, segnum, cnt); ret = nilfs_sufile_get_segment_usage_block(sufile, segnum, 1, &su_bh); if (ret < 0) goto out_header; offset = nilfs_sufile_segment_usage_offset(sufile, segnum, su_bh); su = kaddr = kmap_local_folio(su_bh->b_folio, offset); nsus = nilfs_sufile_segment_usages_in_block( sufile, segnum, maxsegnum); for (j = 0; j < nsus; j++, su = (void *)su + susz, segnum++) { if (!nilfs_segment_usage_clean(su)) continue; /* found a clean segment */ nilfs_segment_usage_set_dirty(su); kunmap_local(kaddr); header = kmap_local_folio(header_bh->b_folio, 0); le64_add_cpu(&header->sh_ncleansegs, -1); le64_add_cpu(&header->sh_ndirtysegs, 1); header->sh_last_alloc = cpu_to_le64(segnum); kunmap_local(header); sui->ncleansegs--; mark_buffer_dirty(header_bh); mark_buffer_dirty(su_bh); nilfs_mdt_mark_dirty(sufile); brelse(su_bh); *segnump = segnum; trace_nilfs2_segment_usage_allocated(sufile, segnum); goto out_header; } kunmap_local(kaddr); brelse(su_bh); } /* no segments left */ ret = -ENOSPC; out_header: brelse(header_bh); out_sem: up_write(&NILFS_MDT(sufile)->mi_sem); return ret; } void nilfs_sufile_do_cancel_free(struct inode *sufile, __u64 segnum, struct buffer_head *header_bh, struct buffer_head *su_bh) { struct nilfs_segment_usage *su; size_t offset; offset = nilfs_sufile_segment_usage_offset(sufile, segnum, su_bh); su = kmap_local_folio(su_bh->b_folio, offset); if (unlikely(!nilfs_segment_usage_clean(su))) { nilfs_warn(sufile->i_sb, "%s: segment %llu must be clean", __func__, (unsigned long long)segnum); kunmap_local(su); return; } nilfs_segment_usage_set_dirty(su); kunmap_local(su); nilfs_sufile_mod_counter(header_bh, -1, 1); NILFS_SUI(sufile)->ncleansegs--; mark_buffer_dirty(su_bh); nilfs_mdt_mark_dirty(sufile); } void nilfs_sufile_do_scrap(struct inode *sufile, __u64 segnum, struct buffer_head *header_bh, struct buffer_head *su_bh) { struct nilfs_segment_usage *su; size_t offset; int clean, dirty; offset = nilfs_sufile_segment_usage_offset(sufile, segnum, su_bh); su = kmap_local_folio(su_bh->b_folio, offset); if (su->su_flags == cpu_to_le32(BIT(NILFS_SEGMENT_USAGE_DIRTY)) && su->su_nblocks == cpu_to_le32(0)) { kunmap_local(su); return; } clean = nilfs_segment_usage_clean(su); dirty = nilfs_segment_usage_dirty(su); /* make the segment garbage */ su->su_lastmod = cpu_to_le64(0); su->su_nblocks = cpu_to_le32(0); su->su_flags = cpu_to_le32(BIT(NILFS_SEGMENT_USAGE_DIRTY)); kunmap_local(su); nilfs_sufile_mod_counter(header_bh, clean ? (u64)-1 : 0, dirty ? 0 : 1); NILFS_SUI(sufile)->ncleansegs -= clean; mark_buffer_dirty(su_bh); nilfs_mdt_mark_dirty(sufile); } void nilfs_sufile_do_free(struct inode *sufile, __u64 segnum, struct buffer_head *header_bh, struct buffer_head *su_bh) { struct nilfs_segment_usage *su; size_t offset; int sudirty; offset = nilfs_sufile_segment_usage_offset(sufile, segnum, su_bh); su = kmap_local_folio(su_bh->b_folio, offset); if (nilfs_segment_usage_clean(su)) { nilfs_warn(sufile->i_sb, "%s: segment %llu is already clean", __func__, (unsigned long long)segnum); kunmap_local(su); return; } if (unlikely(nilfs_segment_usage_error(su))) nilfs_warn(sufile->i_sb, "free segment %llu marked in error", (unsigned long long)segnum); sudirty = nilfs_segment_usage_dirty(su); if (unlikely(!sudirty)) nilfs_warn(sufile->i_sb, "free unallocated segment %llu", (unsigned long long)segnum); nilfs_segment_usage_set_clean(su); kunmap_local(su); mark_buffer_dirty(su_bh); nilfs_sufile_mod_counter(header_bh, 1, sudirty ? (u64)-1 : 0); NILFS_SUI(sufile)->ncleansegs++; nilfs_mdt_mark_dirty(sufile); trace_nilfs2_segment_usage_freed(sufile, segnum); } /** * nilfs_sufile_mark_dirty - mark the buffer having a segment usage dirty * @sufile: inode of segment usage file * @segnum: segment number * * Return: 0 on success, or a negative error code on failure. */ int nilfs_sufile_mark_dirty(struct inode *sufile, __u64 segnum) { struct buffer_head *bh; size_t offset; struct nilfs_segment_usage *su; int ret; down_write(&NILFS_MDT(sufile)->mi_sem); ret = nilfs_sufile_get_segment_usage_block(sufile, segnum, 0, &bh); if (unlikely(ret)) { if (ret == -ENOENT) { nilfs_error(sufile->i_sb, "segment usage for segment %llu is unreadable due to a hole block", (unsigned long long)segnum); ret = -EIO; } goto out_sem; } offset = nilfs_sufile_segment_usage_offset(sufile, segnum, bh); su = kmap_local_folio(bh->b_folio, offset); if (unlikely(nilfs_segment_usage_error(su))) { struct the_nilfs *nilfs = sufile->i_sb->s_fs_info; kunmap_local(su); brelse(bh); if (nilfs_segment_is_active(nilfs, segnum)) { nilfs_error(sufile->i_sb, "active segment %llu is erroneous", (unsigned long long)segnum); } else { /* * Segments marked erroneous are never allocated by * nilfs_sufile_alloc(); only active segments, ie, * the segments indexed by ns_segnum or ns_nextnum, * can be erroneous here. */ WARN_ON_ONCE(1); } ret = -EIO; } else { nilfs_segment_usage_set_dirty(su); kunmap_local(su); mark_buffer_dirty(bh); nilfs_mdt_mark_dirty(sufile); brelse(bh); } out_sem: up_write(&NILFS_MDT(sufile)->mi_sem); return ret; } /** * nilfs_sufile_set_segment_usage - set usage of a segment * @sufile: inode of segment usage file * @segnum: segment number * @nblocks: number of live blocks in the segment * @modtime: modification time (option) * * Return: 0 on success, or a negative error code on failure. */ int nilfs_sufile_set_segment_usage(struct inode *sufile, __u64 segnum, unsigned long nblocks, time64_t modtime) { struct buffer_head *bh; struct nilfs_segment_usage *su; size_t offset; int ret; down_write(&NILFS_MDT(sufile)->mi_sem); ret = nilfs_sufile_get_segment_usage_block(sufile, segnum, 0, &bh); if (ret < 0) goto out_sem; offset = nilfs_sufile_segment_usage_offset(sufile, segnum, bh); su = kmap_local_folio(bh->b_folio, offset); if (modtime) { /* * Check segusage error and set su_lastmod only when updating * this entry with a valid timestamp, not for cancellation. */ WARN_ON_ONCE(nilfs_segment_usage_error(su)); su->su_lastmod = cpu_to_le64(modtime); } su->su_nblocks = cpu_to_le32(nblocks); kunmap_local(su); mark_buffer_dirty(bh); nilfs_mdt_mark_dirty(sufile); brelse(bh); out_sem: up_write(&NILFS_MDT(sufile)->mi_sem); return ret; } /** * nilfs_sufile_get_stat - get segment usage statistics * @sufile: inode of segment usage file * @sustat: pointer to a structure of segment usage statistics * * Description: nilfs_sufile_get_stat() retrieves segment usage statistics * and stores them in the location pointed to by @sustat. * * Return: 0 on success, or one of the following negative error codes on * failure: * * %-EIO - I/O error (including metadata corruption). * * %-ENOMEM - Insufficient memory available. */ int nilfs_sufile_get_stat(struct inode *sufile, struct nilfs_sustat *sustat) { struct buffer_head *header_bh; struct nilfs_sufile_header *header; struct the_nilfs *nilfs = sufile->i_sb->s_fs_info; int ret; down_read(&NILFS_MDT(sufile)->mi_sem); ret = nilfs_sufile_get_header_block(sufile, &header_bh); if (ret < 0) goto out_sem; header = kmap_local_folio(header_bh->b_folio, 0); sustat->ss_nsegs = nilfs_sufile_get_nsegments(sufile); sustat->ss_ncleansegs = le64_to_cpu(header->sh_ncleansegs); sustat->ss_ndirtysegs = le64_to_cpu(header->sh_ndirtysegs); sustat->ss_ctime = nilfs->ns_ctime; sustat->ss_nongc_ctime = nilfs->ns_nongc_ctime; spin_lock(&nilfs->ns_last_segment_lock); sustat->ss_prot_seq = nilfs->ns_prot_seq; spin_unlock(&nilfs->ns_last_segment_lock); kunmap_local(header); brelse(header_bh); out_sem: up_read(&NILFS_MDT(sufile)->mi_sem); return ret; } void nilfs_sufile_do_set_error(struct inode *sufile, __u64 segnum, struct buffer_head *header_bh, struct buffer_head *su_bh) { struct nilfs_segment_usage *su; size_t offset; int suclean; offset = nilfs_sufile_segment_usage_offset(sufile, segnum, su_bh); su = kmap_local_folio(su_bh->b_folio, offset); if (nilfs_segment_usage_error(su)) { kunmap_local(su); return; } suclean = nilfs_segment_usage_clean(su); nilfs_segment_usage_set_error(su); kunmap_local(su); if (suclean) { nilfs_sufile_mod_counter(header_bh, -1, 0); NILFS_SUI(sufile)->ncleansegs--; } mark_buffer_dirty(su_bh); nilfs_mdt_mark_dirty(sufile); } /** * nilfs_sufile_truncate_range - truncate range of segment array * @sufile: inode of segment usage file * @start: start segment number (inclusive) * @end: end segment number (inclusive) * * Return: 0 on success, or one of the following negative error codes on * failure: * * %-EBUSY - Dirty or active segments are present in the range. * * %-EINVAL - Invalid number of segments specified. * * %-EIO - I/O error (including metadata corruption). * * %-ENOMEM - Insufficient memory available. */ static int nilfs_sufile_truncate_range(struct inode *sufile, __u64 start, __u64 end) { struct the_nilfs *nilfs = sufile->i_sb->s_fs_info; struct buffer_head *header_bh; struct buffer_head *su_bh; struct nilfs_segment_usage *su, *su2; size_t susz = NILFS_MDT(sufile)->mi_entry_size; unsigned long segusages_per_block; unsigned long nsegs, ncleaned; __u64 segnum; size_t offset; ssize_t n, nc; int ret; int j; nsegs = nilfs_sufile_get_nsegments(sufile); ret = -EINVAL; if (start > end || start >= nsegs) goto out; ret = nilfs_sufile_get_header_block(sufile, &header_bh); if (ret < 0) goto out; segusages_per_block = nilfs_sufile_segment_usages_per_block(sufile); ncleaned = 0; for (segnum = start; segnum <= end; segnum += n) { n = min_t(unsigned long, segusages_per_block - nilfs_sufile_get_offset(sufile, segnum), end - segnum + 1); ret = nilfs_sufile_get_segment_usage_block(sufile, segnum, 0, &su_bh); if (ret < 0) { if (ret != -ENOENT) goto out_header; /* hole */ continue; } offset = nilfs_sufile_segment_usage_offset(sufile, segnum, su_bh); su = kmap_local_folio(su_bh->b_folio, offset); su2 = su; for (j = 0; j < n; j++, su = (void *)su + susz) { if ((le32_to_cpu(su->su_flags) & ~BIT(NILFS_SEGMENT_USAGE_ERROR)) || nilfs_segment_is_active(nilfs, segnum + j)) { ret = -EBUSY; kunmap_local(su2); brelse(su_bh); goto out_header; } } nc = 0; for (su = su2, j = 0; j < n; j++, su = (void *)su + susz) { if (nilfs_segment_usage_error(su)) { nilfs_segment_usage_set_clean(su); nc++; } } kunmap_local(su2); if (nc > 0) { mark_buffer_dirty(su_bh); ncleaned += nc; } brelse(su_bh); if (n == segusages_per_block) { /* make hole */ nilfs_sufile_delete_segment_usage_block(sufile, segnum); } } ret = 0; out_header: if (ncleaned > 0) { NILFS_SUI(sufile)->ncleansegs += ncleaned; nilfs_sufile_mod_counter(header_bh, ncleaned, 0); nilfs_mdt_mark_dirty(sufile); } brelse(header_bh); out: return ret; } /** * nilfs_sufile_resize - resize segment array * @sufile: inode of segment usage file * @newnsegs: new number of segments * * Return: 0 on success, or one of the following negative error codes on * failure: * * %-EBUSY - Dirty or active segments exist in the region to be truncated. * * %-EIO - I/O error (including metadata corruption). * * %-ENOMEM - Insufficient memory available. * * %-ENOSPC - Enough free space is not left for shrinking. */ int nilfs_sufile_resize(struct inode *sufile, __u64 newnsegs) { struct the_nilfs *nilfs = sufile->i_sb->s_fs_info; struct buffer_head *header_bh; struct nilfs_sufile_header *header; struct nilfs_sufile_info *sui = NILFS_SUI(sufile); unsigned long nsegs, nrsvsegs; int ret = 0; down_write(&NILFS_MDT(sufile)->mi_sem); nsegs = nilfs_sufile_get_nsegments(sufile); if (nsegs == newnsegs) goto out; ret = -ENOSPC; nrsvsegs = nilfs_nrsvsegs(nilfs, newnsegs); if (newnsegs < nsegs && nsegs - newnsegs + nrsvsegs > sui->ncleansegs) goto out; ret = nilfs_sufile_get_header_block(sufile, &header_bh); if (ret < 0) goto out; if (newnsegs > nsegs) { sui->ncleansegs += newnsegs - nsegs; } else /* newnsegs < nsegs */ { ret = nilfs_sufile_truncate_range(sufile, newnsegs, nsegs - 1); if (ret < 0) goto out_header; sui->ncleansegs -= nsegs - newnsegs; /* * If the sufile is successfully truncated, immediately adjust * the segment allocation space while locking the semaphore * "mi_sem" so that nilfs_sufile_alloc() never allocates * segments in the truncated space. */ sui->allocmax = newnsegs - 1; sui->allocmin = 0; } header = kmap_local_folio(header_bh->b_folio, 0); header->sh_ncleansegs = cpu_to_le64(sui->ncleansegs); kunmap_local(header); mark_buffer_dirty(header_bh); nilfs_mdt_mark_dirty(sufile); nilfs_set_nsegments(nilfs, newnsegs); out_header: brelse(header_bh); out: up_write(&NILFS_MDT(sufile)->mi_sem); return ret; } /** * nilfs_sufile_get_suinfo - get segment usage information * @sufile: inode of segment usage file * @segnum: segment number to start looking * @buf: array of suinfo * @sisz: byte size of suinfo * @nsi: size of suinfo array * * Return: Count of segment usage info items stored in the output buffer on * success, or one of the following negative error codes on failure: * * %-EIO - I/O error (including metadata corruption). * * %-ENOMEM - Insufficient memory available. */ ssize_t nilfs_sufile_get_suinfo(struct inode *sufile, __u64 segnum, void *buf, unsigned int sisz, size_t nsi) { struct buffer_head *su_bh; struct nilfs_segment_usage *su; struct nilfs_suinfo *si = buf; size_t susz = NILFS_MDT(sufile)->mi_entry_size; struct the_nilfs *nilfs = sufile->i_sb->s_fs_info; size_t offset; void *kaddr; unsigned long nsegs, segusages_per_block; ssize_t n; int ret, i, j; down_read(&NILFS_MDT(sufile)->mi_sem); segusages_per_block = nilfs_sufile_segment_usages_per_block(sufile); nsegs = min_t(unsigned long, nilfs_sufile_get_nsegments(sufile) - segnum, nsi); for (i = 0; i < nsegs; i += n, segnum += n) { n = min_t(unsigned long, segusages_per_block - nilfs_sufile_get_offset(sufile, segnum), nsegs - i); ret = nilfs_sufile_get_segment_usage_block(sufile, segnum, 0, &su_bh); if (ret < 0) { if (ret != -ENOENT) goto out; /* hole */ memset(si, 0, sisz * n); si = (void *)si + sisz * n; continue; } offset = nilfs_sufile_segment_usage_offset(sufile, segnum, su_bh); su = kaddr = kmap_local_folio(su_bh->b_folio, offset); for (j = 0; j < n; j++, su = (void *)su + susz, si = (void *)si + sisz) { si->sui_lastmod = le64_to_cpu(su->su_lastmod); si->sui_nblocks = le32_to_cpu(su->su_nblocks); si->sui_flags = le32_to_cpu(su->su_flags) & ~BIT(NILFS_SEGMENT_USAGE_ACTIVE); if (nilfs_segment_is_active(nilfs, segnum + j)) si->sui_flags |= BIT(NILFS_SEGMENT_USAGE_ACTIVE); } kunmap_local(kaddr); brelse(su_bh); } ret = nsegs; out: up_read(&NILFS_MDT(sufile)->mi_sem); return ret; } /** * nilfs_sufile_set_suinfo - sets segment usage info * @sufile: inode of segment usage file * @buf: array of suinfo_update * @supsz: byte size of suinfo_update * @nsup: size of suinfo_update array * * Description: Takes an array of nilfs_suinfo_update structs and updates * segment usage accordingly. Only the fields indicated by the sup_flags * are updated. * * Return: 0 on success, or one of the following negative error codes on * failure: * * %-EINVAL - Invalid values in input (segment number, flags or nblocks). * * %-EIO - I/O error (including metadata corruption). * * %-ENOMEM - Insufficient memory available. */ ssize_t nilfs_sufile_set_suinfo(struct inode *sufile, void *buf, unsigned int supsz, size_t nsup) { struct the_nilfs *nilfs = sufile->i_sb->s_fs_info; struct buffer_head *header_bh, *bh; struct nilfs_suinfo_update *sup, *supend = buf + supsz * nsup; struct nilfs_segment_usage *su; size_t offset; unsigned long blkoff, prev_blkoff; int cleansi, cleansu, dirtysi, dirtysu; long ncleaned = 0, ndirtied = 0; int ret = 0; if (unlikely(nsup == 0)) return ret; for (sup = buf; sup < supend; sup = (void *)sup + supsz) { if (sup->sup_segnum >= nilfs->ns_nsegments || (sup->sup_flags & (~0UL << __NR_NILFS_SUINFO_UPDATE_FIELDS)) || (nilfs_suinfo_update_nblocks(sup) && sup->sup_sui.sui_nblocks > nilfs->ns_blocks_per_segment)) return -EINVAL; } down_write(&NILFS_MDT(sufile)->mi_sem); ret = nilfs_sufile_get_header_block(sufile, &header_bh); if (ret < 0) goto out_sem; sup = buf; blkoff = nilfs_sufile_get_blkoff(sufile, sup->sup_segnum); ret = nilfs_mdt_get_block(sufile, blkoff, 1, NULL, &bh); if (ret < 0) goto out_header; for (;;) { offset = nilfs_sufile_segment_usage_offset( sufile, sup->sup_segnum, bh); su = kmap_local_folio(bh->b_folio, offset); if (nilfs_suinfo_update_lastmod(sup)) su->su_lastmod = cpu_to_le64(sup->sup_sui.sui_lastmod); if (nilfs_suinfo_update_nblocks(sup)) su->su_nblocks = cpu_to_le32(sup->sup_sui.sui_nblocks); if (nilfs_suinfo_update_flags(sup)) { /* * Active flag is a virtual flag projected by running * nilfs kernel code - drop it not to write it to * disk. */ sup->sup_sui.sui_flags &= ~BIT(NILFS_SEGMENT_USAGE_ACTIVE); cleansi = nilfs_suinfo_clean(&sup->sup_sui); cleansu = nilfs_segment_usage_clean(su); dirtysi = nilfs_suinfo_dirty(&sup->sup_sui); dirtysu = nilfs_segment_usage_dirty(su); if (cleansi && !cleansu) ++ncleaned; else if (!cleansi && cleansu) --ncleaned; if (dirtysi && !dirtysu) ++ndirtied; else if (!dirtysi && dirtysu) --ndirtied; su->su_flags = cpu_to_le32(sup->sup_sui.sui_flags); } kunmap_local(su); sup = (void *)sup + supsz; if (sup >= supend) break; prev_blkoff = blkoff; blkoff = nilfs_sufile_get_blkoff(sufile, sup->sup_segnum); if (blkoff == prev_blkoff) continue; /* get different block */ mark_buffer_dirty(bh); put_bh(bh); ret = nilfs_mdt_get_block(sufile, blkoff, 1, NULL, &bh); if (unlikely(ret < 0)) goto out_mark; } mark_buffer_dirty(bh); put_bh(bh); out_mark: if (ncleaned || ndirtied) { nilfs_sufile_mod_counter(header_bh, (u64)ncleaned, (u64)ndirtied); NILFS_SUI(sufile)->ncleansegs += ncleaned; } nilfs_mdt_mark_dirty(sufile); out_header: put_bh(header_bh); out_sem: up_write(&NILFS_MDT(sufile)->mi_sem); return ret; } /** * nilfs_sufile_trim_fs() - trim ioctl handle function * @sufile: inode of segment usage file * @range: fstrim_range structure * * start: First Byte to trim * len: number of Bytes to trim from start * minlen: minimum extent length in Bytes * * Decription: nilfs_sufile_trim_fs goes through all segments containing bytes * from start to start+len. start is rounded up to the next block boundary * and start+len is rounded down. For each clean segment blkdev_issue_discard * function is invoked. * * Return: 0 on success, or a negative error code on failure. */ int nilfs_sufile_trim_fs(struct inode *sufile, struct fstrim_range *range) { struct the_nilfs *nilfs = sufile->i_sb->s_fs_info; struct buffer_head *su_bh; struct nilfs_segment_usage *su; size_t offset; void *kaddr; size_t n, i, susz = NILFS_MDT(sufile)->mi_entry_size; sector_t seg_start, seg_end, start_block, end_block; sector_t start = 0, nblocks = 0; u64 segnum, segnum_end, minlen, len, max_blocks, ndiscarded = 0; int ret = 0; unsigned int sects_per_block; sects_per_block = (1 << nilfs->ns_blocksize_bits) / bdev_logical_block_size(nilfs->ns_bdev); len = range->len >> nilfs->ns_blocksize_bits; minlen = range->minlen >> nilfs->ns_blocksize_bits; max_blocks = ((u64)nilfs->ns_nsegments * nilfs->ns_blocks_per_segment); if (!len || range->start >= max_blocks << nilfs->ns_blocksize_bits) return -EINVAL; start_block = (range->start + nilfs->ns_blocksize - 1) >> nilfs->ns_blocksize_bits; /* * range->len can be very large (actually, it is set to * ULLONG_MAX by default) - truncate upper end of the range * carefully so as not to overflow. */ if (max_blocks - start_block < len) end_block = max_blocks - 1; else end_block = start_block + len - 1; segnum = nilfs_get_segnum_of_block(nilfs, start_block); segnum_end = nilfs_get_segnum_of_block(nilfs, end_block); down_read(&NILFS_MDT(sufile)->mi_sem); while (segnum <= segnum_end) { n = nilfs_sufile_segment_usages_in_block(sufile, segnum, segnum_end); ret = nilfs_sufile_get_segment_usage_block(sufile, segnum, 0, &su_bh); if (ret < 0) { if (ret != -ENOENT) goto out_sem; /* hole */ segnum += n; continue; } offset = nilfs_sufile_segment_usage_offset(sufile, segnum, su_bh); su = kaddr = kmap_local_folio(su_bh->b_folio, offset); for (i = 0; i < n; ++i, ++segnum, su = (void *)su + susz) { if (!nilfs_segment_usage_clean(su)) continue; nilfs_get_segment_range(nilfs, segnum, &seg_start, &seg_end); if (!nblocks) { /* start new extent */ start = seg_start; nblocks = seg_end - seg_start + 1; continue; } if (start + nblocks == seg_start) { /* add to previous extent */ nblocks += seg_end - seg_start + 1; continue; } /* discard previous extent */ if (start < start_block) { nblocks -= start_block - start; start = start_block; } if (nblocks >= minlen) { kunmap_local(kaddr); ret = blkdev_issue_discard(nilfs->ns_bdev, start * sects_per_block, nblocks * sects_per_block, GFP_NOFS); if (ret < 0) { put_bh(su_bh); goto out_sem; } ndiscarded += nblocks; offset = nilfs_sufile_segment_usage_offset( sufile, segnum, su_bh); su = kaddr = kmap_local_folio(su_bh->b_folio, offset); } /* start new extent */ start = seg_start; nblocks = seg_end - seg_start + 1; } kunmap_local(kaddr); put_bh(su_bh); } if (nblocks) { /* discard last extent */ if (start < start_block) { nblocks -= start_block - start; start = start_block; } if (start + nblocks > end_block + 1) nblocks = end_block - start + 1; if (nblocks >= minlen) { ret = blkdev_issue_discard(nilfs->ns_bdev, start * sects_per_block, nblocks * sects_per_block, GFP_NOFS); if (!ret) ndiscarded += nblocks; } } out_sem: up_read(&NILFS_MDT(sufile)->mi_sem); range->len = ndiscarded << nilfs->ns_blocksize_bits; return ret; } /** * nilfs_sufile_read - read or get sufile inode * @sb: super block instance * @susize: size of a segment usage entry * @raw_inode: on-disk sufile inode * @inodep: buffer to store the inode * * Return: 0 on success, or a negative error code on failure. */ int nilfs_sufile_read(struct super_block *sb, size_t susize, struct nilfs_inode *raw_inode, struct inode **inodep) { struct inode *sufile; struct nilfs_sufile_info *sui; struct buffer_head *header_bh; struct nilfs_sufile_header *header; int err; if (susize > sb->s_blocksize) { nilfs_err(sb, "too large segment usage size: %zu bytes", susize); return -EINVAL; } else if (susize < NILFS_MIN_SEGMENT_USAGE_SIZE) { nilfs_err(sb, "too small segment usage size: %zu bytes", susize); return -EINVAL; } sufile = nilfs_iget_locked(sb, NULL, NILFS_SUFILE_INO); if (unlikely(!sufile)) return -ENOMEM; if (!(sufile->i_state & I_NEW)) goto out; err = nilfs_mdt_init(sufile, NILFS_MDT_GFP, sizeof(*sui)); if (err) goto failed; nilfs_mdt_set_entry_size(sufile, susize, sizeof(struct nilfs_sufile_header)); err = nilfs_read_inode_common(sufile, raw_inode); if (err) goto failed; err = nilfs_mdt_get_block(sufile, 0, 0, NULL, &header_bh); if (unlikely(err)) { if (err == -ENOENT) { nilfs_err(sb, "missing header block in segment usage metadata"); err = -EINVAL; } goto failed; } sui = NILFS_SUI(sufile); header = kmap_local_folio(header_bh->b_folio, 0); sui->ncleansegs = le64_to_cpu(header->sh_ncleansegs); kunmap_local(header); brelse(header_bh); sui->allocmax = nilfs_sufile_get_nsegments(sufile) - 1; sui->allocmin = 0; unlock_new_inode(sufile); out: *inodep = sufile; return 0; failed: iget_failed(sufile); return err; }
32 30 1 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 // SPDX-License-Identifier: GPL-2.0 #include <linux/net.h> #include <linux/uio.h> #include <net/sock.h> #include <linux/nospec.h> #include "rsrc.h" #define IO_NOTIF_UBUF_FLAGS (SKBFL_ZEROCOPY_FRAG | SKBFL_DONT_ORPHAN) #define IO_NOTIF_SPLICE_BATCH 32 struct io_notif_data { struct file *file; struct ubuf_info uarg; struct io_notif_data *next; struct io_notif_data *head; unsigned account_pages; bool zc_report; bool zc_used; bool zc_copied; }; struct io_kiocb *io_alloc_notif(struct io_ring_ctx *ctx); void io_tx_ubuf_complete(struct sk_buff *skb, struct ubuf_info *uarg, bool success); static inline struct io_notif_data *io_notif_to_data(struct io_kiocb *notif) { return io_kiocb_to_cmd(notif, struct io_notif_data); } static inline void io_notif_flush(struct io_kiocb *notif) __must_hold(&notif->ctx->uring_lock) { struct io_notif_data *nd = io_notif_to_data(notif); io_tx_ubuf_complete(NULL, &nd->uarg, true); } static inline int io_notif_account_mem(struct io_kiocb *notif, unsigned len) { struct io_ring_ctx *ctx = notif->ctx; struct io_notif_data *nd = io_notif_to_data(notif); unsigned nr_pages = (len >> PAGE_SHIFT) + 2; int ret; if (ctx->user) { ret = __io_account_mem(ctx->user, nr_pages); if (ret) return ret; nd->account_pages += nr_pages; } return 0; }
252 252 248 21 84 231 230 231 23 231 231 46 117 31 149 20 20 10 5 9 164 163 130 35 164 110 151 164 163 31 163 5 5 60 105 30 104 372 82 82 82 160 303 107 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 // SPDX-License-Identifier: GPL-2.0 /* * Tty buffer allocation management */ #include <linux/types.h> #include <linux/errno.h> #include <linux/minmax.h> #include <linux/tty.h> #include <linux/tty_buffer.h> #include <linux/tty_driver.h> #include <linux/tty_flip.h> #include <linux/timer.h> #include <linux/string.h> #include <linux/slab.h> #include <linux/sched.h> #include <linux/wait.h> #include <linux/bitops.h> #include <linux/delay.h> #include <linux/module.h> #include <linux/ratelimit.h> #include "tty.h" #define MIN_TTYB_SIZE 256 #define TTYB_ALIGN_MASK 0xff /* * Byte threshold to limit memory consumption for flip buffers. * The actual memory limit is > 2x this amount. */ #define TTYB_DEFAULT_MEM_LIMIT (640 * 1024UL) /* * We default to dicing tty buffer allocations to this many characters * in order to avoid multiple page allocations. We know the size of * tty_buffer itself but it must also be taken into account that the * buffer is 256 byte aligned. See tty_buffer_find for the allocation * logic this must match. */ #define TTY_BUFFER_PAGE (((PAGE_SIZE - sizeof(struct tty_buffer)) / 2) & ~TTYB_ALIGN_MASK) /** * tty_buffer_lock_exclusive - gain exclusive access to buffer * @port: tty port owning the flip buffer * * Guarantees safe use of the &tty_ldisc_ops.receive_buf() method by excluding * the buffer work and any pending flush from using the flip buffer. Data can * continue to be added concurrently to the flip buffer from the driver side. * * See also tty_buffer_unlock_exclusive(). */ void tty_buffer_lock_exclusive(struct tty_port *port) { struct tty_bufhead *buf = &port->buf; atomic_inc(&buf->priority); mutex_lock(&buf->lock); } EXPORT_SYMBOL_GPL(tty_buffer_lock_exclusive); /** * tty_buffer_unlock_exclusive - release exclusive access * @port: tty port owning the flip buffer * * The buffer work is restarted if there is data in the flip buffer. * * See also tty_buffer_lock_exclusive(). */ void tty_buffer_unlock_exclusive(struct tty_port *port) { struct tty_bufhead *buf = &port->buf; bool restart = buf->head->commit != buf->head->read; atomic_dec(&buf->priority); mutex_unlock(&buf->lock); if (restart) queue_work(system_unbound_wq, &buf->work); } EXPORT_SYMBOL_GPL(tty_buffer_unlock_exclusive); /** * tty_buffer_space_avail - return unused buffer space * @port: tty port owning the flip buffer * * Returns: the # of bytes which can be written by the driver without reaching * the buffer limit. * * Note: this does not guarantee that memory is available to write the returned * # of bytes (use tty_prepare_flip_string() to pre-allocate if memory * guarantee is required). */ unsigned int tty_buffer_space_avail(struct tty_port *port) { int space = port->buf.mem_limit - atomic_read(&port->buf.mem_used); return max(space, 0); } EXPORT_SYMBOL_GPL(tty_buffer_space_avail); static void tty_buffer_reset(struct tty_buffer *p, size_t size) { p->used = 0; p->size = size; p->next = NULL; p->commit = 0; p->lookahead = 0; p->read = 0; p->flags = true; } /** * tty_buffer_free_all - free buffers used by a tty * @port: tty port to free from * * Remove all the buffers pending on a tty whether queued with data or in the * free ring. Must be called when the tty is no longer in use. */ void tty_buffer_free_all(struct tty_port *port) { struct tty_bufhead *buf = &port->buf; struct tty_buffer *p, *next; struct llist_node *llist; unsigned int freed = 0; int still_used; while ((p = buf->head) != NULL) { buf->head = p->next; freed += p->size; if (p->size > 0) kfree(p); } llist = llist_del_all(&buf->free); llist_for_each_entry_safe(p, next, llist, free) kfree(p); tty_buffer_reset(&buf->sentinel, 0); buf->head = &buf->sentinel; buf->tail = &buf->sentinel; still_used = atomic_xchg(&buf->mem_used, 0); WARN(still_used != freed, "we still have not freed %d bytes!", still_used - freed); } /** * tty_buffer_alloc - allocate a tty buffer * @port: tty port * @size: desired size (characters) * * Allocate a new tty buffer to hold the desired number of characters. We * round our buffers off in 256 character chunks to get better allocation * behaviour. * * Returns: %NULL if out of memory or the allocation would exceed the per * device queue. */ static struct tty_buffer *tty_buffer_alloc(struct tty_port *port, size_t size) { struct llist_node *free; struct tty_buffer *p; /* Round the buffer size out */ size = __ALIGN_MASK(size, TTYB_ALIGN_MASK); if (size <= MIN_TTYB_SIZE) { free = llist_del_first(&port->buf.free); if (free) { p = llist_entry(free, struct tty_buffer, free); goto found; } } /* Should possibly check if this fails for the largest buffer we * have queued and recycle that ? */ if (atomic_read(&port->buf.mem_used) > port->buf.mem_limit) return NULL; p = kmalloc(struct_size(p, data, 2 * size), GFP_ATOMIC | __GFP_NOWARN); if (p == NULL) return NULL; found: tty_buffer_reset(p, size); atomic_add(size, &port->buf.mem_used); return p; } /** * tty_buffer_free - free a tty buffer * @port: tty port owning the buffer * @b: the buffer to free * * Free a tty buffer, or add it to the free list according to our internal * strategy. */ static void tty_buffer_free(struct tty_port *port, struct tty_buffer *b) { struct tty_bufhead *buf = &port->buf; /* Dumb strategy for now - should keep some stats */ WARN_ON(atomic_sub_return(b->size, &buf->mem_used) < 0); if (b->size > MIN_TTYB_SIZE) kfree(b); else if (b->size > 0) llist_add(&b->free, &buf->free); } /** * tty_buffer_flush - flush full tty buffers * @tty: tty to flush * @ld: optional ldisc ptr (must be referenced) * * Flush all the buffers containing receive data. If @ld != %NULL, flush the * ldisc input buffer. * * Locking: takes buffer lock to ensure single-threaded flip buffer 'consumer'. */ void tty_buffer_flush(struct tty_struct *tty, struct tty_ldisc *ld) { struct tty_port *port = tty->port; struct tty_bufhead *buf = &port->buf; struct tty_buffer *next; atomic_inc(&buf->priority); mutex_lock(&buf->lock); /* paired w/ release in __tty_buffer_request_room; ensures there are * no pending memory accesses to the freed buffer */ while ((next = smp_load_acquire(&buf->head->next)) != NULL) { tty_buffer_free(port, buf->head); buf->head = next; } buf->head->read = buf->head->commit; buf->head->lookahead = buf->head->read; if (ld && ld->ops->flush_buffer) ld->ops->flush_buffer(tty); atomic_dec(&buf->priority); mutex_unlock(&buf->lock); } /** * __tty_buffer_request_room - grow tty buffer if needed * @port: tty port * @size: size desired * @flags: buffer has to store flags along character data * * Make at least @size bytes of linear space available for the tty buffer. * * Will change over to a new buffer if the current buffer is encoded as * %TTY_NORMAL (so has no flags buffer) and the new buffer requires a flags * buffer. * * Returns: the size we managed to find. */ static int __tty_buffer_request_room(struct tty_port *port, size_t size, bool flags) { struct tty_bufhead *buf = &port->buf; struct tty_buffer *n, *b = buf->tail; size_t left = (b->flags ? 1 : 2) * b->size - b->used; bool change = !b->flags && flags; if (!change && left >= size) return size; /* This is the slow path - looking for new buffers to use */ n = tty_buffer_alloc(port, size); if (n == NULL) return change ? 0 : left; n->flags = flags; buf->tail = n; /* * Paired w/ acquire in flush_to_ldisc() and lookahead_bufs() * ensures they see all buffer data. */ smp_store_release(&b->commit, b->used); /* * Paired w/ acquire in flush_to_ldisc() and lookahead_bufs() * ensures the latest commit value can be read before the head * is advanced to the next buffer. */ smp_store_release(&b->next, n); return size; } int tty_buffer_request_room(struct tty_port *port, size_t size) { return __tty_buffer_request_room(port, size, true); } EXPORT_SYMBOL_GPL(tty_buffer_request_room); size_t __tty_insert_flip_string_flags(struct tty_port *port, const u8 *chars, const u8 *flags, bool mutable_flags, size_t size) { bool need_flags = mutable_flags || flags[0] != TTY_NORMAL; size_t copied = 0; do { size_t goal = min_t(size_t, size - copied, TTY_BUFFER_PAGE); size_t space = __tty_buffer_request_room(port, goal, need_flags); struct tty_buffer *tb = port->buf.tail; if (unlikely(space == 0)) break; memcpy(char_buf_ptr(tb, tb->used), chars, space); if (mutable_flags) { memcpy(flag_buf_ptr(tb, tb->used), flags, space); flags += space; } else if (tb->flags) { memset(flag_buf_ptr(tb, tb->used), flags[0], space); } else { /* tb->flags should be available once requested */ WARN_ON_ONCE(need_flags); } tb->used += space; copied += space; chars += space; /* There is a small chance that we need to split the data over * several buffers. If this is the case we must loop. */ } while (unlikely(size > copied)); return copied; } EXPORT_SYMBOL(__tty_insert_flip_string_flags); /** * tty_prepare_flip_string - make room for characters * @port: tty port * @chars: return pointer for character write area * @size: desired size * * Prepare a block of space in the buffer for data. * * This is used for drivers that need their own block copy routines into the * buffer. There is no guarantee the buffer is a DMA target! * * Returns: the length available and buffer pointer (@chars) to the space which * is now allocated and accounted for as ready for normal characters. */ size_t tty_prepare_flip_string(struct tty_port *port, u8 **chars, size_t size) { size_t space = __tty_buffer_request_room(port, size, false); if (likely(space)) { struct tty_buffer *tb = port->buf.tail; *chars = char_buf_ptr(tb, tb->used); if (tb->flags) memset(flag_buf_ptr(tb, tb->used), TTY_NORMAL, space); tb->used += space; } return space; } EXPORT_SYMBOL_GPL(tty_prepare_flip_string); /** * tty_ldisc_receive_buf - forward data to line discipline * @ld: line discipline to process input * @p: char buffer * @f: %TTY_NORMAL, %TTY_BREAK, etc. flags buffer * @count: number of bytes to process * * Callers other than flush_to_ldisc() need to exclude the kworker from * concurrent use of the line discipline, see paste_selection(). * * Returns: the number of bytes processed. */ size_t tty_ldisc_receive_buf(struct tty_ldisc *ld, const u8 *p, const u8 *f, size_t count) { if (ld->ops->receive_buf2) count = ld->ops->receive_buf2(ld->tty, p, f, count); else { count = min_t(size_t, count, ld->tty->receive_room); if (count && ld->ops->receive_buf) ld->ops->receive_buf(ld->tty, p, f, count); } return count; } EXPORT_SYMBOL_GPL(tty_ldisc_receive_buf); static void lookahead_bufs(struct tty_port *port, struct tty_buffer *head) { head->lookahead = max(head->lookahead, head->read); while (head) { struct tty_buffer *next; unsigned int count; /* * Paired w/ release in __tty_buffer_request_room(); * ensures commit value read is not stale if the head * is advancing to the next buffer. */ next = smp_load_acquire(&head->next); /* * Paired w/ release in __tty_buffer_request_room() or in * tty_buffer_flush(); ensures we see the committed buffer data. */ count = smp_load_acquire(&head->commit) - head->lookahead; if (!count) { head = next; continue; } if (port->client_ops->lookahead_buf) { u8 *p, *f = NULL; p = char_buf_ptr(head, head->lookahead); if (head->flags) f = flag_buf_ptr(head, head->lookahead); port->client_ops->lookahead_buf(port, p, f, count); } head->lookahead += count; } } static size_t receive_buf(struct tty_port *port, struct tty_buffer *head, size_t count) { u8 *p = char_buf_ptr(head, head->read); const u8 *f = NULL; size_t n; if (head->flags) f = flag_buf_ptr(head, head->read); n = port->client_ops->receive_buf(port, p, f, count); if (n > 0) memset(p, 0, n); return n; } /** * flush_to_ldisc - flush data from buffer to ldisc * @work: tty structure passed from work queue. * * This routine is called out of the software interrupt to flush data from the * buffer chain to the line discipline. * * The receive_buf() method is single threaded for each tty instance. * * Locking: takes buffer lock to ensure single-threaded flip buffer 'consumer'. */ static void flush_to_ldisc(struct work_struct *work) { struct tty_port *port = container_of(work, struct tty_port, buf.work); struct tty_bufhead *buf = &port->buf; mutex_lock(&buf->lock); while (1) { struct tty_buffer *head = buf->head; struct tty_buffer *next; size_t count, rcvd; /* Ldisc or user is trying to gain exclusive access */ if (atomic_read(&buf->priority)) break; /* paired w/ release in __tty_buffer_request_room(); * ensures commit value read is not stale if the head * is advancing to the next buffer */ next = smp_load_acquire(&head->next); /* paired w/ release in __tty_buffer_request_room() or in * tty_buffer_flush(); ensures we see the committed buffer data */ count = smp_load_acquire(&head->commit) - head->read; if (!count) { if (next == NULL) break; buf->head = next; tty_buffer_free(port, head); continue; } rcvd = receive_buf(port, head, count); head->read += rcvd; if (rcvd < count) lookahead_bufs(port, head); if (!rcvd) break; cond_resched(); } mutex_unlock(&buf->lock); } static inline void tty_flip_buffer_commit(struct tty_buffer *tail) { /* * Paired w/ acquire in flush_to_ldisc(); ensures flush_to_ldisc() sees * buffer data. */ smp_store_release(&tail->commit, tail->used); } /** * tty_flip_buffer_push - push terminal buffers * @port: tty port to push * * Queue a push of the terminal flip buffers to the line discipline. Can be * called from IRQ/atomic context. * * In the event of the queue being busy for flipping the work will be held off * and retried later. */ void tty_flip_buffer_push(struct tty_port *port) { struct tty_bufhead *buf = &port->buf; tty_flip_buffer_commit(buf->tail); queue_work(system_unbound_wq, &buf->work); } EXPORT_SYMBOL(tty_flip_buffer_push); /** * tty_insert_flip_string_and_push_buffer - add characters to the tty buffer and * push * @port: tty port * @chars: characters * @size: size * * The function combines tty_insert_flip_string() and tty_flip_buffer_push() * with the exception of properly holding the @port->lock. * * To be used only internally (by pty currently). * * Returns: the number added. */ int tty_insert_flip_string_and_push_buffer(struct tty_port *port, const u8 *chars, size_t size) { struct tty_bufhead *buf = &port->buf; unsigned long flags; spin_lock_irqsave(&port->lock, flags); size = tty_insert_flip_string(port, chars, size); if (size) tty_flip_buffer_commit(buf->tail); spin_unlock_irqrestore(&port->lock, flags); queue_work(system_unbound_wq, &buf->work); return size; } /** * tty_buffer_init - prepare a tty buffer structure * @port: tty port to initialise * * Set up the initial state of the buffer management for a tty device. Must be * called before the other tty buffer functions are used. */ void tty_buffer_init(struct tty_port *port) { struct tty_bufhead *buf = &port->buf; mutex_init(&buf->lock); tty_buffer_reset(&buf->sentinel, 0); buf->head = &buf->sentinel; buf->tail = &buf->sentinel; init_llist_head(&buf->free); atomic_set(&buf->mem_used, 0); atomic_set(&buf->priority, 0); INIT_WORK(&buf->work, flush_to_ldisc); buf->mem_limit = TTYB_DEFAULT_MEM_LIMIT; } /** * tty_buffer_set_limit - change the tty buffer memory limit * @port: tty port to change * @limit: memory limit to set * * Change the tty buffer memory limit. * * Must be called before the other tty buffer functions are used. */ int tty_buffer_set_limit(struct tty_port *port, int limit) { if (limit < MIN_TTYB_SIZE) return -EINVAL; port->buf.mem_limit = limit; return 0; } EXPORT_SYMBOL_GPL(tty_buffer_set_limit); /* slave ptys can claim nested buffer lock when handling BRK and INTR */ void tty_buffer_set_lock_subclass(struct tty_port *port) { lockdep_set_subclass(&port->buf.lock, TTY_LOCK_SLAVE); } bool tty_buffer_restart_work(struct tty_port *port) { return queue_work(system_unbound_wq, &port->buf.work); } bool tty_buffer_cancel_work(struct tty_port *port) { return cancel_work_sync(&port->buf.work); } void tty_buffer_flush_work(struct tty_port *port) { flush_work(&port->buf.work); }
7 7 6 1 7 1 1 1 1 7 4 3 2 1 2 1 1 2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 // SPDX-License-Identifier: GPL-2.0-or-later /* * USB HID quirks support for Linux * * Copyright (c) 1999 Andreas Gal * Copyright (c) 2000-2005 Vojtech Pavlik <vojtech@suse.cz> * Copyright (c) 2005 Michael Haboustak <mike-@cinci.rr.com> for Concept2, Inc * Copyright (c) 2006-2007 Jiri Kosina * Copyright (c) 2008 Jiri Slaby <jirislaby@gmail.com> * Copyright (c) 2019 Paul Pawlowski <paul@mrarm.io> * Copyright (c) 2023 Orlando Chamberlain <orlandoch.dev@gmail.com> * Copyright (c) 2024 Aditya Garg <gargaditya08@live.com> */ /* */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/device.h> #include <linux/hid.h> #include <linux/jiffies.h> #include <linux/module.h> #include <linux/slab.h> #include <linux/timer.h> #include <linux/string.h> #include <linux/leds.h> #include <dt-bindings/leds/common.h> #include "hid-ids.h" #define APPLE_RDESC_JIS BIT(0) /* BIT(1) reserved, was: APPLE_IGNORE_MOUSE */ #define APPLE_HAS_FN BIT(2) /* BIT(3) reserved, was: APPLE_HIDDEV */ #define APPLE_ISO_TILDE_QUIRK BIT(4) #define APPLE_MIGHTYMOUSE BIT(5) #define APPLE_INVERT_HWHEEL BIT(6) /* BIT(7) reserved, was: APPLE_IGNORE_HIDINPUT */ #define APPLE_NUMLOCK_EMULATION BIT(8) #define APPLE_RDESC_BATTERY BIT(9) #define APPLE_BACKLIGHT_CTL BIT(10) #define APPLE_IS_NON_APPLE BIT(11) #define APPLE_MAGIC_BACKLIGHT BIT(12) #define APPLE_DISABLE_FKEYS BIT(13) #define APPLE_FLAG_FKEY BIT(0) #define APPLE_FLAG_TB_FKEY BIT(1) #define HID_COUNTRY_INTERNATIONAL_ISO 13 #define APPLE_BATTERY_TIMEOUT_SEC 60 #define HID_USAGE_MAGIC_BL 0xff00000f #define APPLE_MAGIC_REPORT_ID_POWER 3 #define APPLE_MAGIC_REPORT_ID_BRIGHTNESS 1 static unsigned int fnmode = 3; module_param(fnmode, uint, 0644); MODULE_PARM_DESC(fnmode, "Mode of fn key on Apple keyboards (0 = disabled, " "1 = fkeyslast, 2 = fkeysfirst, [3] = auto, 4 = fkeysdisabled)"); static int iso_layout = -1; module_param(iso_layout, int, 0644); MODULE_PARM_DESC(iso_layout, "Swap the backtick/tilde and greater-than/less-than keys. " "([-1] = auto, 0 = disabled, 1 = enabled)"); static unsigned int swap_opt_cmd; module_param(swap_opt_cmd, uint, 0644); MODULE_PARM_DESC(swap_opt_cmd, "Swap the Option (\"Alt\") and Command (\"Flag\") keys. " "(For people who want to keep Windows PC keyboard muscle memory. " "[0] = as-is, Mac layout. 1 = swapped, Windows layout., 2 = swapped, Swap only left side)"); static unsigned int swap_ctrl_cmd; module_param(swap_ctrl_cmd, uint, 0644); MODULE_PARM_DESC(swap_ctrl_cmd, "Swap the Control (\"Ctrl\") and Command (\"Flag\") keys. " "(For people who are used to Mac shortcuts involving Command instead of Control. " "[0] = No change. 1 = Swapped.)"); static unsigned int swap_fn_leftctrl; module_param(swap_fn_leftctrl, uint, 0644); MODULE_PARM_DESC(swap_fn_leftctrl, "Swap the Fn and left Control keys. " "(For people who want to keep PC keyboard muscle memory. " "[0] = as-is, Mac layout, 1 = swapped, PC layout)"); struct apple_non_apple_keyboard { char *name; }; struct apple_sc_backlight { struct led_classdev cdev; struct hid_device *hdev; }; struct apple_backlight_config_report { u8 report_id; u8 version; u16 backlight_off, backlight_on_min, backlight_on_max; }; struct apple_backlight_set_report { u8 report_id; u8 version; u16 backlight; u16 rate; }; struct apple_magic_backlight { struct led_classdev cdev; struct hid_report *brightness; struct hid_report *power; }; struct apple_sc { struct hid_device *hdev; unsigned long quirks; unsigned int fn_on; unsigned int fn_found; DECLARE_BITMAP(pressed_numlock, KEY_CNT); struct timer_list battery_timer; struct apple_sc_backlight *backlight; }; struct apple_key_translation { u16 from; u16 to; unsigned long flags; }; static const struct apple_key_translation magic_keyboard_alu_fn_keys[] = { { KEY_BACKSPACE, KEY_DELETE }, { KEY_ENTER, KEY_INSERT }, { KEY_F1, KEY_BRIGHTNESSDOWN, APPLE_FLAG_FKEY }, { KEY_F2, KEY_BRIGHTNESSUP, APPLE_FLAG_FKEY }, { KEY_F3, KEY_SCALE, APPLE_FLAG_FKEY }, { KEY_F4, KEY_DASHBOARD, APPLE_FLAG_FKEY }, { KEY_F6, KEY_NUMLOCK, APPLE_FLAG_FKEY }, { KEY_F7, KEY_PREVIOUSSONG, APPLE_FLAG_FKEY }, { KEY_F8, KEY_PLAYPAUSE, APPLE_FLAG_FKEY }, { KEY_F9, KEY_NEXTSONG, APPLE_FLAG_FKEY }, { KEY_F10, KEY_MUTE, APPLE_FLAG_FKEY }, { KEY_F11, KEY_VOLUMEDOWN, APPLE_FLAG_FKEY }, { KEY_F12, KEY_VOLUMEUP, APPLE_FLAG_FKEY }, { KEY_UP, KEY_PAGEUP }, { KEY_DOWN, KEY_PAGEDOWN }, { KEY_LEFT, KEY_HOME }, { KEY_RIGHT, KEY_END }, { } }; static const struct apple_key_translation magic_keyboard_2015_fn_keys[] = { { KEY_BACKSPACE, KEY_DELETE }, { KEY_ENTER, KEY_INSERT }, { KEY_F1, KEY_BRIGHTNESSDOWN, APPLE_FLAG_FKEY }, { KEY_F2, KEY_BRIGHTNESSUP, APPLE_FLAG_FKEY }, { KEY_F3, KEY_SCALE, APPLE_FLAG_FKEY }, { KEY_F4, KEY_DASHBOARD, APPLE_FLAG_FKEY }, { KEY_F7, KEY_PREVIOUSSONG, APPLE_FLAG_FKEY }, { KEY_F8, KEY_PLAYPAUSE, APPLE_FLAG_FKEY }, { KEY_F9, KEY_NEXTSONG, APPLE_FLAG_FKEY }, { KEY_F10, KEY_MUTE, APPLE_FLAG_FKEY }, { KEY_F11, KEY_VOLUMEDOWN, APPLE_FLAG_FKEY }, { KEY_F12, KEY_VOLUMEUP, APPLE_FLAG_FKEY }, { KEY_UP, KEY_PAGEUP }, { KEY_DOWN, KEY_PAGEDOWN }, { KEY_LEFT, KEY_HOME }, { KEY_RIGHT, KEY_END }, { } }; static const struct apple_key_translation magic_keyboard_2021_and_2024_fn_keys[] = { { KEY_BACKSPACE, KEY_DELETE }, { KEY_ENTER, KEY_INSERT }, { KEY_F1, KEY_BRIGHTNESSDOWN, APPLE_FLAG_FKEY }, { KEY_F2, KEY_BRIGHTNESSUP, APPLE_FLAG_FKEY }, { KEY_F3, KEY_SCALE, APPLE_FLAG_FKEY }, { KEY_F4, KEY_SEARCH, APPLE_FLAG_FKEY }, { KEY_F5, KEY_MICMUTE, APPLE_FLAG_FKEY }, { KEY_F6, KEY_SLEEP, APPLE_FLAG_FKEY }, { KEY_F7, KEY_PREVIOUSSONG, APPLE_FLAG_FKEY }, { KEY_F8, KEY_PLAYPAUSE, APPLE_FLAG_FKEY }, { KEY_F9, KEY_NEXTSONG, APPLE_FLAG_FKEY }, { KEY_F10, KEY_MUTE, APPLE_FLAG_FKEY }, { KEY_F11, KEY_VOLUMEDOWN, APPLE_FLAG_FKEY }, { KEY_F12, KEY_VOLUMEUP, APPLE_FLAG_FKEY }, { KEY_UP, KEY_PAGEUP }, { KEY_DOWN, KEY_PAGEDOWN }, { KEY_LEFT, KEY_HOME }, { KEY_RIGHT, KEY_END }, { } }; static const struct apple_key_translation macbookair_fn_keys[] = { { KEY_BACKSPACE, KEY_DELETE }, { KEY_ENTER, KEY_INSERT }, { KEY_F1, KEY_BRIGHTNESSDOWN, APPLE_FLAG_FKEY }, { KEY_F2, KEY_BRIGHTNESSUP, APPLE_FLAG_FKEY }, { KEY_F3, KEY_SCALE, APPLE_FLAG_FKEY }, { KEY_F4, KEY_DASHBOARD, APPLE_FLAG_FKEY }, { KEY_F6, KEY_PREVIOUSSONG, APPLE_FLAG_FKEY }, { KEY_F7, KEY_PLAYPAUSE, APPLE_FLAG_FKEY }, { KEY_F8, KEY_NEXTSONG, APPLE_FLAG_FKEY }, { KEY_F9, KEY_MUTE, APPLE_FLAG_FKEY }, { KEY_F10, KEY_VOLUMEDOWN, APPLE_FLAG_FKEY }, { KEY_F11, KEY_VOLUMEUP, APPLE_FLAG_FKEY }, { KEY_F12, KEY_EJECTCD, APPLE_FLAG_FKEY }, { KEY_UP, KEY_PAGEUP }, { KEY_DOWN, KEY_PAGEDOWN }, { KEY_LEFT,