32 136 18 15 2 87 95 7 4 3 3 7 5 3 11 13 2 2 2 2 13 13 13 5 5 5 5 5 11 11 1 10 11 2 2 11 10 1 1 10 2 4 13 8 8 8 2 1 2 2 2 2 2 2 11 9 2 2 2 2 1 1 7 7 7 7 7 7 1 7 7 2 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 2 3 2 3 3 3 3 3 2 3 1 3 1 3 3 42 42 42 3 2 1 2 42 44 41 2 40 5 5 5 4 5 4 2 2 2 2 2 2 2 2 2 2 2 2 4 4 4 7 7 7 7 1 7 7 7 7 7 7 7 4 4 2 3 3 4 4 4 4 10 10 1 1 10 10 10 1 1 1 1 62 62 1 1 1 62 62 62 62 1 1 1 1 26 40 7 6 120 116 7 115 116 34 31 4 59 59 2 1 6 6 6 6 6 100 99 6 6 6 8 8 8 108 40 40 86 81 8 46 45 1 1 1 1 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 4716 4717 4718 4719 4720 4721 4722 4723 4724 4725 4726 4727 4728 4729 4730 4731 4732 4733 4734 | // SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2011 STRATO. All rights reserved. */ #include <linux/sched.h> #include <linux/pagemap.h> #include <linux/writeback.h> #include <linux/blkdev.h> #include <linux/rbtree.h> #include <linux/slab.h> #include <linux/workqueue.h> #include <linux/btrfs.h> #include <linux/sched/mm.h> #include "ctree.h" #include "transaction.h" #include "disk-io.h" #include "locking.h" #include "ulist.h" #include "backref.h" #include "extent_io.h" #include "qgroup.h" #include "block-group.h" #include "sysfs.h" #include "tree-mod-log.h" #include "fs.h" #include "accessors.h" #include "extent-tree.h" #include "root-tree.h" #include "tree-checker.h" enum btrfs_qgroup_mode btrfs_qgroup_mode(struct btrfs_fs_info *fs_info) { if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) return BTRFS_QGROUP_MODE_DISABLED; if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_SIMPLE_MODE) return BTRFS_QGROUP_MODE_SIMPLE; return BTRFS_QGROUP_MODE_FULL; } bool btrfs_qgroup_enabled(struct btrfs_fs_info *fs_info) { return btrfs_qgroup_mode(fs_info) != BTRFS_QGROUP_MODE_DISABLED; } bool btrfs_qgroup_full_accounting(struct btrfs_fs_info *fs_info) { return btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_FULL; } /* * Helpers to access qgroup reservation * * Callers should ensure the lock context and type are valid */ static u64 qgroup_rsv_total(const struct btrfs_qgroup *qgroup) { u64 ret = 0; int i; for (i = 0; i < BTRFS_QGROUP_RSV_LAST; i++) ret += qgroup->rsv.values[i]; return ret; } #ifdef CONFIG_BTRFS_DEBUG static const char *qgroup_rsv_type_str(enum btrfs_qgroup_rsv_type type) { if (type == BTRFS_QGROUP_RSV_DATA) return "data"; if (type == BTRFS_QGROUP_RSV_META_PERTRANS) return "meta_pertrans"; if (type == BTRFS_QGROUP_RSV_META_PREALLOC) return "meta_prealloc"; return NULL; } #endif static void qgroup_rsv_add(struct btrfs_fs_info *fs_info, struct btrfs_qgroup *qgroup, u64 num_bytes, enum btrfs_qgroup_rsv_type type) { trace_qgroup_update_reserve(fs_info, qgroup, num_bytes, type); qgroup->rsv.values[type] += num_bytes; } static void qgroup_rsv_release(struct btrfs_fs_info *fs_info, struct btrfs_qgroup *qgroup, u64 num_bytes, enum btrfs_qgroup_rsv_type type) { trace_qgroup_update_reserve(fs_info, qgroup, -(s64)num_bytes, type); if (qgroup->rsv.values[type] >= num_bytes) { qgroup->rsv.values[type] -= num_bytes; return; } #ifdef CONFIG_BTRFS_DEBUG WARN_RATELIMIT(1, "qgroup %llu %s reserved space underflow, have %llu to free %llu", qgroup->qgroupid, qgroup_rsv_type_str(type), qgroup->rsv.values[type], num_bytes); #endif qgroup->rsv.values[type] = 0; } static void qgroup_rsv_add_by_qgroup(struct btrfs_fs_info *fs_info, struct btrfs_qgroup *dest, struct btrfs_qgroup *src) { int i; for (i = 0; i < BTRFS_QGROUP_RSV_LAST; i++) qgroup_rsv_add(fs_info, dest, src->rsv.values[i], i); } static void qgroup_rsv_release_by_qgroup(struct btrfs_fs_info *fs_info, struct btrfs_qgroup *dest, struct btrfs_qgroup *src) { int i; for (i = 0; i < BTRFS_QGROUP_RSV_LAST; i++) qgroup_rsv_release(fs_info, dest, src->rsv.values[i], i); } static void btrfs_qgroup_update_old_refcnt(struct btrfs_qgroup *qg, u64 seq, int mod) { if (qg->old_refcnt < seq) qg->old_refcnt = seq; qg->old_refcnt += mod; } static void btrfs_qgroup_update_new_refcnt(struct btrfs_qgroup *qg, u64 seq, int mod) { if (qg->new_refcnt < seq) qg->new_refcnt = seq; qg->new_refcnt += mod; } static inline u64 btrfs_qgroup_get_old_refcnt(struct btrfs_qgroup *qg, u64 seq) { if (qg->old_refcnt < seq) return 0; return qg->old_refcnt - seq; } static inline u64 btrfs_qgroup_get_new_refcnt(struct btrfs_qgroup *qg, u64 seq) { if (qg->new_refcnt < seq) return 0; return qg->new_refcnt - seq; } /* * glue structure to represent the relations between qgroups. */ struct btrfs_qgroup_list { struct list_head next_group; struct list_head next_member; struct btrfs_qgroup *group; struct btrfs_qgroup *member; }; static int qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid, int init_flags); static void qgroup_rescan_zero_tracking(struct btrfs_fs_info *fs_info); /* must be called with qgroup_ioctl_lock held */ static struct btrfs_qgroup *find_qgroup_rb(struct btrfs_fs_info *fs_info, u64 qgroupid) { struct rb_node *n = fs_info->qgroup_tree.rb_node; struct btrfs_qgroup *qgroup; while (n) { qgroup = rb_entry(n, struct btrfs_qgroup, node); if (qgroup->qgroupid < qgroupid) n = n->rb_left; else if (qgroup->qgroupid > qgroupid) n = n->rb_right; else return qgroup; } return NULL; } /* * Add qgroup to the filesystem's qgroup tree. * * Must be called with qgroup_lock held and @prealloc preallocated. * * The control on the lifespan of @prealloc would be transferred to this * function, thus caller should no longer touch @prealloc. */ static struct btrfs_qgroup *add_qgroup_rb(struct btrfs_fs_info *fs_info, struct btrfs_qgroup *prealloc, u64 qgroupid) { struct rb_node **p = &fs_info->qgroup_tree.rb_node; struct rb_node *parent = NULL; struct btrfs_qgroup *qgroup; /* Caller must have pre-allocated @prealloc. */ ASSERT(prealloc); while (*p) { parent = *p; qgroup = rb_entry(parent, struct btrfs_qgroup, node); if (qgroup->qgroupid < qgroupid) { p = &(*p)->rb_left; } else if (qgroup->qgroupid > qgroupid) { p = &(*p)->rb_right; } else { kfree(prealloc); return qgroup; } } qgroup = prealloc; qgroup->qgroupid = qgroupid; INIT_LIST_HEAD(&qgroup->groups); INIT_LIST_HEAD(&qgroup->members); INIT_LIST_HEAD(&qgroup->dirty); INIT_LIST_HEAD(&qgroup->iterator); INIT_LIST_HEAD(&qgroup->nested_iterator); rb_link_node(&qgroup->node, parent, p); rb_insert_color(&qgroup->node, &fs_info->qgroup_tree); return qgroup; } static void __del_qgroup_rb(struct btrfs_fs_info *fs_info, struct btrfs_qgroup *qgroup) { struct btrfs_qgroup_list *list; list_del(&qgroup->dirty); while (!list_empty(&qgroup->groups)) { list = list_first_entry(&qgroup->groups, struct btrfs_qgroup_list, next_group); list_del(&list->next_group); list_del(&list->next_member); kfree(list); } while (!list_empty(&qgroup->members)) { list = list_first_entry(&qgroup->members, struct btrfs_qgroup_list, next_member); list_del(&list->next_group); list_del(&list->next_member); kfree(list); } } /* must be called with qgroup_lock held */ static int del_qgroup_rb(struct btrfs_fs_info *fs_info, u64 qgroupid) { struct btrfs_qgroup *qgroup = find_qgroup_rb(fs_info, qgroupid); if (!qgroup) return -ENOENT; rb_erase(&qgroup->node, &fs_info->qgroup_tree); __del_qgroup_rb(fs_info, qgroup); return 0; } /* * Add relation specified by two qgroups. * * Must be called with qgroup_lock held, the ownership of @prealloc is * transferred to this function and caller should not touch it anymore. * * Return: 0 on success * -ENOENT if one of the qgroups is NULL * <0 other errors */ static int __add_relation_rb(struct btrfs_qgroup_list *prealloc, struct btrfs_qgroup *member, struct btrfs_qgroup *parent) { if (!member || !parent) { kfree(prealloc); return -ENOENT; } prealloc->group = parent; prealloc->member = member; list_add_tail(&prealloc->next_group, &member->groups); list_add_tail(&prealloc->next_member, &parent->members); return 0; } /* * Add relation specified by two qgroup ids. * * Must be called with qgroup_lock held. * * Return: 0 on success * -ENOENT if one of the ids does not exist * <0 other errors */ static int add_relation_rb(struct btrfs_fs_info *fs_info, struct btrfs_qgroup_list *prealloc, u64 memberid, u64 parentid) { struct btrfs_qgroup *member; struct btrfs_qgroup *parent; member = find_qgroup_rb(fs_info, memberid); parent = find_qgroup_rb(fs_info, parentid); return __add_relation_rb(prealloc, member, parent); } /* Must be called with qgroup_lock held */ static int del_relation_rb(struct btrfs_fs_info *fs_info, u64 memberid, u64 parentid) { struct btrfs_qgroup *member; struct btrfs_qgroup *parent; struct btrfs_qgroup_list *list; member = find_qgroup_rb(fs_info, memberid); parent = find_qgroup_rb(fs_info, parentid); if (!member || !parent) return -ENOENT; list_for_each_entry(list, &member->groups, next_group) { if (list->group == parent) { list_del(&list->next_group); list_del(&list->next_member); kfree(list); return 0; } } return -ENOENT; } #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS int btrfs_verify_qgroup_counts(struct btrfs_fs_info *fs_info, u64 qgroupid, u64 rfer, u64 excl) { struct btrfs_qgroup *qgroup; qgroup = find_qgroup_rb(fs_info, qgroupid); if (!qgroup) return -EINVAL; if (qgroup->rfer != rfer || qgroup->excl != excl) return -EINVAL; return 0; } #endif static void qgroup_mark_inconsistent(struct btrfs_fs_info *fs_info) { if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_SIMPLE) return; fs_info->qgroup_flags |= (BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT | BTRFS_QGROUP_RUNTIME_FLAG_CANCEL_RESCAN | BTRFS_QGROUP_RUNTIME_FLAG_NO_ACCOUNTING); } static void qgroup_read_enable_gen(struct btrfs_fs_info *fs_info, struct extent_buffer *leaf, int slot, struct btrfs_qgroup_status_item *ptr) { ASSERT(btrfs_fs_incompat(fs_info, SIMPLE_QUOTA)); ASSERT(btrfs_item_size(leaf, slot) >= sizeof(*ptr)); fs_info->qgroup_enable_gen = btrfs_qgroup_status_enable_gen(leaf, ptr); } /* * The full config is read in one go, only called from open_ctree() * It doesn't use any locking, as at this point we're still single-threaded */ int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info) { struct btrfs_key key; struct btrfs_key found_key; struct btrfs_root *quota_root = fs_info->quota_root; struct btrfs_path *path = NULL; struct extent_buffer *l; int slot; int ret = 0; u64 flags = 0; u64 rescan_progress = 0; if (!fs_info->quota_root) return 0; fs_info->qgroup_ulist = ulist_alloc(GFP_KERNEL); if (!fs_info->qgroup_ulist) { ret = -ENOMEM; goto out; } path = btrfs_alloc_path(); if (!path) { ret = -ENOMEM; goto out; } ret = btrfs_sysfs_add_qgroups(fs_info); if (ret < 0) goto out; /* default this to quota off, in case no status key is found */ fs_info->qgroup_flags = 0; /* * pass 1: read status, all qgroup infos and limits */ key.objectid = 0; key.type = 0; key.offset = 0; ret = btrfs_search_slot_for_read(quota_root, &key, path, 1, 1); if (ret) goto out; while (1) { struct btrfs_qgroup *qgroup; slot = path->slots[0]; l = path->nodes[0]; btrfs_item_key_to_cpu(l, &found_key, slot); if (found_key.type == BTRFS_QGROUP_STATUS_KEY) { struct btrfs_qgroup_status_item *ptr; ptr = btrfs_item_ptr(l, slot, struct btrfs_qgroup_status_item); if (btrfs_qgroup_status_version(l, ptr) != BTRFS_QGROUP_STATUS_VERSION) { btrfs_err(fs_info, "old qgroup version, quota disabled"); goto out; } fs_info->qgroup_flags = btrfs_qgroup_status_flags(l, ptr); if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_SIMPLE_MODE) { qgroup_read_enable_gen(fs_info, l, slot, ptr); } else if (btrfs_qgroup_status_generation(l, ptr) != fs_info->generation) { qgroup_mark_inconsistent(fs_info); btrfs_err(fs_info, "qgroup generation mismatch, marked as inconsistent"); } rescan_progress = btrfs_qgroup_status_rescan(l, ptr); goto next1; } if (found_key.type != BTRFS_QGROUP_INFO_KEY && found_key.type != BTRFS_QGROUP_LIMIT_KEY) goto next1; qgroup = find_qgroup_rb(fs_info, found_key.offset); if ((qgroup && found_key.type == BTRFS_QGROUP_INFO_KEY) || (!qgroup && found_key.type == BTRFS_QGROUP_LIMIT_KEY)) { btrfs_err(fs_info, "inconsistent qgroup config"); qgroup_mark_inconsistent(fs_info); } if (!qgroup) { struct btrfs_qgroup *prealloc; prealloc = kzalloc(sizeof(*prealloc), GFP_KERNEL); if (!prealloc) { ret = -ENOMEM; goto out; } qgroup = add_qgroup_rb(fs_info, prealloc, found_key.offset); } ret = btrfs_sysfs_add_one_qgroup(fs_info, qgroup); if (ret < 0) goto out; switch (found_key.type) { case BTRFS_QGROUP_INFO_KEY: { struct btrfs_qgroup_info_item *ptr; ptr = btrfs_item_ptr(l, slot, struct btrfs_qgroup_info_item); qgroup->rfer = btrfs_qgroup_info_rfer(l, ptr); qgroup->rfer_cmpr = btrfs_qgroup_info_rfer_cmpr(l, ptr); qgroup->excl = btrfs_qgroup_info_excl(l, ptr); qgroup->excl_cmpr = btrfs_qgroup_info_excl_cmpr(l, ptr); /* generation currently unused */ break; } case BTRFS_QGROUP_LIMIT_KEY: { struct btrfs_qgroup_limit_item *ptr; ptr = btrfs_item_ptr(l, slot, struct btrfs_qgroup_limit_item); qgroup->lim_flags = btrfs_qgroup_limit_flags(l, ptr); qgroup->max_rfer = btrfs_qgroup_limit_max_rfer(l, ptr); qgroup->max_excl = btrfs_qgroup_limit_max_excl(l, ptr); qgroup->rsv_rfer = btrfs_qgroup_limit_rsv_rfer(l, ptr); qgroup->rsv_excl = btrfs_qgroup_limit_rsv_excl(l, ptr); break; } } next1: ret = btrfs_next_item(quota_root, path); if (ret < 0) goto out; if (ret) break; } btrfs_release_path(path); /* * pass 2: read all qgroup relations */ key.objectid = 0; key.type = BTRFS_QGROUP_RELATION_KEY; key.offset = 0; ret = btrfs_search_slot_for_read(quota_root, &key, path, 1, 0); if (ret) goto out; while (1) { struct btrfs_qgroup_list *list = NULL; slot = path->slots[0]; l = path->nodes[0]; btrfs_item_key_to_cpu(l, &found_key, slot); if (found_key.type != BTRFS_QGROUP_RELATION_KEY) goto next2; if (found_key.objectid > found_key.offset) { /* parent <- member, not needed to build config */ /* FIXME should we omit the key completely? */ goto next2; } list = kzalloc(sizeof(*list), GFP_KERNEL); if (!list) { ret = -ENOMEM; goto out; } ret = add_relation_rb(fs_info, list, found_key.objectid, found_key.offset); list = NULL; if (ret == -ENOENT) { btrfs_warn(fs_info, "orphan qgroup relation 0x%llx->0x%llx", found_key.objectid, found_key.offset); ret = 0; /* ignore the error */ } if (ret) goto out; next2: ret = btrfs_next_item(quota_root, path); if (ret < 0) goto out; if (ret) break; } out: btrfs_free_path(path); fs_info->qgroup_flags |= flags; if (ret >= 0) { if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_ON) set_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags); if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) ret = qgroup_rescan_init(fs_info, rescan_progress, 0); } else { ulist_free(fs_info->qgroup_ulist); fs_info->qgroup_ulist = NULL; fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN; btrfs_sysfs_del_qgroups(fs_info); } return ret < 0 ? ret : 0; } /* * Called in close_ctree() when quota is still enabled. This verifies we don't * leak some reserved space. * * Return false if no reserved space is left. * Return true if some reserved space is leaked. */ bool btrfs_check_quota_leak(struct btrfs_fs_info *fs_info) { struct rb_node *node; bool ret = false; if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_DISABLED) return ret; /* * Since we're unmounting, there is no race and no need to grab qgroup * lock. And here we don't go post-order to provide a more user * friendly sorted result. */ for (node = rb_first(&fs_info->qgroup_tree); node; node = rb_next(node)) { struct btrfs_qgroup *qgroup; int i; qgroup = rb_entry(node, struct btrfs_qgroup, node); for (i = 0; i < BTRFS_QGROUP_RSV_LAST; i++) { if (qgroup->rsv.values[i]) { ret = true; btrfs_warn(fs_info, "qgroup %hu/%llu has unreleased space, type %d rsv %llu", btrfs_qgroup_level(qgroup->qgroupid), btrfs_qgroup_subvolid(qgroup->qgroupid), i, qgroup->rsv.values[i]); } } } return ret; } /* * This is called from close_ctree() or open_ctree() or btrfs_quota_disable(), * first two are in single-threaded paths.And for the third one, we have set * quota_root to be null with qgroup_lock held before, so it is safe to clean * up the in-memory structures without qgroup_lock held. */ void btrfs_free_qgroup_config(struct btrfs_fs_info *fs_info) { struct rb_node *n; struct btrfs_qgroup *qgroup; while ((n = rb_first(&fs_info->qgroup_tree))) { qgroup = rb_entry(n, struct btrfs_qgroup, node); rb_erase(n, &fs_info->qgroup_tree); __del_qgroup_rb(fs_info, qgroup); btrfs_sysfs_del_one_qgroup(fs_info, qgroup); kfree(qgroup); } /* * We call btrfs_free_qgroup_config() when unmounting * filesystem and disabling quota, so we set qgroup_ulist * to be null here to avoid double free. */ ulist_free(fs_info->qgroup_ulist); fs_info->qgroup_ulist = NULL; btrfs_sysfs_del_qgroups(fs_info); } static int add_qgroup_relation_item(struct btrfs_trans_handle *trans, u64 src, u64 dst) { int ret; struct btrfs_root *quota_root = trans->fs_info->quota_root; struct btrfs_path *path; struct btrfs_key key; path = btrfs_alloc_path(); if (!path) return -ENOMEM; key.objectid = src; key.type = BTRFS_QGROUP_RELATION_KEY; key.offset = dst; ret = btrfs_insert_empty_item(trans, quota_root, path, &key, 0); btrfs_mark_buffer_dirty(trans, path->nodes[0]); btrfs_free_path(path); return ret; } static int del_qgroup_relation_item(struct btrfs_trans_handle *trans, u64 src, u64 dst) { int ret; struct btrfs_root *quota_root = trans->fs_info->quota_root; struct btrfs_path *path; struct btrfs_key key; path = btrfs_alloc_path(); if (!path) return -ENOMEM; key.objectid = src; key.type = BTRFS_QGROUP_RELATION_KEY; key.offset = dst; ret = btrfs_search_slot(trans, quota_root, &key, path, -1, 1); if (ret < 0) goto out; if (ret > 0) { ret = -ENOENT; goto out; } ret = btrfs_del_item(trans, quota_root, path); out: btrfs_free_path(path); return ret; } static int add_qgroup_item(struct btrfs_trans_handle *trans, struct btrfs_root *quota_root, u64 qgroupid) { int ret; struct btrfs_path *path; struct btrfs_qgroup_info_item *qgroup_info; struct btrfs_qgroup_limit_item *qgroup_limit; struct extent_buffer *leaf; struct btrfs_key key; if (btrfs_is_testing(quota_root->fs_info)) return 0; path = btrfs_alloc_path(); if (!path) return -ENOMEM; key.objectid = 0; key.type = BTRFS_QGROUP_INFO_KEY; key.offset = qgroupid; /* * Avoid a transaction abort by catching -EEXIST here. In that * case, we proceed by re-initializing the existing structure * on disk. */ ret = btrfs_insert_empty_item(trans, quota_root, path, &key, sizeof(*qgroup_info)); if (ret && ret != -EEXIST) goto out; leaf = path->nodes[0]; qgroup_info = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_qgroup_info_item); btrfs_set_qgroup_info_generation(leaf, qgroup_info, trans->transid); btrfs_set_qgroup_info_rfer(leaf, qgroup_info, 0); btrfs_set_qgroup_info_rfer_cmpr(leaf, qgroup_info, 0); btrfs_set_qgroup_info_excl(leaf, qgroup_info, 0); btrfs_set_qgroup_info_excl_cmpr(leaf, qgroup_info, 0); btrfs_mark_buffer_dirty(trans, leaf); btrfs_release_path(path); key.type = BTRFS_QGROUP_LIMIT_KEY; ret = btrfs_insert_empty_item(trans, quota_root, path, &key, sizeof(*qgroup_limit)); if (ret && ret != -EEXIST) goto out; leaf = path->nodes[0]; qgroup_limit = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_qgroup_limit_item); btrfs_set_qgroup_limit_flags(leaf, qgroup_limit, 0); btrfs_set_qgroup_limit_max_rfer(leaf, qgroup_limit, 0); btrfs_set_qgroup_limit_max_excl(leaf, qgroup_limit, 0); btrfs_set_qgroup_limit_rsv_rfer(leaf, qgroup_limit, 0); btrfs_set_qgroup_limit_rsv_excl(leaf, qgroup_limit, 0); btrfs_mark_buffer_dirty(trans, leaf); ret = 0; out: btrfs_free_path(path); return ret; } static int del_qgroup_item(struct btrfs_trans_handle *trans, u64 qgroupid) { int ret; struct btrfs_root *quota_root = trans->fs_info->quota_root; struct btrfs_path *path; struct btrfs_key key; path = btrfs_alloc_path(); if (!path) return -ENOMEM; key.objectid = 0; key.type = BTRFS_QGROUP_INFO_KEY; key.offset = qgroupid; ret = btrfs_search_slot(trans, quota_root, &key, path, -1, 1); if (ret < 0) goto out; if (ret > 0) { ret = -ENOENT; goto out; } ret = btrfs_del_item(trans, quota_root, path); if (ret) goto out; btrfs_release_path(path); key.type = BTRFS_QGROUP_LIMIT_KEY; ret = btrfs_search_slot(trans, quota_root, &key, path, -1, 1); if (ret < 0) goto out; if (ret > 0) { ret = -ENOENT; goto out; } ret = btrfs_del_item(trans, quota_root, path); out: btrfs_free_path(path); return ret; } static int update_qgroup_limit_item(struct btrfs_trans_handle *trans, struct btrfs_qgroup *qgroup) { struct btrfs_root *quota_root = trans->fs_info->quota_root; struct btrfs_path *path; struct btrfs_key key; struct extent_buffer *l; struct btrfs_qgroup_limit_item *qgroup_limit; int ret; int slot; key.objectid = 0; key.type = BTRFS_QGROUP_LIMIT_KEY; key.offset = qgroup->qgroupid; path = btrfs_alloc_path(); if (!path) return -ENOMEM; ret = btrfs_search_slot(trans, quota_root, &key, path, 0, 1); if (ret > 0) ret = -ENOENT; if (ret) goto out; l = path->nodes[0]; slot = path->slots[0]; qgroup_limit = btrfs_item_ptr(l, slot, struct btrfs_qgroup_limit_item); btrfs_set_qgroup_limit_flags(l, qgroup_limit, qgroup->lim_flags); btrfs_set_qgroup_limit_max_rfer(l, qgroup_limit, qgroup->max_rfer); btrfs_set_qgroup_limit_max_excl(l, qgroup_limit, qgroup->max_excl); btrfs_set_qgroup_limit_rsv_rfer(l, qgroup_limit, qgroup->rsv_rfer); btrfs_set_qgroup_limit_rsv_excl(l, qgroup_limit, qgroup->rsv_excl); btrfs_mark_buffer_dirty(trans, l); out: btrfs_free_path(path); return ret; } static int update_qgroup_info_item(struct btrfs_trans_handle *trans, struct btrfs_qgroup *qgroup) { struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_root *quota_root = fs_info->quota_root; struct btrfs_path *path; struct btrfs_key key; struct extent_buffer *l; struct btrfs_qgroup_info_item *qgroup_info; int ret; int slot; if (btrfs_is_testing(fs_info)) return 0; key.objectid = 0; key.type = BTRFS_QGROUP_INFO_KEY; key.offset = qgroup->qgroupid; path = btrfs_alloc_path(); if (!path) return -ENOMEM; ret = btrfs_search_slot(trans, quota_root, &key, path, 0, 1); if (ret > 0) ret = -ENOENT; if (ret) goto out; l = path->nodes[0]; slot = path->slots[0]; qgroup_info = btrfs_item_ptr(l, slot, struct btrfs_qgroup_info_item); btrfs_set_qgroup_info_generation(l, qgroup_info, trans->transid); btrfs_set_qgroup_info_rfer(l, qgroup_info, qgroup->rfer); btrfs_set_qgroup_info_rfer_cmpr(l, qgroup_info, qgroup->rfer_cmpr); btrfs_set_qgroup_info_excl(l, qgroup_info, qgroup->excl); btrfs_set_qgroup_info_excl_cmpr(l, qgroup_info, qgroup->excl_cmpr); btrfs_mark_buffer_dirty(trans, l); out: btrfs_free_path(path); return ret; } static int update_qgroup_status_item(struct btrfs_trans_handle *trans) { struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_root *quota_root = fs_info->quota_root; struct btrfs_path *path; struct btrfs_key key; struct extent_buffer *l; struct btrfs_qgroup_status_item *ptr; int ret; int slot; key.objectid = 0; key.type = BTRFS_QGROUP_STATUS_KEY; key.offset = 0; path = btrfs_alloc_path(); if (!path) return -ENOMEM; ret = btrfs_search_slot(trans, quota_root, &key, path, 0, 1); if (ret > 0) ret = -ENOENT; if (ret) goto out; l = path->nodes[0]; slot = path->slots[0]; ptr = btrfs_item_ptr(l, slot, struct btrfs_qgroup_status_item); btrfs_set_qgroup_status_flags(l, ptr, fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAGS_MASK); btrfs_set_qgroup_status_generation(l, ptr, trans->transid); btrfs_set_qgroup_status_rescan(l, ptr, fs_info->qgroup_rescan_progress.objectid); btrfs_mark_buffer_dirty(trans, l); out: btrfs_free_path(path); return ret; } /* * called with qgroup_lock held */ static int btrfs_clean_quota_tree(struct btrfs_trans_handle *trans, struct btrfs_root *root) { struct btrfs_path *path; struct btrfs_key key; struct extent_buffer *leaf = NULL; int ret; int nr = 0; path = btrfs_alloc_path(); if (!path) return -ENOMEM; key.objectid = 0; key.offset = 0; key.type = 0; while (1) { ret = btrfs_search_slot(trans, root, &key, path, -1, 1); if (ret < 0) goto out; leaf = path->nodes[0]; nr = btrfs_header_nritems(leaf); if (!nr) break; /* * delete the leaf one by one * since the whole tree is going * to be deleted. */ path->slots[0] = 0; ret = btrfs_del_items(trans, root, path, 0, nr); if (ret) goto out; btrfs_release_path(path); } ret = 0; out: btrfs_free_path(path); return ret; } int btrfs_quota_enable(struct btrfs_fs_info *fs_info, struct btrfs_ioctl_quota_ctl_args *quota_ctl_args) { struct btrfs_root *quota_root; struct btrfs_root *tree_root = fs_info->tree_root; struct btrfs_path *path = NULL; struct btrfs_qgroup_status_item *ptr; struct extent_buffer *leaf; struct btrfs_key key; struct btrfs_key found_key; struct btrfs_qgroup *qgroup = NULL; struct btrfs_qgroup *prealloc = NULL; struct btrfs_trans_handle *trans = NULL; struct ulist *ulist = NULL; const bool simple = (quota_ctl_args->cmd == BTRFS_QUOTA_CTL_ENABLE_SIMPLE_QUOTA); int ret = 0; int slot; /* * We need to have subvol_sem write locked, to prevent races between * concurrent tasks trying to enable quotas, because we will unlock * and relock qgroup_ioctl_lock before setting fs_info->quota_root * and before setting BTRFS_FS_QUOTA_ENABLED. */ lockdep_assert_held_write(&fs_info->subvol_sem); if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) { btrfs_err(fs_info, "qgroups are currently unsupported in extent tree v2"); return -EINVAL; } mutex_lock(&fs_info->qgroup_ioctl_lock); if (fs_info->quota_root) goto out; ulist = ulist_alloc(GFP_KERNEL); if (!ulist) { ret = -ENOMEM; goto out; } ret = btrfs_sysfs_add_qgroups(fs_info); if (ret < 0) goto out; /* * Unlock qgroup_ioctl_lock before starting the transaction. This is to * avoid lock acquisition inversion problems (reported by lockdep) between * qgroup_ioctl_lock and the vfs freeze semaphores, acquired when we * start a transaction. * After we started the transaction lock qgroup_ioctl_lock again and * check if someone else created the quota root in the meanwhile. If so, * just return success and release the transaction handle. * * Also we don't need to worry about someone else calling * btrfs_sysfs_add_qgroups() after we unlock and getting an error because * that function returns 0 (success) when the sysfs entries already exist. */ mutex_unlock(&fs_info->qgroup_ioctl_lock); /* * 1 for quota root item * 1 for BTRFS_QGROUP_STATUS item * * Yet we also need 2*n items for a QGROUP_INFO/QGROUP_LIMIT items * per subvolume. However those are not currently reserved since it * would be a lot of overkill. */ trans = btrfs_start_transaction(tree_root, 2); mutex_lock(&fs_info->qgroup_ioctl_lock); if (IS_ERR(trans)) { ret = PTR_ERR(trans); trans = NULL; goto out; } if (fs_info->quota_root) goto out; fs_info->qgroup_ulist = ulist; ulist = NULL; /* * initially create the quota tree */ quota_root = btrfs_create_tree(trans, BTRFS_QUOTA_TREE_OBJECTID); if (IS_ERR(quota_root)) { ret = PTR_ERR(quota_root); btrfs_abort_transaction(trans, ret); goto out; } path = btrfs_alloc_path(); if (!path) { ret = -ENOMEM; btrfs_abort_transaction(trans, ret); goto out_free_root; } key.objectid = 0; key.type = BTRFS_QGROUP_STATUS_KEY; key.offset = 0; ret = btrfs_insert_empty_item(trans, quota_root, path, &key, sizeof(*ptr)); if (ret) { btrfs_abort_transaction(trans, ret); goto out_free_path; } leaf = path->nodes[0]; ptr = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_qgroup_status_item); btrfs_set_qgroup_status_generation(leaf, ptr, trans->transid); btrfs_set_qgroup_status_version(leaf, ptr, BTRFS_QGROUP_STATUS_VERSION); fs_info->qgroup_flags = BTRFS_QGROUP_STATUS_FLAG_ON; if (simple) { fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_SIMPLE_MODE; btrfs_set_qgroup_status_enable_gen(leaf, ptr, trans->transid); } else { fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; } btrfs_set_qgroup_status_flags(leaf, ptr, fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAGS_MASK); btrfs_set_qgroup_status_rescan(leaf, ptr, 0); btrfs_mark_buffer_dirty(trans, leaf); key.objectid = 0; key.type = BTRFS_ROOT_REF_KEY; key.offset = 0; btrfs_release_path(path); ret = btrfs_search_slot_for_read(tree_root, &key, path, 1, 0); if (ret > 0) goto out_add_root; if (ret < 0) { btrfs_abort_transaction(trans, ret); goto out_free_path; } while (1) { slot = path->slots[0]; leaf = path->nodes[0]; btrfs_item_key_to_cpu(leaf, &found_key, slot); if (found_key.type == BTRFS_ROOT_REF_KEY) { /* Release locks on tree_root before we access quota_root */ btrfs_release_path(path); /* We should not have a stray @prealloc pointer. */ ASSERT(prealloc == NULL); prealloc = kzalloc(sizeof(*prealloc), GFP_NOFS); if (!prealloc) { ret = -ENOMEM; btrfs_abort_transaction(trans, ret); goto out_free_path; } ret = add_qgroup_item(trans, quota_root, found_key.offset); if (ret) { btrfs_abort_transaction(trans, ret); goto out_free_path; } qgroup = add_qgroup_rb(fs_info, prealloc, found_key.offset); prealloc = NULL; if (IS_ERR(qgroup)) { ret = PTR_ERR(qgroup); btrfs_abort_transaction(trans, ret); goto out_free_path; } ret = btrfs_sysfs_add_one_qgroup(fs_info, qgroup); if (ret < 0) { btrfs_abort_transaction(trans, ret); goto out_free_path; } ret = btrfs_search_slot_for_read(tree_root, &found_key, path, 1, 0); if (ret < 0) { btrfs_abort_transaction(trans, ret); goto out_free_path; } if (ret > 0) { /* * Shouldn't happen, but in case it does we * don't need to do the btrfs_next_item, just * continue. */ continue; } } ret = btrfs_next_item(tree_root, path); if (ret < 0) { btrfs_abort_transaction(trans, ret); goto out_free_path; } if (ret) break; } out_add_root: btrfs_release_path(path); ret = add_qgroup_item(trans, quota_root, BTRFS_FS_TREE_OBJECTID); if (ret) { btrfs_abort_transaction(trans, ret); goto out_free_path; } ASSERT(prealloc == NULL); prealloc = kzalloc(sizeof(*prealloc), GFP_NOFS); if (!prealloc) { ret = -ENOMEM; goto out_free_path; } qgroup = add_qgroup_rb(fs_info, prealloc, BTRFS_FS_TREE_OBJECTID); prealloc = NULL; ret = btrfs_sysfs_add_one_qgroup(fs_info, qgroup); if (ret < 0) { btrfs_abort_transaction(trans, ret); goto out_free_path; } fs_info->qgroup_enable_gen = trans->transid; mutex_unlock(&fs_info->qgroup_ioctl_lock); /* * Commit the transaction while not holding qgroup_ioctl_lock, to avoid * a deadlock with tasks concurrently doing other qgroup operations, such * adding/removing qgroups or adding/deleting qgroup relations for example, * because all qgroup operations first start or join a transaction and then * lock the qgroup_ioctl_lock mutex. * We are safe from a concurrent task trying to enable quotas, by calling * this function, since we are serialized by fs_info->subvol_sem. */ ret = btrfs_commit_transaction(trans); trans = NULL; mutex_lock(&fs_info->qgroup_ioctl_lock); if (ret) goto out_free_path; /* * Set quota enabled flag after committing the transaction, to avoid * deadlocks on fs_info->qgroup_ioctl_lock with concurrent snapshot * creation. */ spin_lock(&fs_info->qgroup_lock); fs_info->quota_root = quota_root; set_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags); if (simple) btrfs_set_fs_incompat(fs_info, SIMPLE_QUOTA); spin_unlock(&fs_info->qgroup_lock); /* Skip rescan for simple qgroups. */ if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_SIMPLE) goto out_free_path; ret = qgroup_rescan_init(fs_info, 0, 1); if (!ret) { qgroup_rescan_zero_tracking(fs_info); fs_info->qgroup_rescan_running = true; btrfs_queue_work(fs_info->qgroup_rescan_workers, &fs_info->qgroup_rescan_work); } else { /* * We have set both BTRFS_FS_QUOTA_ENABLED and * BTRFS_QGROUP_STATUS_FLAG_ON, so we can only fail with * -EINPROGRESS. That can happen because someone started the * rescan worker by calling quota rescan ioctl before we * attempted to initialize the rescan worker. Failure due to * quotas disabled in the meanwhile is not possible, because * we are holding a write lock on fs_info->subvol_sem, which * is also acquired when disabling quotas. * Ignore such error, and any other error would need to undo * everything we did in the transaction we just committed. */ ASSERT(ret == -EINPROGRESS); ret = 0; } out_free_path: btrfs_free_path(path); out_free_root: if (ret) btrfs_put_root(quota_root); out: if (ret) { ulist_free(fs_info->qgroup_ulist); fs_info->qgroup_ulist = NULL; btrfs_sysfs_del_qgroups(fs_info); } mutex_unlock(&fs_info->qgroup_ioctl_lock); if (ret && trans) btrfs_end_transaction(trans); else if (trans) ret = btrfs_end_transaction(trans); ulist_free(ulist); kfree(prealloc); return ret; } /* * It is possible to have outstanding ordered extents which reserved bytes * before we disabled. We need to fully flush delalloc, ordered extents, and a * commit to ensure that we don't leak such reservations, only to have them * come back if we re-enable. * * - enable simple quotas * - reserve space * - release it, store rsv_bytes in OE * - disable quotas * - enable simple quotas (qgroup rsv are all 0) * - OE finishes * - run delayed refs * - free rsv_bytes, resulting in miscounting or even underflow */ static int flush_reservations(struct btrfs_fs_info *fs_info) { struct btrfs_trans_handle *trans; int ret; ret = btrfs_start_delalloc_roots(fs_info, LONG_MAX, false); if (ret) return ret; btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, (u64)-1); trans = btrfs_join_transaction(fs_info->tree_root); if (IS_ERR(trans)) return PTR_ERR(trans); btrfs_commit_transaction(trans); return ret; } int btrfs_quota_disable(struct btrfs_fs_info *fs_info) { struct btrfs_root *quota_root; struct btrfs_trans_handle *trans = NULL; int ret = 0; /* * We need to have subvol_sem write locked to prevent races with * snapshot creation. */ lockdep_assert_held_write(&fs_info->subvol_sem); /* * Lock the cleaner mutex to prevent races with concurrent relocation, * because relocation may be building backrefs for blocks of the quota * root while we are deleting the root. This is like dropping fs roots * of deleted snapshots/subvolumes, we need the same protection. * * This also prevents races between concurrent tasks trying to disable * quotas, because we will unlock and relock qgroup_ioctl_lock across * BTRFS_FS_QUOTA_ENABLED changes. */ mutex_lock(&fs_info->cleaner_mutex); mutex_lock(&fs_info->qgroup_ioctl_lock); if (!fs_info->quota_root) goto out; /* * Unlock the qgroup_ioctl_lock mutex before waiting for the rescan worker to * complete. Otherwise we can deadlock because btrfs_remove_qgroup() needs * to lock that mutex while holding a transaction handle and the rescan * worker needs to commit a transaction. */ mutex_unlock(&fs_info->qgroup_ioctl_lock); /* * Request qgroup rescan worker to complete and wait for it. This wait * must be done before transaction start for quota disable since it may * deadlock with transaction by the qgroup rescan worker. */ clear_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags); btrfs_qgroup_wait_for_completion(fs_info, false); ret = flush_reservations(fs_info); if (ret) goto out_unlock_cleaner; /* * 1 For the root item * * We should also reserve enough items for the quota tree deletion in * btrfs_clean_quota_tree but this is not done. * * Also, we must always start a transaction without holding the mutex * qgroup_ioctl_lock, see btrfs_quota_enable(). */ trans = btrfs_start_transaction(fs_info->tree_root, 1); mutex_lock(&fs_info->qgroup_ioctl_lock); if (IS_ERR(trans)) { ret = PTR_ERR(trans); trans = NULL; set_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags); goto out; } if (!fs_info->quota_root) goto out; spin_lock(&fs_info->qgroup_lock); quota_root = fs_info->quota_root; fs_info->quota_root = NULL; fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_ON; fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_SIMPLE_MODE; fs_info->qgroup_drop_subtree_thres = BTRFS_MAX_LEVEL; spin_unlock(&fs_info->qgroup_lock); btrfs_free_qgroup_config(fs_info); ret = btrfs_clean_quota_tree(trans, quota_root); if (ret) { btrfs_abort_transaction(trans, ret); goto out; } ret = btrfs_del_root(trans, "a_root->root_key); if (ret) { btrfs_abort_transaction(trans, ret); goto out; } spin_lock(&fs_info->trans_lock); list_del("a_root->dirty_list); spin_unlock(&fs_info->trans_lock); btrfs_tree_lock(quota_root->node); btrfs_clear_buffer_dirty(trans, quota_root->node); btrfs_tree_unlock(quota_root->node); btrfs_free_tree_block(trans, btrfs_root_id(quota_root), quota_root->node, 0, 1); btrfs_put_root(quota_root); out: mutex_unlock(&fs_info->qgroup_ioctl_lock); if (ret && trans) btrfs_end_transaction(trans); else if (trans) ret = btrfs_commit_transaction(trans); out_unlock_cleaner: mutex_unlock(&fs_info->cleaner_mutex); return ret; } static void qgroup_dirty(struct btrfs_fs_info *fs_info, struct btrfs_qgroup *qgroup) { if (list_empty(&qgroup->dirty)) list_add(&qgroup->dirty, &fs_info->dirty_qgroups); } static void qgroup_iterator_add(struct list_head *head, struct btrfs_qgroup *qgroup) { if (!list_empty(&qgroup->iterator)) return; list_add_tail(&qgroup->iterator, head); } static void qgroup_iterator_clean(struct list_head *head) { while (!list_empty(head)) { struct btrfs_qgroup *qgroup; qgroup = list_first_entry(head, struct btrfs_qgroup, iterator); list_del_init(&qgroup->iterator); } } /* * The easy accounting, we're updating qgroup relationship whose child qgroup * only has exclusive extents. * * In this case, all exclusive extents will also be exclusive for parent, so * excl/rfer just get added/removed. * * So is qgroup reservation space, which should also be added/removed to * parent. * Or when child tries to release reservation space, parent will underflow its * reservation (for relationship adding case). * * Caller should hold fs_info->qgroup_lock. */ static int __qgroup_excl_accounting(struct btrfs_fs_info *fs_info, u64 ref_root, struct btrfs_qgroup *src, int sign) { struct btrfs_qgroup *qgroup; struct btrfs_qgroup *cur; LIST_HEAD(qgroup_list); u64 num_bytes = src->excl; int ret = 0; qgroup = find_qgroup_rb(fs_info, ref_root); if (!qgroup) goto out; qgroup_iterator_add(&qgroup_list, qgroup); list_for_each_entry(cur, &qgroup_list, iterator) { struct btrfs_qgroup_list *glist; qgroup->rfer += sign * num_bytes; qgroup->rfer_cmpr += sign * num_bytes; WARN_ON(sign < 0 && qgroup->excl < num_bytes); qgroup->excl += sign * num_bytes; qgroup->excl_cmpr += sign * num_bytes; if (sign > 0) qgroup_rsv_add_by_qgroup(fs_info, qgroup, src); else qgroup_rsv_release_by_qgroup(fs_info, qgroup, src); qgroup_dirty(fs_info, qgroup); /* Append parent qgroups to @qgroup_list. */ list_for_each_entry(glist, &qgroup->groups, next_group) qgroup_iterator_add(&qgroup_list, glist->group); } ret = 0; out: qgroup_iterator_clean(&qgroup_list); return ret; } /* * Quick path for updating qgroup with only excl refs. * * In that case, just update all parent will be enough. * Or we needs to do a full rescan. * Caller should also hold fs_info->qgroup_lock. * * Return 0 for quick update, return >0 for need to full rescan * and mark INCONSISTENT flag. * Return < 0 for other error. */ static int quick_update_accounting(struct btrfs_fs_info *fs_info, u64 src, u64 dst, int sign) { struct btrfs_qgroup *qgroup; int ret = 1; int err = 0; qgroup = find_qgroup_rb(fs_info, src); if (!qgroup) goto out; if (qgroup->excl == qgroup->rfer) { ret = 0; err = __qgroup_excl_accounting(fs_info, dst, qgroup, sign); if (err < 0) { ret = err; goto out; } } out: if (ret) fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; return ret; } int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans, u64 src, u64 dst) { struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_qgroup *parent; struct btrfs_qgroup *member; struct btrfs_qgroup_list *list; struct btrfs_qgroup_list *prealloc = NULL; int ret = 0; /* Check the level of src and dst first */ if (btrfs_qgroup_level(src) >= btrfs_qgroup_level(dst)) return -EINVAL; mutex_lock(&fs_info->qgroup_ioctl_lock); if (!fs_info->quota_root) { ret = -ENOTCONN; goto out; } member = find_qgroup_rb(fs_info, src); parent = find_qgroup_rb(fs_info, dst); if (!member || !parent) { ret = -EINVAL; goto out; } /* check if such qgroup relation exist firstly */ list_for_each_entry(list, &member->groups, next_group) { if (list->group == parent) { ret = -EEXIST; goto out; } } prealloc = kzalloc(sizeof(*list), GFP_NOFS); if (!prealloc) { ret = -ENOMEM; goto out; } ret = add_qgroup_relation_item(trans, src, dst); if (ret) goto out; ret = add_qgroup_relation_item(trans, dst, src); if (ret) { del_qgroup_relation_item(trans, src, dst); goto out; } spin_lock(&fs_info->qgroup_lock); ret = __add_relation_rb(prealloc, member, parent); prealloc = NULL; if (ret < 0) { spin_unlock(&fs_info->qgroup_lock); goto out; } ret = quick_update_accounting(fs_info, src, dst, 1); spin_unlock(&fs_info->qgroup_lock); out: kfree(prealloc); mutex_unlock(&fs_info->qgroup_ioctl_lock); return ret; } static int __del_qgroup_relation(struct btrfs_trans_handle *trans, u64 src, u64 dst) { struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_qgroup *parent; struct btrfs_qgroup *member; struct btrfs_qgroup_list *list; bool found = false; int ret = 0; int ret2; if (!fs_info->quota_root) { ret = -ENOTCONN; goto out; } member = find_qgroup_rb(fs_info, src); parent = find_qgroup_rb(fs_info, dst); /* * The parent/member pair doesn't exist, then try to delete the dead * relation items only. */ if (!member || !parent) goto delete_item; /* check if such qgroup relation exist firstly */ list_for_each_entry(list, &member->groups, next_group) { if (list->group == parent) { found = true; break; } } delete_item: ret = del_qgroup_relation_item(trans, src, dst); if (ret < 0 && ret != -ENOENT) goto out; ret2 = del_qgroup_relation_item(trans, dst, src); if (ret2 < 0 && ret2 != -ENOENT) goto out; /* At least one deletion succeeded, return 0 */ if (!ret || !ret2) ret = 0; if (found) { spin_lock(&fs_info->qgroup_lock); del_relation_rb(fs_info, src, dst); ret = quick_update_accounting(fs_info, src, dst, -1); spin_unlock(&fs_info->qgroup_lock); } out: return ret; } int btrfs_del_qgroup_relation(struct btrfs_trans_handle *trans, u64 src, u64 dst) { struct btrfs_fs_info *fs_info = trans->fs_info; int ret = 0; mutex_lock(&fs_info->qgroup_ioctl_lock); ret = __del_qgroup_relation(trans, src, dst); mutex_unlock(&fs_info->qgroup_ioctl_lock); return ret; } int btrfs_create_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid) { struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_root *quota_root; struct btrfs_qgroup *qgroup; struct btrfs_qgroup *prealloc = NULL; int ret = 0; if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_DISABLED) return 0; mutex_lock(&fs_info->qgroup_ioctl_lock); if (!fs_info->quota_root) { ret = -ENOTCONN; goto out; } quota_root = fs_info->quota_root; qgroup = find_qgroup_rb(fs_info, qgroupid); if (qgroup) { ret = -EEXIST; goto out; } prealloc = kzalloc(sizeof(*prealloc), GFP_NOFS); if (!prealloc) { ret = -ENOMEM; goto out; } ret = add_qgroup_item(trans, quota_root, qgroupid); if (ret) goto out; spin_lock(&fs_info->qgroup_lock); qgroup = add_qgroup_rb(fs_info, prealloc, qgroupid); spin_unlock(&fs_info->qgroup_lock); prealloc = NULL; ret = btrfs_sysfs_add_one_qgroup(fs_info, qgroup); out: mutex_unlock(&fs_info->qgroup_ioctl_lock); kfree(prealloc); return ret; } static bool qgroup_has_usage(struct btrfs_qgroup *qgroup) { return (qgroup->rfer > 0 || qgroup->rfer_cmpr > 0 || qgroup->excl > 0 || qgroup->excl_cmpr > 0 || qgroup->rsv.values[BTRFS_QGROUP_RSV_DATA] > 0 || qgroup->rsv.values[BTRFS_QGROUP_RSV_META_PREALLOC] > 0 || qgroup->rsv.values[BTRFS_QGROUP_RSV_META_PERTRANS] > 0); } int btrfs_remove_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid) { struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_qgroup *qgroup; struct btrfs_qgroup_list *list; int ret = 0; mutex_lock(&fs_info->qgroup_ioctl_lock); if (!fs_info->quota_root) { ret = -ENOTCONN; goto out; } qgroup = find_qgroup_rb(fs_info, qgroupid); if (!qgroup) { ret = -ENOENT; goto out; } if (is_fstree(qgroupid) && qgroup_has_usage(qgroup)) { ret = -EBUSY; goto out; } /* Check if there are no children of this qgroup */ if (!list_empty(&qgroup->members)) { ret = -EBUSY; goto out; } ret = del_qgroup_item(trans, qgroupid); if (ret && ret != -ENOENT) goto out; while (!list_empty(&qgroup->groups)) { list = list_first_entry(&qgroup->groups, struct btrfs_qgroup_list, next_group); ret = __del_qgroup_relation(trans, qgroupid, list->group->qgroupid); if (ret) goto out; } spin_lock(&fs_info->qgroup_lock); del_qgroup_rb(fs_info, qgroupid); spin_unlock(&fs_info->qgroup_lock); /* * Remove the qgroup from sysfs now without holding the qgroup_lock * spinlock, since the sysfs_remove_group() function needs to take * the mutex kernfs_mutex through kernfs_remove_by_name_ns(). */ btrfs_sysfs_del_one_qgroup(fs_info, qgroup); kfree(qgroup); out: mutex_unlock(&fs_info->qgroup_ioctl_lock); return ret; } int btrfs_limit_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid, struct btrfs_qgroup_limit *limit) { struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_qgroup *qgroup; int ret = 0; /* Sometimes we would want to clear the limit on this qgroup. * To meet this requirement, we treat the -1 as a special value * which tell kernel to clear the limit on this qgroup. */ const u64 CLEAR_VALUE = -1; mutex_lock(&fs_info->qgroup_ioctl_lock); if (!fs_info->quota_root) { ret = -ENOTCONN; goto out; } qgroup = find_qgroup_rb(fs_info, qgroupid); if (!qgroup) { ret = -ENOENT; goto out; } spin_lock(&fs_info->qgroup_lock); if (limit->flags & BTRFS_QGROUP_LIMIT_MAX_RFER) { if (limit->max_rfer == CLEAR_VALUE) { qgroup->lim_flags &= ~BTRFS_QGROUP_LIMIT_MAX_RFER; limit->flags &= ~BTRFS_QGROUP_LIMIT_MAX_RFER; qgroup->max_rfer = 0; } else { qgroup->max_rfer = limit->max_rfer; } } if (limit->flags & BTRFS_QGROUP_LIMIT_MAX_EXCL) { if (limit->max_excl == CLEAR_VALUE) { qgroup->lim_flags &= ~BTRFS_QGROUP_LIMIT_MAX_EXCL; limit->flags &= ~BTRFS_QGROUP_LIMIT_MAX_EXCL; qgroup->max_excl = 0; } else { qgroup->max_excl = limit->max_excl; } } if (limit->flags & BTRFS_QGROUP_LIMIT_RSV_RFER) { if (limit->rsv_rfer == CLEAR_VALUE) { qgroup->lim_flags &= ~BTRFS_QGROUP_LIMIT_RSV_RFER; limit->flags &= ~BTRFS_QGROUP_LIMIT_RSV_RFER; qgroup->rsv_rfer = 0; } else { qgroup->rsv_rfer = limit->rsv_rfer; } } if (limit->flags & BTRFS_QGROUP_LIMIT_RSV_EXCL) { if (limit->rsv_excl == CLEAR_VALUE) { qgroup->lim_flags &= ~BTRFS_QGROUP_LIMIT_RSV_EXCL; limit->flags &= ~BTRFS_QGROUP_LIMIT_RSV_EXCL; qgroup->rsv_excl = 0; } else { qgroup->rsv_excl = limit->rsv_excl; } } qgroup->lim_flags |= limit->flags; spin_unlock(&fs_info->qgroup_lock); ret = update_qgroup_limit_item(trans, qgroup); if (ret) { qgroup_mark_inconsistent(fs_info); btrfs_info(fs_info, "unable to update quota limit for %llu", qgroupid); } out: mutex_unlock(&fs_info->qgroup_ioctl_lock); return ret; } /* * Inform qgroup to trace one dirty extent, its info is recorded in @record. * So qgroup can account it at transaction committing time. * * No lock version, caller must acquire delayed ref lock and allocated memory, * then call btrfs_qgroup_trace_extent_post() after exiting lock context. * * Return 0 for success insert * Return >0 for existing record, caller can free @record safely. * Error is not possible */ int btrfs_qgroup_trace_extent_nolock(struct btrfs_fs_info *fs_info, struct btrfs_delayed_ref_root *delayed_refs, struct btrfs_qgroup_extent_record *record) { struct rb_node **p = &delayed_refs->dirty_extent_root.rb_node; struct rb_node *parent_node = NULL; struct btrfs_qgroup_extent_record *entry; u64 bytenr = record->bytenr; if (!btrfs_qgroup_full_accounting(fs_info)) return 1; lockdep_assert_held(&delayed_refs->lock); trace_btrfs_qgroup_trace_extent(fs_info, record); while (*p) { parent_node = *p; entry = rb_entry(parent_node, struct btrfs_qgroup_extent_record, node); if (bytenr < entry->bytenr) { p = &(*p)->rb_left; } else if (bytenr > entry->bytenr) { p = &(*p)->rb_right; } else { if (record->data_rsv && !entry->data_rsv) { entry->data_rsv = record->data_rsv; entry->data_rsv_refroot = record->data_rsv_refroot; } return 1; } } rb_link_node(&record->node, parent_node, p); rb_insert_color(&record->node, &delayed_refs->dirty_extent_root); return 0; } /* * Post handler after qgroup_trace_extent_nolock(). * * NOTE: Current qgroup does the expensive backref walk at transaction * committing time with TRANS_STATE_COMMIT_DOING, this blocks incoming * new transaction. * This is designed to allow btrfs_find_all_roots() to get correct new_roots * result. * * However for old_roots there is no need to do backref walk at that time, * since we search commit roots to walk backref and result will always be * correct. * * Due to the nature of no lock version, we can't do backref there. * So we must call btrfs_qgroup_trace_extent_post() after exiting * spinlock context. * * TODO: If we can fix and prove btrfs_find_all_roots() can get correct result * using current root, then we can move all expensive backref walk out of * transaction committing, but not now as qgroup accounting will be wrong again. */ int btrfs_qgroup_trace_extent_post(struct btrfs_trans_handle *trans, struct btrfs_qgroup_extent_record *qrecord) { struct btrfs_backref_walk_ctx ctx = { 0 }; int ret; if (!btrfs_qgroup_full_accounting(trans->fs_info)) return 0; /* * We are always called in a context where we are already holding a * transaction handle. Often we are called when adding a data delayed * reference from btrfs_truncate_inode_items() (truncating or unlinking), * in which case we will be holding a write lock on extent buffer from a * subvolume tree. In this case we can't allow btrfs_find_all_roots() to * acquire fs_info->commit_root_sem, because that is a higher level lock * that must be acquired before locking any extent buffers. * * So we want btrfs_find_all_roots() to not acquire the commit_root_sem * but we can't pass it a non-NULL transaction handle, because otherwise * it would not use commit roots and would lock extent buffers, causing * a deadlock if it ends up trying to read lock the same extent buffer * that was previously write locked at btrfs_truncate_inode_items(). * * So pass a NULL transaction handle to btrfs_find_all_roots() and * explicitly tell it to not acquire the commit_root_sem - if we are * holding a transaction handle we don't need its protection. */ ASSERT(trans != NULL); if (trans->fs_info->qgroup_flags & BTRFS_QGROUP_RUNTIME_FLAG_NO_ACCOUNTING) return 0; ctx.bytenr = qrecord->bytenr; ctx.fs_info = trans->fs_info; ret = btrfs_find_all_roots(&ctx, true); if (ret < 0) { qgroup_mark_inconsistent(trans->fs_info); btrfs_warn(trans->fs_info, "error accounting new delayed refs extent (err code: %d), quota inconsistent", ret); return 0; } /* * Here we don't need to get the lock of * trans->transaction->delayed_refs, since inserted qrecord won't * be deleted, only qrecord->node may be modified (new qrecord insert) * * So modifying qrecord->old_roots is safe here */ qrecord->old_roots = ctx.roots; return 0; } /* * Inform qgroup to trace one dirty extent, specified by @bytenr and * @num_bytes. * So qgroup can account it at commit trans time. * * Better encapsulated version, with memory allocation and backref walk for * commit roots. * So this can sleep. * * Return 0 if the operation is done. * Return <0 for error, like memory allocation failure or invalid parameter * (NULL trans) */ int btrfs_qgroup_trace_extent(struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes) { struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_qgroup_extent_record *record; struct btrfs_delayed_ref_root *delayed_refs; int ret; if (!btrfs_qgroup_full_accounting(fs_info) || bytenr == 0 || num_bytes == 0) return 0; record = kzalloc(sizeof(*record), GFP_NOFS); if (!record) return -ENOMEM; delayed_refs = &trans->transaction->delayed_refs; record->bytenr = bytenr; record->num_bytes = num_bytes; record->old_roots = NULL; spin_lock(&delayed_refs->lock); ret = btrfs_qgroup_trace_extent_nolock(fs_info, delayed_refs, record); spin_unlock(&delayed_refs->lock); if (ret > 0) { kfree(record); return 0; } return btrfs_qgroup_trace_extent_post(trans, record); } /* * Inform qgroup to trace all leaf items of data * * Return 0 for success * Return <0 for error(ENOMEM) */ int btrfs_qgroup_trace_leaf_items(struct btrfs_trans_handle *trans, struct extent_buffer *eb) { struct btrfs_fs_info *fs_info = trans->fs_info; int nr = btrfs_header_nritems(eb); int i, extent_type, ret; struct btrfs_key key; struct btrfs_file_extent_item *fi; u64 bytenr, num_bytes; /* We can be called directly from walk_up_proc() */ if (!btrfs_qgroup_full_accounting(fs_info)) return 0; for (i = 0; i < nr; i++) { btrfs_item_key_to_cpu(eb, &key, i); if (key.type != BTRFS_EXTENT_DATA_KEY) continue; fi = btrfs_item_ptr(eb, i, struct btrfs_file_extent_item); /* filter out non qgroup-accountable extents */ extent_type = btrfs_file_extent_type(eb, fi); if (extent_type == BTRFS_FILE_EXTENT_INLINE) continue; bytenr = btrfs_file_extent_disk_bytenr(eb, fi); if (!bytenr) continue; num_bytes = btrfs_file_extent_disk_num_bytes(eb, fi); ret = btrfs_qgroup_trace_extent(trans, bytenr, num_bytes); if (ret) return ret; } cond_resched(); return 0; } /* * Walk up the tree from the bottom, freeing leaves and any interior * nodes which have had all slots visited. If a node (leaf or * interior) is freed, the node above it will have it's slot * incremented. The root node will never be freed. * * At the end of this function, we should have a path which has all * slots incremented to the next position for a search. If we need to * read a new node it will be NULL and the node above it will have the * correct slot selected for a later read. * * If we increment the root nodes slot counter past the number of * elements, 1 is returned to signal completion of the search. */ static int adjust_slots_upwards(struct btrfs_path *path, int root_level) { int level = 0; int nr, slot; struct extent_buffer *eb; if (root_level == 0) return 1; while (level <= root_level) { eb = path->nodes[level]; nr = btrfs_header_nritems(eb); path->slots[level]++; slot = path->slots[level]; if (slot >= nr || level == 0) { /* * Don't free the root - we will detect this * condition after our loop and return a * positive value for caller to stop walking the tree. */ if (level != root_level) { btrfs_tree_unlock_rw(eb, path->locks[level]); path->locks[level] = 0; free_extent_buffer(eb); path->nodes[level] = NULL; path->slots[level] = 0; } } else { /* * We have a valid slot to walk back down * from. Stop here so caller can process these * new nodes. */ break; } level++; } eb = path->nodes[root_level]; if (path->slots[root_level] >= btrfs_header_nritems(eb)) return 1; return 0; } /* * Helper function to trace a subtree tree block swap. * * The swap will happen in highest tree block, but there may be a lot of * tree blocks involved. * * For example: * OO = Old tree blocks * NN = New tree blocks allocated during balance * * File tree (257) Reloc tree for 257 * L2 OO NN * / \ / \ * L1 OO OO (a) OO NN (a) * / \ / \ / \ / \ * L0 OO OO OO OO OO OO NN NN * (b) (c) (b) (c) * * When calling qgroup_trace_extent_swap(), we will pass: * @src_eb = OO(a) * @dst_path = [ nodes[1] = NN(a), nodes[0] = NN(c) ] * @dst_level = 0 * @root_level = 1 * * In that case, qgroup_trace_extent_swap() will search from OO(a) to * reach OO(c), then mark both OO(c) and NN(c) as qgroup dirty. * * The main work of qgroup_trace_extent_swap() can be split into 3 parts: * * 1) Tree search from @src_eb * It should acts as a simplified btrfs_search_slot(). * The key for search can be extracted from @dst_path->nodes[dst_level] * (first key). * * 2) Mark the final tree blocks in @src_path and @dst_path qgroup dirty * NOTE: In above case, OO(a) and NN(a) won't be marked qgroup dirty. * They should be marked during previous (@dst_level = 1) iteration. * * 3) Mark file extents in leaves dirty * We don't have good way to pick out new file extents only. * So we still follow the old method by scanning all file extents in * the leave. * * This function can free us from keeping two paths, thus later we only need * to care about how to iterate all new tree blocks in reloc tree. */ static int qgroup_trace_extent_swap(struct btrfs_trans_handle* trans, struct extent_buffer *src_eb, struct btrfs_path *dst_path, int dst_level, int root_level, bool trace_leaf) { struct btrfs_key key; struct btrfs_path *src_path; struct btrfs_fs_info *fs_info = trans->fs_info; u32 nodesize = fs_info->nodesize; int cur_level = root_level; int ret; BUG_ON(dst_level > root_level); /* Level mismatch */ if (btrfs_header_level(src_eb) != root_level) return -EINVAL; src_path = btrfs_alloc_path(); if (!src_path) { ret = -ENOMEM; goto out; } if (dst_level) btrfs_node_key_to_cpu(dst_path->nodes[dst_level], &key, 0); else btrfs_item_key_to_cpu(dst_path->nodes[dst_level], &key, 0); /* For src_path */ atomic_inc(&src_eb->refs); src_path->nodes[root_level] = src_eb; src_path->slots[root_level] = dst_path->slots[root_level]; src_path->locks[root_level] = 0; /* A simplified version of btrfs_search_slot() */ while (cur_level >= dst_level) { struct btrfs_key src_key; struct btrfs_key dst_key; if (src_path->nodes[cur_level] == NULL) { struct extent_buffer *eb; int parent_slot; eb = src_path->nodes[cur_level + 1]; parent_slot = src_path->slots[cur_level + 1]; eb = btrfs_read_node_slot(eb, parent_slot); if (IS_ERR(eb)) { ret = PTR_ERR(eb); goto out; } src_path->nodes[cur_level] = eb; btrfs_tree_read_lock(eb); src_path->locks[cur_level] = BTRFS_READ_LOCK; } src_path->slots[cur_level] = dst_path->slots[cur_level]; if (cur_level) { btrfs_node_key_to_cpu(dst_path->nodes[cur_level], &dst_key, dst_path->slots[cur_level]); btrfs_node_key_to_cpu(src_path->nodes[cur_level], &src_key, src_path->slots[cur_level]); } else { btrfs_item_key_to_cpu(dst_path->nodes[cur_level], &dst_key, dst_path->slots[cur_level]); btrfs_item_key_to_cpu(src_path->nodes[cur_level], &src_key, src_path->slots[cur_level]); } /* Content mismatch, something went wrong */ if (btrfs_comp_cpu_keys(&dst_key, &src_key)) { ret = -ENOENT; goto out; } cur_level--; } /* * Now both @dst_path and @src_path have been populated, record the tree * blocks for qgroup accounting. */ ret = btrfs_qgroup_trace_extent(trans, src_path->nodes[dst_level]->start, nodesize); if (ret < 0) goto out; ret = btrfs_qgroup_trace_extent(trans, dst_path->nodes[dst_level]->start, nodesize); if (ret < 0) goto out; /* Record leaf file extents */ if (dst_level == 0 && trace_leaf) { ret = btrfs_qgroup_trace_leaf_items(trans, src_path->nodes[0]); if (ret < 0) goto out; ret = btrfs_qgroup_trace_leaf_items(trans, dst_path->nodes[0]); } out: btrfs_free_path(src_path); return ret; } /* * Helper function to do recursive generation-aware depth-first search, to * locate all new tree blocks in a subtree of reloc tree. * * E.g. (OO = Old tree blocks, NN = New tree blocks, whose gen == last_snapshot) * reloc tree * L2 NN (a) * / \ * L1 OO NN (b) * / \ / \ * L0 OO OO OO NN * (c) (d) * If we pass: * @dst_path = [ nodes[1] = NN(b), nodes[0] = NULL ], * @cur_level = 1 * @root_level = 1 * * We will iterate through tree blocks NN(b), NN(d) and info qgroup to trace * above tree blocks along with their counter parts in file tree. * While during search, old tree blocks OO(c) will be skipped as tree block swap * won't affect OO(c). */ static int qgroup_trace_new_subtree_blocks(struct btrfs_trans_handle* trans, struct extent_buffer *src_eb, struct btrfs_path *dst_path, int cur_level, int root_level, u64 last_snapshot, bool trace_leaf) { struct btrfs_fs_info *fs_info = trans->fs_info; struct extent_buffer *eb; bool need_cleanup = false; int ret = 0; int i; /* Level sanity check */ if (cur_level < 0 || cur_level >= BTRFS_MAX_LEVEL - 1 || root_level < 0 || root_level >= BTRFS_MAX_LEVEL - 1 || root_level < cur_level) { btrfs_err_rl(fs_info, "%s: bad levels, cur_level=%d root_level=%d", __func__, cur_level, root_level); return -EUCLEAN; } /* Read the tree block if needed */ if (dst_path->nodes[cur_level] == NULL) { int parent_slot; u64 child_gen; /* * dst_path->nodes[root_level] must be initialized before * calling this function. */ if (cur_level == root_level) { btrfs_err_rl(fs_info, "%s: dst_path->nodes[%d] not initialized, root_level=%d cur_level=%d", __func__, root_level, root_level, cur_level); return -EUCLEAN; } /* * We need to get child blockptr/gen from parent before we can * read it. */ eb = dst_path->nodes[cur_level + 1]; parent_slot = dst_path->slots[cur_level + 1]; child_gen = btrfs_node_ptr_generation(eb, parent_slot); /* This node is old, no need to trace */ if (child_gen < last_snapshot) goto out; eb = btrfs_read_node_slot(eb, parent_slot); if (IS_ERR(eb)) { ret = PTR_ERR(eb); goto out; } dst_path->nodes[cur_level] = eb; dst_path->slots[cur_level] = 0; btrfs_tree_read_lock(eb); dst_path->locks[cur_level] = BTRFS_READ_LOCK; need_cleanup = true; } /* Now record this tree block and its counter part for qgroups */ ret = qgroup_trace_extent_swap(trans, src_eb, dst_path, cur_level, root_level, trace_leaf); if (ret < 0) goto cleanup; eb = dst_path->nodes[cur_level]; if (cur_level > 0) { /* Iterate all child tree blocks */ for (i = 0; i < btrfs_header_nritems(eb); i++) { /* Skip old tree blocks as they won't be swapped */ if (btrfs_node_ptr_generation(eb, i) < last_snapshot) continue; dst_path->slots[cur_level] = i; /* Recursive call (at most 7 times) */ ret = qgroup_trace_new_subtree_blocks(trans, src_eb, dst_path, cur_level - 1, root_level, last_snapshot, trace_leaf); if (ret < 0) goto cleanup; } } cleanup: if (need_cleanup) { /* Clean up */ btrfs_tree_unlock_rw(dst_path->nodes[cur_level], dst_path->locks[cur_level]); free_extent_buffer(dst_path->nodes[cur_level]); dst_path->nodes[cur_level] = NULL; dst_path->slots[cur_level] = 0; dst_path->locks[cur_level] = 0; } out: return ret; } static int qgroup_trace_subtree_swap(struct btrfs_trans_handle *trans, struct extent_buffer *src_eb, struct extent_buffer *dst_eb, u64 last_snapshot, bool trace_leaf) { struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_path *dst_path = NULL; int level; int ret; if (!btrfs_qgroup_full_accounting(fs_info)) return 0; /* Wrong parameter order */ if (btrfs_header_generation(src_eb) > btrfs_header_generation(dst_eb)) { btrfs_err_rl(fs_info, "%s: bad parameter order, src_gen=%llu dst_gen=%llu", __func__, btrfs_header_generation(src_eb), btrfs_header_generation(dst_eb)); return -EUCLEAN; } if (!extent_buffer_uptodate(src_eb) || !extent_buffer_uptodate(dst_eb)) { ret = -EIO; goto out; } level = btrfs_header_level(dst_eb); dst_path = btrfs_alloc_path(); if (!dst_path) { ret = -ENOMEM; goto out; } /* For dst_path */ atomic_inc(&dst_eb->refs); dst_path->nodes[level] = dst_eb; dst_path->slots[level] = 0; dst_path->locks[level] = 0; /* Do the generation aware breadth-first search */ ret = qgroup_trace_new_subtree_blocks(trans, src_eb, dst_path, level, level, last_snapshot, trace_leaf); if (ret < 0) goto out; ret = 0; out: btrfs_free_path(dst_path); if (ret < 0) qgroup_mark_inconsistent(fs_info); return ret; } /* * Inform qgroup to trace a whole subtree, including all its child tree * blocks and data. * The root tree block is specified by @root_eb. * * Normally used by relocation(tree block swap) and subvolume deletion. * * Return 0 for success * Return <0 for error(ENOMEM or tree search error) */ int btrfs_qgroup_trace_subtree(struct btrfs_trans_handle *trans, struct extent_buffer *root_eb, u64 root_gen, int root_level) { struct btrfs_fs_info *fs_info = trans->fs_info; int ret = 0; int level; u8 drop_subptree_thres; struct extent_buffer *eb = root_eb; struct btrfs_path *path = NULL; BUG_ON(root_level < 0 || root_level >= BTRFS_MAX_LEVEL); BUG_ON(root_eb == NULL); if (!btrfs_qgroup_full_accounting(fs_info)) return 0; spin_lock(&fs_info->qgroup_lock); drop_subptree_thres = fs_info->qgroup_drop_subtree_thres; spin_unlock(&fs_info->qgroup_lock); /* * This function only gets called for snapshot drop, if we hit a high * node here, it means we are going to change ownership for quite a lot * of extents, which will greatly slow down btrfs_commit_transaction(). * * So here if we find a high tree here, we just skip the accounting and * mark qgroup inconsistent. */ if (root_level >= drop_subptree_thres) { qgroup_mark_inconsistent(fs_info); return 0; } if (!extent_buffer_uptodate(root_eb)) { struct btrfs_tree_parent_check check = { .has_first_key = false, .transid = root_gen, .level = root_level }; ret = btrfs_read_extent_buffer(root_eb, &check); if (ret) goto out; } if (root_level == 0) { ret = btrfs_qgroup_trace_leaf_items(trans, root_eb); goto out; } path = btrfs_alloc_path(); if (!path) return -ENOMEM; /* * Walk down the tree. Missing extent blocks are filled in as * we go. Metadata is accounted every time we read a new * extent block. * * When we reach a leaf, we account for file extent items in it, * walk back up the tree (adjusting slot pointers as we go) * and restart the search process. */ atomic_inc(&root_eb->refs); /* For path */ path->nodes[root_level] = root_eb; path->slots[root_level] = 0; path->locks[root_level] = 0; /* so release_path doesn't try to unlock */ walk_down: level = root_level; while (level >= 0) { if (path->nodes[level] == NULL) { int parent_slot; u64 child_bytenr; /* * We need to get child blockptr from parent before we * can read it. */ eb = path->nodes[level + 1]; parent_slot = path->slots[level + 1]; child_bytenr = btrfs_node_blockptr(eb, parent_slot); eb = btrfs_read_node_slot(eb, parent_slot); if (IS_ERR(eb)) { ret = PTR_ERR(eb); goto out; } path->nodes[level] = eb; path->slots[level] = 0; btrfs_tree_read_lock(eb); path->locks[level] = BTRFS_READ_LOCK; ret = btrfs_qgroup_trace_extent(trans, child_bytenr, fs_info->nodesize); if (ret) goto out; } if (level == 0) { ret = btrfs_qgroup_trace_leaf_items(trans, path->nodes[level]); if (ret) goto out; /* Nonzero return here means we completed our search */ ret = adjust_slots_upwards(path, root_level); if (ret) break; /* Restart search with new slots */ goto walk_down; } level--; } ret = 0; out: btrfs_free_path(path); return ret; } static void qgroup_iterator_nested_add(struct list_head *head, struct btrfs_qgroup *qgroup) { if (!list_empty(&qgroup->nested_iterator)) return; list_add_tail(&qgroup->nested_iterator, head); } static void qgroup_iterator_nested_clean(struct list_head *head) { while (!list_empty(head)) { struct btrfs_qgroup *qgroup; qgroup = list_first_entry(head, struct btrfs_qgroup, nested_iterator); list_del_init(&qgroup->nested_iterator); } } #define UPDATE_NEW 0 #define UPDATE_OLD 1 /* * Walk all of the roots that points to the bytenr and adjust their refcnts. */ static void qgroup_update_refcnt(struct btrfs_fs_info *fs_info, struct ulist *roots, struct list_head *qgroups, u64 seq, int update_old) { struct ulist_node *unode; struct ulist_iterator uiter; struct btrfs_qgroup *qg; if (!roots) return; ULIST_ITER_INIT(&uiter); while ((unode = ulist_next(roots, &uiter))) { LIST_HEAD(tmp); qg = find_qgroup_rb(fs_info, unode->val); if (!qg) continue; qgroup_iterator_nested_add(qgroups, qg); qgroup_iterator_add(&tmp, qg); list_for_each_entry(qg, &tmp, iterator) { struct btrfs_qgroup_list *glist; if (update_old) btrfs_qgroup_update_old_refcnt(qg, seq, 1); else btrfs_qgroup_update_new_refcnt(qg, seq, 1); list_for_each_entry(glist, &qg->groups, next_group) { qgroup_iterator_nested_add(qgroups, glist->group); qgroup_iterator_add(&tmp, glist->group); } } qgroup_iterator_clean(&tmp); } } /* * Update qgroup rfer/excl counters. * Rfer update is easy, codes can explain themselves. * * Excl update is tricky, the update is split into 2 parts. * Part 1: Possible exclusive <-> sharing detect: * | A | !A | * ------------------------------------- * B | * | - | * ------------------------------------- * !B | + | ** | * ------------------------------------- * * Conditions: * A: cur_old_roots < nr_old_roots (not exclusive before) * !A: cur_old_roots == nr_old_roots (possible exclusive before) * B: cur_new_roots < nr_new_roots (not exclusive now) * !B: cur_new_roots == nr_new_roots (possible exclusive now) * * Results: * +: Possible sharing -> exclusive -: Possible exclusive -> sharing * *: Definitely not changed. **: Possible unchanged. * * For !A and !B condition, the exception is cur_old/new_roots == 0 case. * * To make the logic clear, we first use condition A and B to split * combination into 4 results. * * Then, for result "+" and "-", check old/new_roots == 0 case, as in them * only on variant maybe 0. * * Lastly, check result **, since there are 2 variants maybe 0, split them * again(2x2). * But this time we don't need to consider other things, the codes and logic * is easy to understand now. */ static void qgroup_update_counters(struct btrfs_fs_info *fs_info, struct list_head *qgroups, u64 nr_old_roots, u64 nr_new_roots, u64 num_bytes, u64 seq) { struct btrfs_qgroup *qg; list_for_each_entry(qg, qgroups, nested_iterator) { u64 cur_new_count, cur_old_count; bool dirty = false; cur_old_count = btrfs_qgroup_get_old_refcnt(qg, seq); cur_new_count = btrfs_qgroup_get_new_refcnt(qg, seq); trace_qgroup_update_counters(fs_info, qg, cur_old_count, cur_new_count); /* Rfer update part */ if (cur_old_count == 0 && cur_new_count > 0) { qg->rfer += num_bytes; qg->rfer_cmpr += num_bytes; dirty = true; } if (cur_old_count > 0 && cur_new_count == 0) { qg->rfer -= num_bytes; qg->rfer_cmpr -= num_bytes; dirty = true; } /* Excl update part */ /* Exclusive/none -> shared case */ if (cur_old_count == nr_old_roots && cur_new_count < nr_new_roots) { /* Exclusive -> shared */ if (cur_old_count != 0) { qg->excl -= num_bytes; qg->excl_cmpr -= num_bytes; dirty = true; } } /* Shared -> exclusive/none case */ if (cur_old_count < nr_old_roots && cur_new_count == nr_new_roots) { /* Shared->exclusive */ if (cur_new_count != 0) { qg->excl += num_bytes; qg->excl_cmpr += num_bytes; dirty = true; } } /* Exclusive/none -> exclusive/none case */ if (cur_old_count == nr_old_roots && cur_new_count == nr_new_roots) { if (cur_old_count == 0) { /* None -> exclusive/none */ if (cur_new_count != 0) { /* None -> exclusive */ qg->excl += num_bytes; qg->excl_cmpr += num_bytes; dirty = true; } /* None -> none, nothing changed */ } else { /* Exclusive -> exclusive/none */ if (cur_new_count == 0) { /* Exclusive -> none */ qg->excl -= num_bytes; qg->excl_cmpr -= num_bytes; dirty = true; } /* Exclusive -> exclusive, nothing changed */ } } if (dirty) qgroup_dirty(fs_info, qg); } } /* * Check if the @roots potentially is a list of fs tree roots * * Return 0 for definitely not a fs/subvol tree roots ulist * Return 1 for possible fs/subvol tree roots in the list (considering an empty * one as well) */ static int maybe_fs_roots(struct ulist *roots) { struct ulist_node *unode; struct ulist_iterator uiter; /* Empty one, still possible for fs roots */ if (!roots || roots->nnodes == 0) return 1; ULIST_ITER_INIT(&uiter); unode = ulist_next(roots, &uiter); if (!unode) return 1; /* * If it contains fs tree roots, then it must belong to fs/subvol * trees. * If it contains a non-fs tree, it won't be shared with fs/subvol trees. */ return is_fstree(unode->val); } int btrfs_qgroup_account_extent(struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes, struct ulist *old_roots, struct ulist *new_roots) { struct btrfs_fs_info *fs_info = trans->fs_info; LIST_HEAD(qgroups); u64 seq; u64 nr_new_roots = 0; u64 nr_old_roots = 0; int ret = 0; /* * If quotas get disabled meanwhile, the resources need to be freed and * we can't just exit here. */ if (!btrfs_qgroup_full_accounting(fs_info) || fs_info->qgroup_flags & BTRFS_QGROUP_RUNTIME_FLAG_NO_ACCOUNTING) goto out_free; if (new_roots) { if (!maybe_fs_roots(new_roots)) goto out_free; nr_new_roots = new_roots->nnodes; } if (old_roots) { if (!maybe_fs_roots(old_roots)) goto out_free; nr_old_roots = old_roots->nnodes; } /* Quick exit, either not fs tree roots, or won't affect any qgroup */ if (nr_old_roots == 0 && nr_new_roots == 0) goto out_free; BUG_ON(!fs_info->quota_root); trace_btrfs_qgroup_account_extent(fs_info, trans->transid, bytenr, num_bytes, nr_old_roots, nr_new_roots); mutex_lock(&fs_info->qgroup_rescan_lock); if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) { if (fs_info->qgroup_rescan_progress.objectid <= bytenr) { mutex_unlock(&fs_info->qgroup_rescan_lock); ret = 0; goto out_free; } } mutex_unlock(&fs_info->qgroup_rescan_lock); spin_lock(&fs_info->qgroup_lock); seq = fs_info->qgroup_seq; /* Update old refcnts using old_roots */ qgroup_update_refcnt(fs_info, old_roots, &qgroups, seq, UPDATE_OLD); /* Update new refcnts using new_roots */ qgroup_update_refcnt(fs_info, new_roots, &qgroups, seq, UPDATE_NEW); qgroup_update_counters(fs_info, &qgroups, nr_old_roots, nr_new_roots, num_bytes, seq); /* * We're done using the iterator, release all its qgroups while holding * fs_info->qgroup_lock so that we don't race with btrfs_remove_qgroup() * and trigger use-after-free accesses to qgroups. */ qgroup_iterator_nested_clean(&qgroups); /* * Bump qgroup_seq to avoid seq overlap */ fs_info->qgroup_seq += max(nr_old_roots, nr_new_roots) + 1; spin_unlock(&fs_info->qgroup_lock); out_free: ulist_free(old_roots); ulist_free(new_roots); return ret; } int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans) { struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_qgroup_extent_record *record; struct btrfs_delayed_ref_root *delayed_refs; struct ulist *new_roots = NULL; struct rb_node *node; u64 num_dirty_extents = 0; u64 qgroup_to_skip; int ret = 0; if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_SIMPLE) return 0; delayed_refs = &trans->transaction->delayed_refs; qgroup_to_skip = delayed_refs->qgroup_to_skip; while ((node = rb_first(&delayed_refs->dirty_extent_root))) { record = rb_entry(node, struct btrfs_qgroup_extent_record, node); num_dirty_extents++; trace_btrfs_qgroup_account_extents(fs_info, record); if (!ret && !(fs_info->qgroup_flags & BTRFS_QGROUP_RUNTIME_FLAG_NO_ACCOUNTING)) { struct btrfs_backref_walk_ctx ctx = { 0 }; ctx.bytenr = record->bytenr; ctx.fs_info = fs_info; /* * Old roots should be searched when inserting qgroup * extent record. * * But for INCONSISTENT (NO_ACCOUNTING) -> rescan case, * we may have some record inserted during * NO_ACCOUNTING (thus no old_roots populated), but * later we start rescan, which clears NO_ACCOUNTING, * leaving some inserted records without old_roots * populated. * * Those cases are rare and should not cause too much * time spent during commit_transaction(). */ if (!record->old_roots) { /* Search commit root to find old_roots */ ret = btrfs_find_all_roots(&ctx, false); if (ret < 0) goto cleanup; record->old_roots = ctx.roots; ctx.roots = NULL; } /* Free the reserved data space */ btrfs_qgroup_free_refroot(fs_info, record->data_rsv_refroot, record->data_rsv, BTRFS_QGROUP_RSV_DATA); /* * Use BTRFS_SEQ_LAST as time_seq to do special search, * which doesn't lock tree or delayed_refs and search * current root. It's safe inside commit_transaction(). */ ctx.trans = trans; ctx.time_seq = BTRFS_SEQ_LAST; ret = btrfs_find_all_roots(&ctx, false); if (ret < 0) goto cleanup; new_roots = ctx.roots; if (qgroup_to_skip) { ulist_del(new_roots, qgroup_to_skip, 0); ulist_del(record->old_roots, qgroup_to_skip, 0); } ret = btrfs_qgroup_account_extent(trans, record->bytenr, record->num_bytes, record->old_roots, new_roots); record->old_roots = NULL; new_roots = NULL; } cleanup: ulist_free(record->old_roots); ulist_free(new_roots); new_roots = NULL; rb_erase(node, &delayed_refs->dirty_extent_root); kfree(record); } trace_qgroup_num_dirty_extents(fs_info, trans->transid, num_dirty_extents); return ret; } /* * Writes all changed qgroups to disk. * Called by the transaction commit path and the qgroup assign ioctl. */ int btrfs_run_qgroups(struct btrfs_trans_handle *trans) { struct btrfs_fs_info *fs_info = trans->fs_info; int ret = 0; /* * In case we are called from the qgroup assign ioctl, assert that we * are holding the qgroup_ioctl_lock, otherwise we can race with a quota * disable operation (ioctl) and access a freed quota root. */ if (trans->transaction->state != TRANS_STATE_COMMIT_DOING) lockdep_assert_held(&fs_info->qgroup_ioctl_lock); if (!fs_info->quota_root) return ret; spin_lock(&fs_info->qgroup_lock); while (!list_empty(&fs_info->dirty_qgroups)) { struct btrfs_qgroup *qgroup; qgroup = list_first_entry(&fs_info->dirty_qgroups, struct btrfs_qgroup, dirty); list_del_init(&qgroup->dirty); spin_unlock(&fs_info->qgroup_lock); ret = update_qgroup_info_item(trans, qgroup); if (ret) qgroup_mark_inconsistent(fs_info); ret = update_qgroup_limit_item(trans, qgroup); if (ret) qgroup_mark_inconsistent(fs_info); spin_lock(&fs_info->qgroup_lock); } if (btrfs_qgroup_enabled(fs_info)) fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_ON; else fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_ON; spin_unlock(&fs_info->qgroup_lock); ret = update_qgroup_status_item(trans); if (ret) qgroup_mark_inconsistent(fs_info); return ret; } static int qgroup_auto_inherit(struct btrfs_fs_info *fs_info, u64 inode_rootid, struct btrfs_qgroup_inherit **inherit) { int i = 0; u64 num_qgroups = 0; struct btrfs_qgroup *inode_qg; struct btrfs_qgroup_list *qg_list; struct btrfs_qgroup_inherit *res; size_t struct_sz; u64 *qgids; if (*inherit) return -EEXIST; inode_qg = find_qgroup_rb(fs_info, inode_rootid); if (!inode_qg) return -ENOENT; num_qgroups = list_count_nodes(&inode_qg->groups); if (!num_qgroups) return 0; struct_sz = struct_size(res, qgroups, num_qgroups); if (struct_sz == SIZE_MAX) return -ERANGE; res = kzalloc(struct_sz, GFP_NOFS); if (!res) return -ENOMEM; res->num_qgroups = num_qgroups; qgids = res->qgroups; list_for_each_entry(qg_list, &inode_qg->groups, next_group) qgids[i] = qg_list->group->qgroupid; *inherit = res; return 0; } /* * Copy the accounting information between qgroups. This is necessary * when a snapshot or a subvolume is created. Throwing an error will * cause a transaction abort so we take extra care here to only error * when a readonly fs is a reasonable outcome. */ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, u64 srcid, u64 objectid, u64 inode_rootid, struct btrfs_qgroup_inherit *inherit) { int ret = 0; int i; u64 *i_qgroups; bool committing = false; struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_root *quota_root; struct btrfs_qgroup *srcgroup; struct btrfs_qgroup *dstgroup; struct btrfs_qgroup *prealloc; struct btrfs_qgroup_list **qlist_prealloc = NULL; bool free_inherit = false; bool need_rescan = false; u32 level_size = 0; u64 nums; prealloc = kzalloc(sizeof(*prealloc), GFP_NOFS); if (!prealloc) return -ENOMEM; /* * There are only two callers of this function. * * One in create_subvol() in the ioctl context, which needs to hold * the qgroup_ioctl_lock. * * The other one in create_pending_snapshot() where no other qgroup * code can modify the fs as they all need to either start a new trans * or hold a trans handler, thus we don't need to hold * qgroup_ioctl_lock. * This would avoid long and complex lock chain and make lockdep happy. */ spin_lock(&fs_info->trans_lock); if (trans->transaction->state == TRANS_STATE_COMMIT_DOING) committing = true; spin_unlock(&fs_info->trans_lock); if (!committing) mutex_lock(&fs_info->qgroup_ioctl_lock); if (!btrfs_qgroup_enabled(fs_info)) goto out; quota_root = fs_info->quota_root; if (!quota_root) { ret = -EINVAL; goto out; } if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_SIMPLE && !inherit) { ret = qgroup_auto_inherit(fs_info, inode_rootid, &inherit); if (ret) goto out; free_inherit = true; } if (inherit) { i_qgroups = (u64 *)(inherit + 1); nums = inherit->num_qgroups + 2 * inherit->num_ref_copies + 2 * inherit->num_excl_copies; for (i = 0; i < nums; ++i) { srcgroup = find_qgroup_rb(fs_info, *i_qgroups); /* * Zero out invalid groups so we can ignore * them later. */ if (!srcgroup || ((srcgroup->qgroupid >> 48) <= (objectid >> 48))) *i_qgroups = 0ULL; ++i_qgroups; } } /* * create a tracking group for the subvol itself */ ret = add_qgroup_item(trans, quota_root, objectid); if (ret) goto out; /* * add qgroup to all inherited groups */ if (inherit) { i_qgroups = (u64 *)(inherit + 1); for (i = 0; i < inherit->num_qgroups; ++i, ++i_qgroups) { if (*i_qgroups == 0) continue; ret = add_qgroup_relation_item(trans, objectid, *i_qgroups); if (ret && ret != -EEXIST) goto out; ret = add_qgroup_relation_item(trans, *i_qgroups, objectid); if (ret && ret != -EEXIST) goto out; } ret = 0; qlist_prealloc = kcalloc(inherit->num_qgroups, sizeof(struct btrfs_qgroup_list *), GFP_NOFS); if (!qlist_prealloc) { ret = -ENOMEM; goto out; } for (int i = 0; i < inherit->num_qgroups; i++) { qlist_prealloc[i] = kzalloc(sizeof(struct btrfs_qgroup_list), GFP_NOFS); if (!qlist_prealloc[i]) { ret = -ENOMEM; goto out; } } } spin_lock(&fs_info->qgroup_lock); dstgroup = add_qgroup_rb(fs_info, prealloc, objectid); prealloc = NULL; if (inherit && inherit->flags & BTRFS_QGROUP_INHERIT_SET_LIMITS) { dstgroup->lim_flags = inherit->lim.flags; dstgroup->max_rfer = inherit->lim.max_rfer; dstgroup->max_excl = inherit->lim.max_excl; dstgroup->rsv_rfer = inherit->lim.rsv_rfer; dstgroup->rsv_excl = inherit->lim.rsv_excl; qgroup_dirty(fs_info, dstgroup); } if (srcid && btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_FULL) { srcgroup = find_qgroup_rb(fs_info, srcid); if (!srcgroup) goto unlock; /* * We call inherit after we clone the root in order to make sure * our counts don't go crazy, so at this point the only * difference between the two roots should be the root node. */ level_size = fs_info->nodesize; dstgroup->rfer = srcgroup->rfer; dstgroup->rfer_cmpr = srcgroup->rfer_cmpr; dstgroup->excl = level_size; dstgroup->excl_cmpr = level_size; srcgroup->excl = level_size; srcgroup->excl_cmpr = level_size; /* inherit the limit info */ dstgroup->lim_flags = srcgroup->lim_flags; dstgroup->max_rfer = srcgroup->max_rfer; dstgroup->max_excl = srcgroup->max_excl; dstgroup->rsv_rfer = srcgroup->rsv_rfer; dstgroup->rsv_excl = srcgroup->rsv_excl; qgroup_dirty(fs_info, dstgroup); qgroup_dirty(fs_info, srcgroup); } if (!inherit) goto unlock; i_qgroups = (u64 *)(inherit + 1); for (i = 0; i < inherit->num_qgroups; ++i) { if (*i_qgroups) { ret = add_relation_rb(fs_info, qlist_prealloc[i], objectid, *i_qgroups); qlist_prealloc[i] = NULL; if (ret) goto unlock; } ++i_qgroups; /* * If we're doing a snapshot, and adding the snapshot to a new * qgroup, the numbers are guaranteed to be incorrect. */ if (srcid) need_rescan = true; } for (i = 0; i < inherit->num_ref_copies; ++i, i_qgroups += 2) { struct btrfs_qgroup *src; struct btrfs_qgroup *dst; if (!i_qgroups[0] || !i_qgroups[1]) continue; src = find_qgroup_rb(fs_info, i_qgroups[0]); dst = find_qgroup_rb(fs_info, i_qgroups[1]); if (!src || !dst) { ret = -EINVAL; goto unlock; } dst->rfer = src->rfer - level_size; dst->rfer_cmpr = src->rfer_cmpr - level_size; /* Manually tweaking numbers certainly needs a rescan */ need_rescan = true; } for (i = 0; i < inherit->num_excl_copies; ++i, i_qgroups += 2) { struct btrfs_qgroup *src; struct btrfs_qgroup *dst; if (!i_qgroups[0] || !i_qgroups[1]) continue; src = find_qgroup_rb(fs_info, i_qgroups[0]); dst = find_qgroup_rb(fs_info, i_qgroups[1]); if (!src || !dst) { ret = -EINVAL; goto unlock; } dst->excl = src->excl + level_size; dst->excl_cmpr = src->excl_cmpr + level_size; need_rescan = true; } unlock: spin_unlock(&fs_info->qgroup_lock); if (!ret) ret = btrfs_sysfs_add_one_qgroup(fs_info, dstgroup); out: if (!committing) mutex_unlock(&fs_info->qgroup_ioctl_lock); if (need_rescan) qgroup_mark_inconsistent(fs_info); if (qlist_prealloc) { for (int i = 0; i < inherit->num_qgroups; i++) kfree(qlist_prealloc[i]); kfree(qlist_prealloc); } if (free_inherit) kfree(inherit); kfree(prealloc); return ret; } static bool qgroup_check_limits(const struct btrfs_qgroup *qg, u64 num_bytes) { if ((qg->lim_flags & BTRFS_QGROUP_LIMIT_MAX_RFER) && qgroup_rsv_total(qg) + (s64)qg->rfer + num_bytes > qg->max_rfer) return false; if ((qg->lim_flags & BTRFS_QGROUP_LIMIT_MAX_EXCL) && qgroup_rsv_total(qg) + (s64)qg->excl + num_bytes > qg->max_excl) return false; return true; } static int qgroup_reserve(struct btrfs_root *root, u64 num_bytes, bool enforce, enum btrfs_qgroup_rsv_type type) { struct btrfs_qgroup *qgroup; struct btrfs_fs_info *fs_info = root->fs_info; u64 ref_root = root->root_key.objectid; int ret = 0; LIST_HEAD(qgroup_list); if (!is_fstree(ref_root)) return 0; if (num_bytes == 0) return 0; if (test_bit(BTRFS_FS_QUOTA_OVERRIDE, &fs_info->flags) && capable(CAP_SYS_RESOURCE)) enforce = false; spin_lock(&fs_info->qgroup_lock); if (!fs_info->quota_root) goto out; qgroup = find_qgroup_rb(fs_info, ref_root); if (!qgroup) goto out; qgroup_iterator_add(&qgroup_list, qgroup); list_for_each_entry(qgroup, &qgroup_list, iterator) { struct btrfs_qgroup_list *glist; if (enforce && !qgroup_check_limits(qgroup, num_bytes)) { ret = -EDQUOT; goto out; } list_for_each_entry(glist, &qgroup->groups, next_group) qgroup_iterator_add(&qgroup_list, glist->group); } ret = 0; /* * no limits exceeded, now record the reservation into all qgroups */ list_for_each_entry(qgroup, &qgroup_list, iterator) qgroup_rsv_add(fs_info, qgroup, num_bytes, type); out: qgroup_iterator_clean(&qgroup_list); spin_unlock(&fs_info->qgroup_lock); return ret; } /* * Free @num_bytes of reserved space with @type for qgroup. (Normally level 0 * qgroup). * * Will handle all higher level qgroup too. * * NOTE: If @num_bytes is (u64)-1, this means to free all bytes of this qgroup. * This special case is only used for META_PERTRANS type. */ void btrfs_qgroup_free_refroot(struct btrfs_fs_info *fs_info, u64 ref_root, u64 num_bytes, enum btrfs_qgroup_rsv_type type) { struct btrfs_qgroup *qgroup; LIST_HEAD(qgroup_list); if (!is_fstree(ref_root)) return; if (num_bytes == 0) return; if (num_bytes == (u64)-1 && type != BTRFS_QGROUP_RSV_META_PERTRANS) { WARN(1, "%s: Invalid type to free", __func__); return; } spin_lock(&fs_info->qgroup_lock); if (!fs_info->quota_root) goto out; qgroup = find_qgroup_rb(fs_info, ref_root); if (!qgroup) goto out; if (num_bytes == (u64)-1) /* * We're freeing all pertrans rsv, get reserved value from * level 0 qgroup as real num_bytes to free. */ num_bytes = qgroup->rsv.values[type]; qgroup_iterator_add(&qgroup_list, qgroup); list_for_each_entry(qgroup, &qgroup_list, iterator) { struct btrfs_qgroup_list *glist; qgroup_rsv_release(fs_info, qgroup, num_bytes, type); list_for_each_entry(glist, &qgroup->groups, next_group) { qgroup_iterator_add(&qgroup_list, glist->group); } } out: qgroup_iterator_clean(&qgroup_list); spin_unlock(&fs_info->qgroup_lock); } /* * Check if the leaf is the last leaf. Which means all node pointers * are at their last position. */ static bool is_last_leaf(struct btrfs_path *path) { int i; for (i = 1; i < BTRFS_MAX_LEVEL && path->nodes[i]; i++) { if (path->slots[i] != btrfs_header_nritems(path->nodes[i]) - 1) return false; } return true; } /* * returns < 0 on error, 0 when more leafs are to be scanned. * returns 1 when done. */ static int qgroup_rescan_leaf(struct btrfs_trans_handle *trans, struct btrfs_path *path) { struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_root *extent_root; struct btrfs_key found; struct extent_buffer *scratch_leaf = NULL; u64 num_bytes; bool done; int slot; int ret; if (!btrfs_qgroup_full_accounting(fs_info)) return 1; mutex_lock(&fs_info->qgroup_rescan_lock); extent_root = btrfs_extent_root(fs_info, fs_info->qgroup_rescan_progress.objectid); ret = btrfs_search_slot_for_read(extent_root, &fs_info->qgroup_rescan_progress, path, 1, 0); btrfs_debug(fs_info, "current progress key (%llu %u %llu), search_slot ret %d", fs_info->qgroup_rescan_progress.objectid, fs_info->qgroup_rescan_progress.type, fs_info->qgroup_rescan_progress.offset, ret); if (ret) { /* * The rescan is about to end, we will not be scanning any * further blocks. We cannot unset the RESCAN flag here, because * we want to commit the transaction if everything went well. * To make the live accounting work in this phase, we set our * scan progress pointer such that every real extent objectid * will be smaller. */ fs_info->qgroup_rescan_progress.objectid = (u64)-1; btrfs_release_path(path); mutex_unlock(&fs_info->qgroup_rescan_lock); return ret; } done = is_last_leaf(path); btrfs_item_key_to_cpu(path->nodes[0], &found, btrfs_header_nritems(path->nodes[0]) - 1); fs_info->qgroup_rescan_progress.objectid = found.objectid + 1; scratch_leaf = btrfs_clone_extent_buffer(path->nodes[0]); if (!scratch_leaf) { ret = -ENOMEM; mutex_unlock(&fs_info->qgroup_rescan_lock); goto out; } slot = path->slots[0]; btrfs_release_path(path); mutex_unlock(&fs_info->qgroup_rescan_lock); for (; slot < btrfs_header_nritems(scratch_leaf); ++slot) { struct btrfs_backref_walk_ctx ctx = { 0 }; btrfs_item_key_to_cpu(scratch_leaf, &found, slot); if (found.type != BTRFS_EXTENT_ITEM_KEY && found.type != BTRFS_METADATA_ITEM_KEY) continue; if (found.type == BTRFS_METADATA_ITEM_KEY) num_bytes = fs_info->nodesize; else num_bytes = found.offset; ctx.bytenr = found.objectid; ctx.fs_info = fs_info; ret = btrfs_find_all_roots(&ctx, false); if (ret < 0) goto out; /* For rescan, just pass old_roots as NULL */ ret = btrfs_qgroup_account_extent(trans, found.objectid, num_bytes, NULL, ctx.roots); if (ret < 0) goto out; } out: if (scratch_leaf) free_extent_buffer(scratch_leaf); if (done && !ret) { ret = 1; fs_info->qgroup_rescan_progress.objectid = (u64)-1; } return ret; } static bool rescan_should_stop(struct btrfs_fs_info *fs_info) { if (btrfs_fs_closing(fs_info)) return true; if (test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state)) return true; if (!btrfs_qgroup_enabled(fs_info)) return true; if (fs_info->qgroup_flags & BTRFS_QGROUP_RUNTIME_FLAG_CANCEL_RESCAN) return true; return false; } static void btrfs_qgroup_rescan_worker(struct btrfs_work *work) { struct btrfs_fs_info *fs_info = container_of(work, struct btrfs_fs_info, qgroup_rescan_work); struct btrfs_path *path; struct btrfs_trans_handle *trans = NULL; int err = -ENOMEM; int ret = 0; bool stopped = false; bool did_leaf_rescans = false; if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_SIMPLE) return; path = btrfs_alloc_path(); if (!path) goto out; /* * Rescan should only search for commit root, and any later difference * should be recorded by qgroup */ path->search_commit_root = 1; path->skip_locking = 1; err = 0; while (!err && !(stopped = rescan_should_stop(fs_info))) { trans = btrfs_start_transaction(fs_info->fs_root, 0); if (IS_ERR(trans)) { err = PTR_ERR(trans); break; } err = qgroup_rescan_leaf(trans, path); did_leaf_rescans = true; if (err > 0) btrfs_commit_transaction(trans); else btrfs_end_transaction(trans); } out: btrfs_free_path(path); mutex_lock(&fs_info->qgroup_rescan_lock); if (err > 0 && fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT) { fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; } else if (err < 0 || stopped) { fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; } mutex_unlock(&fs_info->qgroup_rescan_lock); /* * Only update status, since the previous part has already updated the * qgroup info, and only if we did any actual work. This also prevents * race with a concurrent quota disable, which has already set * fs_info->quota_root to NULL and cleared BTRFS_FS_QUOTA_ENABLED at * btrfs_quota_disable(). */ if (did_leaf_rescans) { trans = btrfs_start_transaction(fs_info->quota_root, 1); if (IS_ERR(trans)) { err = PTR_ERR(trans); trans = NULL; btrfs_err(fs_info, "fail to start transaction for status update: %d", err); } } else { trans = NULL; } mutex_lock(&fs_info->qgroup_rescan_lock); if (!stopped || fs_info->qgroup_flags & BTRFS_QGROUP_RUNTIME_FLAG_CANCEL_RESCAN) fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN; if (trans) { ret = update_qgroup_status_item(trans); if (ret < 0) { err = ret; btrfs_err(fs_info, "fail to update qgroup status: %d", err); } } fs_info->qgroup_rescan_running = false; fs_info->qgroup_flags &= ~BTRFS_QGROUP_RUNTIME_FLAG_CANCEL_RESCAN; complete_all(&fs_info->qgroup_rescan_completion); mutex_unlock(&fs_info->qgroup_rescan_lock); if (!trans) return; btrfs_end_transaction(trans); if (stopped) { btrfs_info(fs_info, "qgroup scan paused"); } else if (fs_info->qgroup_flags & BTRFS_QGROUP_RUNTIME_FLAG_CANCEL_RESCAN) { btrfs_info(fs_info, "qgroup scan cancelled"); } else if (err >= 0) { btrfs_info(fs_info, "qgroup scan completed%s", err > 0 ? " (inconsistency flag cleared)" : ""); } else { btrfs_err(fs_info, "qgroup scan failed with %d", err); } } /* * Checks that (a) no rescan is running and (b) quota is enabled. Allocates all * memory required for the rescan context. */ static int qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid, int init_flags) { int ret = 0; if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_SIMPLE) { btrfs_warn(fs_info, "qgroup rescan init failed, running in simple mode"); return -EINVAL; } if (!init_flags) { /* we're resuming qgroup rescan at mount time */ if (!(fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN)) { btrfs_warn(fs_info, "qgroup rescan init failed, qgroup rescan is not queued"); ret = -EINVAL; } else if (!(fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_ON)) { btrfs_warn(fs_info, "qgroup rescan init failed, qgroup is not enabled"); ret = -EINVAL; } if (ret) return ret; } mutex_lock(&fs_info->qgroup_rescan_lock); if (init_flags) { if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) { btrfs_warn(fs_info, "qgroup rescan is already in progress"); ret = -EINPROGRESS; } else if (!(fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_ON)) { btrfs_warn(fs_info, "qgroup rescan init failed, qgroup is not enabled"); ret = -EINVAL; } else if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_DISABLED) { /* Quota disable is in progress */ ret = -EBUSY; } if (ret) { mutex_unlock(&fs_info->qgroup_rescan_lock); return ret; } fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_RESCAN; } memset(&fs_info->qgroup_rescan_progress, 0, sizeof(fs_info->qgroup_rescan_progress)); fs_info->qgroup_flags &= ~(BTRFS_QGROUP_RUNTIME_FLAG_CANCEL_RESCAN | BTRFS_QGROUP_RUNTIME_FLAG_NO_ACCOUNTING); fs_info->qgroup_rescan_progress.objectid = progress_objectid; init_completion(&fs_info->qgroup_rescan_completion); mutex_unlock(&fs_info->qgroup_rescan_lock); btrfs_init_work(&fs_info->qgroup_rescan_work, btrfs_qgroup_rescan_worker, NULL); return 0; } static void qgroup_rescan_zero_tracking(struct btrfs_fs_info *fs_info) { struct rb_node *n; struct btrfs_qgroup *qgroup; spin_lock(&fs_info->qgroup_lock); /* clear all current qgroup tracking information */ for (n = rb_first(&fs_info->qgroup_tree); n; n = rb_next(n)) { qgroup = rb_entry(n, struct btrfs_qgroup, node); qgroup->rfer = 0; qgroup->rfer_cmpr = 0; qgroup->excl = 0; qgroup->excl_cmpr = 0; qgroup_dirty(fs_info, qgroup); } spin_unlock(&fs_info->qgroup_lock); } int btrfs_qgroup_rescan(struct btrfs_fs_info *fs_info) { int ret = 0; struct btrfs_trans_handle *trans; ret = qgroup_rescan_init(fs_info, 0, 1); if (ret) return ret; /* * We have set the rescan_progress to 0, which means no more * delayed refs will be accounted by btrfs_qgroup_account_ref. * However, btrfs_qgroup_account_ref may be right after its call * to btrfs_find_all_roots, in which case it would still do the * accounting. * To solve this, we're committing the transaction, which will * ensure we run all delayed refs and only after that, we are * going to clear all tracking information for a clean start. */ trans = btrfs_attach_transaction_barrier(fs_info->fs_root); if (IS_ERR(trans) && trans != ERR_PTR(-ENOENT)) { fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN; return PTR_ERR(trans); } else if (trans != ERR_PTR(-ENOENT)) { ret = btrfs_commit_transaction(trans); if (ret) { fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN; return ret; } } qgroup_rescan_zero_tracking(fs_info); mutex_lock(&fs_info->qgroup_rescan_lock); fs_info->qgroup_rescan_running = true; btrfs_queue_work(fs_info->qgroup_rescan_workers, &fs_info->qgroup_rescan_work); mutex_unlock(&fs_info->qgroup_rescan_lock); return 0; } int btrfs_qgroup_wait_for_completion(struct btrfs_fs_info *fs_info, bool interruptible) { int running; int ret = 0; mutex_lock(&fs_info->qgroup_rescan_lock); running = fs_info->qgroup_rescan_running; mutex_unlock(&fs_info->qgroup_rescan_lock); if (!running) return 0; if (interruptible) ret = wait_for_completion_interruptible( &fs_info->qgroup_rescan_completion); else wait_for_completion(&fs_info->qgroup_rescan_completion); return ret; } /* * this is only called from open_ctree where we're still single threaded, thus * locking is omitted here. */ void btrfs_qgroup_rescan_resume(struct btrfs_fs_info *fs_info) { if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) { mutex_lock(&fs_info->qgroup_rescan_lock); fs_info->qgroup_rescan_running = true; btrfs_queue_work(fs_info->qgroup_rescan_workers, &fs_info->qgroup_rescan_work); mutex_unlock(&fs_info->qgroup_rescan_lock); } } #define rbtree_iterate_from_safe(node, next, start) \ for (node = start; node && ({ next = rb_next(node); 1;}); node = next) static int qgroup_unreserve_range(struct btrfs_inode *inode, struct extent_changeset *reserved, u64 start, u64 len) { struct rb_node *node; struct rb_node *next; struct ulist_node *entry; int ret = 0; node = reserved->range_changed.root.rb_node; if (!node) return 0; while (node) { entry = rb_entry(node, struct ulist_node, rb_node); if (entry->val < start) node = node->rb_right; else node = node->rb_left; } if (entry->val > start && rb_prev(&entry->rb_node)) entry = rb_entry(rb_prev(&entry->rb_node), struct ulist_node, rb_node); rbtree_iterate_from_safe(node, next, &entry->rb_node) { u64 entry_start; u64 entry_end; u64 entry_len; int clear_ret; entry = rb_entry(node, struct ulist_node, rb_node); entry_start = entry->val; entry_end = entry->aux; entry_len = entry_end - entry_start + 1; if (entry_start >= start + len) break; if (entry_start + entry_len <= start) continue; /* * Now the entry is in [start, start + len), revert the * EXTENT_QGROUP_RESERVED bit. */ clear_ret = clear_extent_bits(&inode->io_tree, entry_start, entry_end, EXTENT_QGROUP_RESERVED); if (!ret && clear_ret < 0) ret = clear_ret; ulist_del(&reserved->range_changed, entry->val, entry->aux); if (likely(reserved->bytes_changed >= entry_len)) { reserved->bytes_changed -= entry_len; } else { WARN_ON(1); reserved->bytes_changed = 0; } } return ret; } /* * Try to free some space for qgroup. * * For qgroup, there are only 3 ways to free qgroup space: * - Flush nodatacow write * Any nodatacow write will free its reserved data space at run_delalloc_range(). * In theory, we should only flush nodatacow inodes, but it's not yet * possible, so we need to flush the whole root. * * - Wait for ordered extents * When ordered extents are finished, their reserved metadata is finally * converted to per_trans status, which can be freed by later commit * transaction. * * - Commit transaction * This would free the meta_per_trans space. * In theory this shouldn't provide much space, but any more qgroup space * is needed. */ static int try_flush_qgroup(struct btrfs_root *root) { struct btrfs_trans_handle *trans; int ret; /* Can't hold an open transaction or we run the risk of deadlocking. */ ASSERT(current->journal_info == NULL); if (WARN_ON(current->journal_info)) return 0; /* * We don't want to run flush again and again, so if there is a running * one, we won't try to start a new flush, but exit directly. */ if (test_and_set_bit(BTRFS_ROOT_QGROUP_FLUSHING, &root->state)) { wait_event(root->qgroup_flush_wait, !test_bit(BTRFS_ROOT_QGROUP_FLUSHING, &root->state)); return 0; } ret = btrfs_start_delalloc_snapshot(root, true); if (ret < 0) goto out; btrfs_wait_ordered_extents(root, U64_MAX, 0, (u64)-1); trans = btrfs_attach_transaction_barrier(root); if (IS_ERR(trans)) { ret = PTR_ERR(trans); if (ret == -ENOENT) ret = 0; goto out; } ret = btrfs_commit_transaction(trans); out: clear_bit(BTRFS_ROOT_QGROUP_FLUSHING, &root->state); wake_up(&root->qgroup_flush_wait); return ret; } static int qgroup_reserve_data(struct btrfs_inode *inode, struct extent_changeset **reserved_ret, u64 start, u64 len) { struct btrfs_root *root = inode->root; struct extent_changeset *reserved; bool new_reserved = false; u64 orig_reserved; u64 to_reserve; int ret; if (btrfs_qgroup_mode(root->fs_info) == BTRFS_QGROUP_MODE_DISABLED || !is_fstree(root->root_key.objectid) || len == 0) return 0; /* @reserved parameter is mandatory for qgroup */ if (WARN_ON(!reserved_ret)) return -EINVAL; if (!*reserved_ret) { new_reserved = true; *reserved_ret = extent_changeset_alloc(); if (!*reserved_ret) return -ENOMEM; } reserved = *reserved_ret; /* Record already reserved space */ orig_reserved = reserved->bytes_changed; ret = set_record_extent_bits(&inode->io_tree, start, start + len -1, EXTENT_QGROUP_RESERVED, reserved); /* Newly reserved space */ to_reserve = reserved->bytes_changed - orig_reserved; trace_btrfs_qgroup_reserve_data(&inode->vfs_inode, start, len, to_reserve, QGROUP_RESERVE); if (ret < 0) goto out; ret = qgroup_reserve(root, to_reserve, true, BTRFS_QGROUP_RSV_DATA); if (ret < 0) goto cleanup; return ret; cleanup: qgroup_unreserve_range(inode, reserved, start, len); out: if (new_reserved) { extent_changeset_free(reserved); *reserved_ret = NULL; } return ret; } /* * Reserve qgroup space for range [start, start + len). * * This function will either reserve space from related qgroups or do nothing * if the range is already reserved. * * Return 0 for successful reservation * Return <0 for error (including -EQUOT) * * NOTE: This function may sleep for memory allocation, dirty page flushing and * commit transaction. So caller should not hold any dirty page locked. */ int btrfs_qgroup_reserve_data(struct btrfs_inode *inode, struct extent_changeset **reserved_ret, u64 start, u64 len) { int ret; ret = qgroup_reserve_data(inode, reserved_ret, start, len); if (ret <= 0 && ret != -EDQUOT) return ret; ret = try_flush_qgroup(inode->root); if (ret < 0) return ret; return qgroup_reserve_data(inode, reserved_ret, start, len); } /* Free ranges specified by @reserved, normally in error path */ static int qgroup_free_reserved_data(struct btrfs_inode *inode, struct extent_changeset *reserved, u64 start, u64 len, u64 *freed_ret) { struct btrfs_root *root = inode->root; struct ulist_node *unode; struct ulist_iterator uiter; struct extent_changeset changeset; u64 freed = 0; int ret; extent_changeset_init(&changeset); len = round_up(start + len, root->fs_info->sectorsize); start = round_down(start, root->fs_info->sectorsize); ULIST_ITER_INIT(&uiter); while ((unode = ulist_next(&reserved->range_changed, &uiter))) { u64 range_start = unode->val; /* unode->aux is the inclusive end */ u64 range_len = unode->aux - range_start + 1; u64 free_start; u64 free_len; extent_changeset_release(&changeset); /* Only free range in range [start, start + len) */ if (range_start >= start + len || range_start + range_len <= start) continue; free_start = max(range_start, start); free_len = min(start + len, range_start + range_len) - free_start; /* * TODO: To also modify reserved->ranges_reserved to reflect * the modification. * * However as long as we free qgroup reserved according to * EXTENT_QGROUP_RESERVED, we won't double free. * So not need to rush. */ ret = clear_record_extent_bits(&inode->io_tree, free_start, free_start + free_len - 1, EXTENT_QGROUP_RESERVED, &changeset); if (ret < 0) goto out; freed += changeset.bytes_changed; } btrfs_qgroup_free_refroot(root->fs_info, root->root_key.objectid, freed, BTRFS_QGROUP_RSV_DATA); if (freed_ret) *freed_ret = freed; ret = 0; out: extent_changeset_release(&changeset); return ret; } static int __btrfs_qgroup_release_data(struct btrfs_inode *inode, struct extent_changeset *reserved, u64 start, u64 len, u64 *released, int free) { struct extent_changeset changeset; int trace_op = QGROUP_RELEASE; int ret; if (btrfs_qgroup_mode(inode->root->fs_info) == BTRFS_QGROUP_MODE_DISABLED) { extent_changeset_init(&changeset); return clear_record_extent_bits(&inode->io_tree, start, start + len - 1, EXTENT_QGROUP_RESERVED, &changeset); } /* In release case, we shouldn't have @reserved */ WARN_ON(!free && reserved); if (free && reserved) return qgroup_free_reserved_data(inode, reserved, start, len, released); extent_changeset_init(&changeset); ret = clear_record_extent_bits(&inode->io_tree, start, start + len -1, EXTENT_QGROUP_RESERVED, &changeset); if (ret < 0) goto out; if (free) trace_op = QGROUP_FREE; trace_btrfs_qgroup_release_data(&inode->vfs_inode, start, len, changeset.bytes_changed, trace_op); if (free) btrfs_qgroup_free_refroot(inode->root->fs_info, inode->root->root_key.objectid, changeset.bytes_changed, BTRFS_QGROUP_RSV_DATA); if (released) *released = changeset.bytes_changed; out: extent_changeset_release(&changeset); return ret; } /* * Free a reserved space range from io_tree and related qgroups * * Should be called when a range of pages get invalidated before reaching disk. * Or for error cleanup case. * if @reserved is given, only reserved range in [@start, @start + @len) will * be freed. * * For data written to disk, use btrfs_qgroup_release_data(). * * NOTE: This function may sleep for memory allocation. */ int btrfs_qgroup_free_data(struct btrfs_inode *inode, struct extent_changeset *reserved, u64 start, u64 len, u64 *freed) { return __btrfs_qgroup_release_data(inode, reserved, start, len, freed, 1); } /* * Release a reserved space range from io_tree only. * * Should be called when a range of pages get written to disk and corresponding * FILE_EXTENT is inserted into corresponding root. * * Since new qgroup accounting framework will only update qgroup numbers at * commit_transaction() time, its reserved space shouldn't be freed from * related qgroups. * * But we should release the range from io_tree, to allow further write to be * COWed. * * NOTE: This function may sleep for memory allocation. */ int btrfs_qgroup_release_data(struct btrfs_inode *inode, u64 start, u64 len, u64 *released) { return __btrfs_qgroup_release_data(inode, NULL, start, len, released, 0); } static void add_root_meta_rsv(struct btrfs_root *root, int num_bytes, enum btrfs_qgroup_rsv_type type) { if (type != BTRFS_QGROUP_RSV_META_PREALLOC && type != BTRFS_QGROUP_RSV_META_PERTRANS) return; if (num_bytes == 0) return; spin_lock(&root->qgroup_meta_rsv_lock); if (type == BTRFS_QGROUP_RSV_META_PREALLOC) root->qgroup_meta_rsv_prealloc += num_bytes; else root->qgroup_meta_rsv_pertrans += num_bytes; spin_unlock(&root->qgroup_meta_rsv_lock); } static int sub_root_meta_rsv(struct btrfs_root *root, int num_bytes, enum btrfs_qgroup_rsv_type type) { if (type != BTRFS_QGROUP_RSV_META_PREALLOC && type != BTRFS_QGROUP_RSV_META_PERTRANS) return 0; if (num_bytes == 0) return 0; spin_lock(&root->qgroup_meta_rsv_lock); if (type == BTRFS_QGROUP_RSV_META_PREALLOC) { num_bytes = min_t(u64, root->qgroup_meta_rsv_prealloc, num_bytes); root->qgroup_meta_rsv_prealloc -= num_bytes; } else { num_bytes = min_t(u64, root->qgroup_meta_rsv_pertrans, num_bytes); root->qgroup_meta_rsv_pertrans -= num_bytes; } spin_unlock(&root->qgroup_meta_rsv_lock); return num_bytes; } int btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes, enum btrfs_qgroup_rsv_type type, bool enforce) { struct btrfs_fs_info *fs_info = root->fs_info; int ret; if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_DISABLED || !is_fstree(root->root_key.objectid) || num_bytes == 0) return 0; BUG_ON(num_bytes != round_down(num_bytes, fs_info->nodesize)); trace_qgroup_meta_reserve(root, (s64)num_bytes, type); ret = qgroup_reserve(root, num_bytes, enforce, type); if (ret < 0) return ret; /* * Record what we have reserved into root. * * To avoid quota disabled->enabled underflow. * In that case, we may try to free space we haven't reserved * (since quota was disabled), so record what we reserved into root. * And ensure later release won't underflow this number. */ add_root_meta_rsv(root, num_bytes, type); return ret; } int __btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes, enum btrfs_qgroup_rsv_type type, bool enforce, bool noflush) { int ret; ret = btrfs_qgroup_reserve_meta(root, num_bytes, type, enforce); if ((ret <= 0 && ret != -EDQUOT) || noflush) return ret; ret = try_flush_qgroup(root); if (ret < 0) return ret; return btrfs_qgroup_reserve_meta(root, num_bytes, type, enforce); } /* * Per-transaction meta reservation should be all freed at transaction commit * time */ void btrfs_qgroup_free_meta_all_pertrans(struct btrfs_root *root) { struct btrfs_fs_info *fs_info = root->fs_info; if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_DISABLED || !is_fstree(root->root_key.objectid)) return; /* TODO: Update trace point to handle such free */ trace_qgroup_meta_free_all_pertrans(root); /* Special value -1 means to free all reserved space */ btrfs_qgroup_free_refroot(fs_info, root->root_key.objectid, (u64)-1, BTRFS_QGROUP_RSV_META_PERTRANS); } void __btrfs_qgroup_free_meta(struct btrfs_root *root, int num_bytes, enum btrfs_qgroup_rsv_type type) { struct btrfs_fs_info *fs_info = root->fs_info; if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_DISABLED || !is_fstree(root->root_key.objectid)) return; /* * reservation for META_PREALLOC can happen before quota is enabled, * which can lead to underflow. * Here ensure we will only free what we really have reserved. */ num_bytes = sub_root_meta_rsv(root, num_bytes, type); BUG_ON(num_bytes != round_down(num_bytes, fs_info->nodesize)); trace_qgroup_meta_reserve(root, -(s64)num_bytes, type); btrfs_qgroup_free_refroot(fs_info, root->root_key.objectid, num_bytes, type); } static void qgroup_convert_meta(struct btrfs_fs_info *fs_info, u64 ref_root, int num_bytes) { struct btrfs_qgroup *qgroup; LIST_HEAD(qgroup_list); if (num_bytes == 0) return; if (!fs_info->quota_root) return; spin_lock(&fs_info->qgroup_lock); qgroup = find_qgroup_rb(fs_info, ref_root); if (!qgroup) goto out; qgroup_iterator_add(&qgroup_list, qgroup); list_for_each_entry(qgroup, &qgroup_list, iterator) { struct btrfs_qgroup_list *glist; qgroup_rsv_release(fs_info, qgroup, num_bytes, BTRFS_QGROUP_RSV_META_PREALLOC); if (!sb_rdonly(fs_info->sb)) qgroup_rsv_add(fs_info, qgroup, num_bytes, BTRFS_QGROUP_RSV_META_PERTRANS); list_for_each_entry(glist, &qgroup->groups, next_group) qgroup_iterator_add(&qgroup_list, glist->group); } out: qgroup_iterator_clean(&qgroup_list); spin_unlock(&fs_info->qgroup_lock); } /* * Convert @num_bytes of META_PREALLOCATED reservation to META_PERTRANS. * * This is called when preallocated meta reservation needs to be used. * Normally after btrfs_join_transaction() call. */ void btrfs_qgroup_convert_reserved_meta(struct btrfs_root *root, int num_bytes) { struct btrfs_fs_info *fs_info = root->fs_info; if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_DISABLED || !is_fstree(root->root_key.objectid)) return; /* Same as btrfs_qgroup_free_meta_prealloc() */ num_bytes = sub_root_meta_rsv(root, num_bytes, BTRFS_QGROUP_RSV_META_PREALLOC); trace_qgroup_meta_convert(root, num_bytes); qgroup_convert_meta(fs_info, root->root_key.objectid, num_bytes); } /* * Check qgroup reserved space leaking, normally at destroy inode * time */ void btrfs_qgroup_check_reserved_leak(struct btrfs_inode *inode) { struct extent_changeset changeset; struct ulist_node *unode; struct ulist_iterator iter; int ret; extent_changeset_init(&changeset); ret = clear_record_extent_bits(&inode->io_tree, 0, (u64)-1, EXTENT_QGROUP_RESERVED, &changeset); WARN_ON(ret < 0); if (WARN_ON(changeset.bytes_changed)) { ULIST_ITER_INIT(&iter); while ((unode = ulist_next(&changeset.range_changed, &iter))) { btrfs_warn(inode->root->fs_info, "leaking qgroup reserved space, ino: %llu, start: %llu, end: %llu", btrfs_ino(inode), unode->val, unode->aux); } btrfs_qgroup_free_refroot(inode->root->fs_info, inode->root->root_key.objectid, changeset.bytes_changed, BTRFS_QGROUP_RSV_DATA); } extent_changeset_release(&changeset); } void btrfs_qgroup_init_swapped_blocks( struct btrfs_qgroup_swapped_blocks *swapped_blocks) { int i; spin_lock_init(&swapped_blocks->lock); for (i = 0; i < BTRFS_MAX_LEVEL; i++) swapped_blocks->blocks[i] = RB_ROOT; swapped_blocks->swapped = false; } /* * Delete all swapped blocks record of @root. * Every record here means we skipped a full subtree scan for qgroup. * * Gets called when committing one transaction. */ void btrfs_qgroup_clean_swapped_blocks(struct btrfs_root *root) { struct btrfs_qgroup_swapped_blocks *swapped_blocks; int i; swapped_blocks = &root->swapped_blocks; spin_lock(&swapped_blocks->lock); if (!swapped_blocks->swapped) goto out; for (i = 0; i < BTRFS_MAX_LEVEL; i++) { struct rb_root *cur_root = &swapped_blocks->blocks[i]; struct btrfs_qgroup_swapped_block *entry; struct btrfs_qgroup_swapped_block *next; rbtree_postorder_for_each_entry_safe(entry, next, cur_root, node) kfree(entry); swapped_blocks->blocks[i] = RB_ROOT; } swapped_blocks->swapped = false; out: spin_unlock(&swapped_blocks->lock); } /* * Add subtree roots record into @subvol_root. * * @subvol_root: tree root of the subvolume tree get swapped * @bg: block group under balance * @subvol_parent/slot: pointer to the subtree root in subvolume tree * @reloc_parent/slot: pointer to the subtree root in reloc tree * BOTH POINTERS ARE BEFORE TREE SWAP * @last_snapshot: last snapshot generation of the subvolume tree */ int btrfs_qgroup_add_swapped_blocks(struct btrfs_trans_handle *trans, struct btrfs_root *subvol_root, struct btrfs_block_group *bg, struct extent_buffer *subvol_parent, int subvol_slot, struct extent_buffer *reloc_parent, int reloc_slot, u64 last_snapshot) { struct btrfs_fs_info *fs_info = subvol_root->fs_info; struct btrfs_qgroup_swapped_blocks *blocks = &subvol_root->swapped_blocks; struct btrfs_qgroup_swapped_block *block; struct rb_node **cur; struct rb_node *parent = NULL; int level = btrfs_header_level(subvol_parent) - 1; int ret = 0; if (!btrfs_qgroup_full_accounting(fs_info)) return 0; if (btrfs_node_ptr_generation(subvol_parent, subvol_slot) > btrfs_node_ptr_generation(reloc_parent, reloc_slot)) { btrfs_err_rl(fs_info, "%s: bad parameter order, subvol_gen=%llu reloc_gen=%llu", __func__, btrfs_node_ptr_generation(subvol_parent, subvol_slot), btrfs_node_ptr_generation(reloc_parent, reloc_slot)); return -EUCLEAN; } block = kmalloc(sizeof(*block), GFP_NOFS); if (!block) { ret = -ENOMEM; goto out; } /* * @reloc_parent/slot is still before swap, while @block is going to * record the bytenr after swap, so we do the swap here. */ block->subvol_bytenr = btrfs_node_blockptr(reloc_parent, reloc_slot); block->subvol_generation = btrfs_node_ptr_generation(reloc_parent, reloc_slot); block->reloc_bytenr = btrfs_node_blockptr(subvol_parent, subvol_slot); block->reloc_generation = btrfs_node_ptr_generation(subvol_parent, subvol_slot); block->last_snapshot = last_snapshot; block->level = level; /* * If we have bg == NULL, we're called from btrfs_recover_relocation(), * no one else can modify tree blocks thus we qgroup will not change * no matter the value of trace_leaf. */ if (bg && bg->flags & BTRFS_BLOCK_GROUP_DATA) block->trace_leaf = true; else block->trace_leaf = false; btrfs_node_key_to_cpu(reloc_parent, &block->first_key, reloc_slot); /* Insert @block into @blocks */ spin_lock(&blocks->lock); cur = &blocks->blocks[level].rb_node; while (*cur) { struct btrfs_qgroup_swapped_block *entry; parent = *cur; entry = rb_entry(parent, struct btrfs_qgroup_swapped_block, node); if (entry->subvol_bytenr < block->subvol_bytenr) { cur = &(*cur)->rb_left; } else if (entry->subvol_bytenr > block->subvol_bytenr) { cur = &(*cur)->rb_right; } else { if (entry->subvol_generation != block->subvol_generation || entry->reloc_bytenr != block->reloc_bytenr || entry->reloc_generation != block->reloc_generation) { /* * Duplicated but mismatch entry found. * Shouldn't happen. * * Marking qgroup inconsistent should be enough * for end users. */ WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG)); ret = -EEXIST; } kfree(block); goto out_unlock; } } rb_link_node(&block->node, parent, cur); rb_insert_color(&block->node, &blocks->blocks[level]); blocks->swapped = true; out_unlock: spin_unlock(&blocks->lock); out: if (ret < 0) qgroup_mark_inconsistent(fs_info); return ret; } /* * Check if the tree block is a subtree root, and if so do the needed * delayed subtree trace for qgroup. * * This is called during btrfs_cow_block(). */ int btrfs_qgroup_trace_subtree_after_cow(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *subvol_eb) { struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_tree_parent_check check = { 0 }; struct btrfs_qgroup_swapped_blocks *blocks = &root->swapped_blocks; struct btrfs_qgroup_swapped_block *block; struct extent_buffer *reloc_eb = NULL; struct rb_node *node; bool found = false; bool swapped = false; int level = btrfs_header_level(subvol_eb); int ret = 0; int i; if (!btrfs_qgroup_full_accounting(fs_info)) return 0; if (!is_fstree(root->root_key.objectid) || !root->reloc_root) return 0; spin_lock(&blocks->lock); if (!blocks->swapped) { spin_unlock(&blocks->lock); return 0; } node = blocks->blocks[level].rb_node; while (node) { block = rb_entry(node, struct btrfs_qgroup_swapped_block, node); if (block->subvol_bytenr < subvol_eb->start) { node = node->rb_left; } else if (block->subvol_bytenr > subvol_eb->start) { node = node->rb_right; } else { found = true; break; } } if (!found) { spin_unlock(&blocks->lock); goto out; } /* Found one, remove it from @blocks first and update blocks->swapped */ rb_erase(&block->node, &blocks->blocks[level]); for (i = 0; i < BTRFS_MAX_LEVEL; i++) { if (RB_EMPTY_ROOT(&blocks->blocks[i])) { swapped = true; break; } } blocks->swapped = swapped; spin_unlock(&blocks->lock); check.level = block->level; check.transid = block->reloc_generation; check.has_first_key = true; memcpy(&check.first_key, &block->first_key, sizeof(check.first_key)); /* Read out reloc subtree root */ reloc_eb = read_tree_block(fs_info, block->reloc_bytenr, &check); if (IS_ERR(reloc_eb)) { ret = PTR_ERR(reloc_eb); reloc_eb = NULL; goto free_out; } if (!extent_buffer_uptodate(reloc_eb)) { ret = -EIO; goto free_out; } ret = qgroup_trace_subtree_swap(trans, reloc_eb, subvol_eb, block->last_snapshot, block->trace_leaf); free_out: kfree(block); free_extent_buffer(reloc_eb); out: if (ret < 0) { btrfs_err_rl(fs_info, "failed to account subtree at bytenr %llu: %d", subvol_eb->start, ret); qgroup_mark_inconsistent(fs_info); } return ret; } void btrfs_qgroup_destroy_extent_records(struct btrfs_transaction *trans) { struct btrfs_qgroup_extent_record *entry; struct btrfs_qgroup_extent_record *next; struct rb_root *root; root = &trans->delayed_refs.dirty_extent_root; rbtree_postorder_for_each_entry_safe(entry, next, root, node) { ulist_free(entry->old_roots); kfree(entry); } *root = RB_ROOT; } void btrfs_free_squota_rsv(struct btrfs_fs_info *fs_info, u64 root, u64 rsv_bytes) { if (btrfs_qgroup_mode(fs_info) != BTRFS_QGROUP_MODE_SIMPLE) return; if (!is_fstree(root)) return; btrfs_qgroup_free_refroot(fs_info, root, rsv_bytes, BTRFS_QGROUP_RSV_DATA); } int btrfs_record_squota_delta(struct btrfs_fs_info *fs_info, struct btrfs_squota_delta *delta) { int ret; struct btrfs_qgroup *qgroup; struct btrfs_qgroup *qg; LIST_HEAD(qgroup_list); u64 root = delta->root; u64 num_bytes = delta->num_bytes; const int sign = (delta->is_inc ? 1 : -1); if (btrfs_qgroup_mode(fs_info) != BTRFS_QGROUP_MODE_SIMPLE) return 0; if (!is_fstree(root)) return 0; /* If the extent predates enabling quotas, don't count it. */ if (delta->generation < fs_info->qgroup_enable_gen) return 0; spin_lock(&fs_info->qgroup_lock); qgroup = find_qgroup_rb(fs_info, root); if (!qgroup) { ret = -ENOENT; goto out; } ret = 0; qgroup_iterator_add(&qgroup_list, qgroup); list_for_each_entry(qg, &qgroup_list, iterator) { struct btrfs_qgroup_list *glist; qg->excl += num_bytes * sign; qg->rfer += num_bytes * sign; qgroup_dirty(fs_info, qg); list_for_each_entry(glist, &qg->groups, next_group) qgroup_iterator_add(&qgroup_list, glist->group); } qgroup_iterator_clean(&qgroup_list); out: spin_unlock(&fs_info->qgroup_lock); return ret; } |
8 7 8 8 7 8 13 19 20 12 4 24 24 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 | /* * Linear conversion Plug-In * Copyright (c) 1999 by Jaroslav Kysela <perex@perex.cz>, * Abramo Bagnara <abramo@alsa-project.org> * * * This library is free software; you can redistribute it and/or modify * it under the terms of the GNU Library General Public License as * published by the Free Software Foundation; either version 2 of * the License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Library General Public License for more details. * * You should have received a copy of the GNU Library General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA * */ #include <linux/time.h> #include <sound/core.h> #include <sound/pcm.h> #include "pcm_plugin.h" /* * Basic linear conversion plugin */ struct linear_priv { int cvt_endian; /* need endian conversion? */ unsigned int src_ofs; /* byte offset in source format */ unsigned int dst_ofs; /* byte soffset in destination format */ unsigned int copy_ofs; /* byte offset in temporary u32 data */ unsigned int dst_bytes; /* byte size of destination format */ unsigned int copy_bytes; /* bytes to copy per conversion */ unsigned int flip; /* MSB flip for signeness, done after endian conv */ }; static inline void do_convert(struct linear_priv *data, unsigned char *dst, unsigned char *src) { unsigned int tmp = 0; unsigned char *p = (unsigned char *)&tmp; memcpy(p + data->copy_ofs, src + data->src_ofs, data->copy_bytes); if (data->cvt_endian) tmp = swab32(tmp); tmp ^= data->flip; memcpy(dst, p + data->dst_ofs, data->dst_bytes); } static void convert(struct snd_pcm_plugin *plugin, const struct snd_pcm_plugin_channel *src_channels, struct snd_pcm_plugin_channel *dst_channels, snd_pcm_uframes_t frames) { struct linear_priv *data = (struct linear_priv *)plugin->extra_data; int channel; int nchannels = plugin->src_format.channels; for (channel = 0; channel < nchannels; ++channel) { char *src; char *dst; int src_step, dst_step; snd_pcm_uframes_t frames1; if (!src_channels[channel].enabled) { if (dst_channels[channel].wanted) snd_pcm_area_silence(&dst_channels[channel].area, 0, frames, plugin->dst_format.format); dst_channels[channel].enabled = 0; continue; } dst_channels[channel].enabled = 1; src = src_channels[channel].area.addr + src_channels[channel].area.first / 8; dst = dst_channels[channel].area.addr + dst_channels[channel].area.first / 8; src_step = src_channels[channel].area.step / 8; dst_step = dst_channels[channel].area.step / 8; frames1 = frames; while (frames1-- > 0) { do_convert(data, dst, src); src += src_step; dst += dst_step; } } } static snd_pcm_sframes_t linear_transfer(struct snd_pcm_plugin *plugin, const struct snd_pcm_plugin_channel *src_channels, struct snd_pcm_plugin_channel *dst_channels, snd_pcm_uframes_t frames) { if (snd_BUG_ON(!plugin || !src_channels || !dst_channels)) return -ENXIO; if (frames == 0) return 0; #ifdef CONFIG_SND_DEBUG { unsigned int channel; for (channel = 0; channel < plugin->src_format.channels; channel++) { if (snd_BUG_ON(src_channels[channel].area.first % 8 || src_channels[channel].area.step % 8)) return -ENXIO; if (snd_BUG_ON(dst_channels[channel].area.first % 8 || dst_channels[channel].area.step % 8)) return -ENXIO; } } #endif if (frames > dst_channels[0].frames) frames = dst_channels[0].frames; convert(plugin, src_channels, dst_channels, frames); return frames; } static void init_data(struct linear_priv *data, snd_pcm_format_t src_format, snd_pcm_format_t dst_format) { int src_le, dst_le, src_bytes, dst_bytes; src_bytes = snd_pcm_format_width(src_format) / 8; dst_bytes = snd_pcm_format_width(dst_format) / 8; src_le = snd_pcm_format_little_endian(src_format) > 0; dst_le = snd_pcm_format_little_endian(dst_format) > 0; data->dst_bytes = dst_bytes; data->cvt_endian = src_le != dst_le; data->copy_bytes = src_bytes < dst_bytes ? src_bytes : dst_bytes; if (src_le) { data->copy_ofs = 4 - data->copy_bytes; data->src_ofs = src_bytes - data->copy_bytes; } else data->src_ofs = snd_pcm_format_physical_width(src_format) / 8 - src_bytes; if (dst_le) data->dst_ofs = 4 - data->dst_bytes; else data->dst_ofs = snd_pcm_format_physical_width(dst_format) / 8 - dst_bytes; if (snd_pcm_format_signed(src_format) != snd_pcm_format_signed(dst_format)) { if (dst_le) data->flip = (__force u32)cpu_to_le32(0x80000000); else data->flip = (__force u32)cpu_to_be32(0x80000000); } } int snd_pcm_plugin_build_linear(struct snd_pcm_substream *plug, struct snd_pcm_plugin_format *src_format, struct snd_pcm_plugin_format *dst_format, struct snd_pcm_plugin **r_plugin) { int err; struct linear_priv *data; struct snd_pcm_plugin *plugin; if (snd_BUG_ON(!r_plugin)) return -ENXIO; *r_plugin = NULL; if (snd_BUG_ON(src_format->rate != dst_format->rate)) return -ENXIO; if (snd_BUG_ON(src_format->channels != dst_format->channels)) return -ENXIO; if (snd_BUG_ON(!snd_pcm_format_linear(src_format->format) || !snd_pcm_format_linear(dst_format->format))) return -ENXIO; err = snd_pcm_plugin_build(plug, "linear format conversion", src_format, dst_format, sizeof(struct linear_priv), &plugin); if (err < 0) return err; data = (struct linear_priv *)plugin->extra_data; init_data(data, src_format->format, dst_format->format); plugin->transfer = linear_transfer; *r_plugin = plugin; return 0; } |
96 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 | /* SPDX-License-Identifier: GPL-2.0-or-later */ /* Count leading and trailing zeros functions * * Copyright (C) 2012 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) */ #ifndef _LINUX_BITOPS_COUNT_ZEROS_H_ #define _LINUX_BITOPS_COUNT_ZEROS_H_ #include <asm/bitops.h> /** * count_leading_zeros - Count the number of zeros from the MSB back * @x: The value * * Count the number of leading zeros from the MSB going towards the LSB in @x. * * If the MSB of @x is set, the result is 0. * If only the LSB of @x is set, then the result is BITS_PER_LONG-1. * If @x is 0 then the result is COUNT_LEADING_ZEROS_0. */ static inline int count_leading_zeros(unsigned long x) { if (sizeof(x) == 4) return BITS_PER_LONG - fls(x); else return BITS_PER_LONG - fls64(x); } #define COUNT_LEADING_ZEROS_0 BITS_PER_LONG /** * count_trailing_zeros - Count the number of zeros from the LSB forwards * @x: The value * * Count the number of trailing zeros from the LSB going towards the MSB in @x. * * If the LSB of @x is set, the result is 0. * If only the MSB of @x is set, then the result is BITS_PER_LONG-1. * If @x is 0 then the result is COUNT_TRAILING_ZEROS_0. */ static inline int count_trailing_zeros(unsigned long x) { #define COUNT_TRAILING_ZEROS_0 (-1) if (sizeof(x) == 4) return ffs(x); else return (x != 0) ? __ffs(x) : COUNT_TRAILING_ZEROS_0; } #endif /* _LINUX_BITOPS_COUNT_ZEROS_H_ */ |
1 1 1 1 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 | // SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2008, Christoph Hellwig * All Rights Reserved. */ #include "xfs.h" #include "xfs_shared.h" #include "xfs_format.h" #include "xfs_log_format.h" #include "xfs_trans_resv.h" #include "xfs_mount.h" #include "xfs_inode.h" #include "xfs_da_format.h" #include "xfs_da_btree.h" #include "xfs_attr.h" #include "xfs_trace.h" #include "xfs_error.h" #include "xfs_acl.h" #include "xfs_trans.h" #include "xfs_xattr.h" #include <linux/posix_acl_xattr.h> /* * Locking scheme: * - all ACL updates are protected by inode->i_mutex, which is taken before * calling into this file. */ STATIC struct posix_acl * xfs_acl_from_disk( struct xfs_mount *mp, const struct xfs_acl *aclp, int len, int max_entries) { struct posix_acl_entry *acl_e; struct posix_acl *acl; const struct xfs_acl_entry *ace; unsigned int count, i; if (len < sizeof(*aclp)) { XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, aclp, len); return ERR_PTR(-EFSCORRUPTED); } count = be32_to_cpu(aclp->acl_cnt); if (count > max_entries || XFS_ACL_SIZE(count) != len) { XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, aclp, len); return ERR_PTR(-EFSCORRUPTED); } acl = posix_acl_alloc(count, GFP_KERNEL); if (!acl) return ERR_PTR(-ENOMEM); for (i = 0; i < count; i++) { acl_e = &acl->a_entries[i]; ace = &aclp->acl_entry[i]; /* * The tag is 32 bits on disk and 16 bits in core. * * Because every access to it goes through the core * format first this is not a problem. */ acl_e->e_tag = be32_to_cpu(ace->ae_tag); acl_e->e_perm = be16_to_cpu(ace->ae_perm); switch (acl_e->e_tag) { case ACL_USER: acl_e->e_uid = make_kuid(&init_user_ns, be32_to_cpu(ace->ae_id)); break; case ACL_GROUP: acl_e->e_gid = make_kgid(&init_user_ns, be32_to_cpu(ace->ae_id)); break; case ACL_USER_OBJ: case ACL_GROUP_OBJ: case ACL_MASK: case ACL_OTHER: break; default: goto fail; } } return acl; fail: posix_acl_release(acl); return ERR_PTR(-EINVAL); } STATIC void xfs_acl_to_disk(struct xfs_acl *aclp, const struct posix_acl *acl) { const struct posix_acl_entry *acl_e; struct xfs_acl_entry *ace; int i; aclp->acl_cnt = cpu_to_be32(acl->a_count); for (i = 0; i < acl->a_count; i++) { ace = &aclp->acl_entry[i]; acl_e = &acl->a_entries[i]; ace->ae_tag = cpu_to_be32(acl_e->e_tag); switch (acl_e->e_tag) { case ACL_USER: ace->ae_id = cpu_to_be32( from_kuid(&init_user_ns, acl_e->e_uid)); break; case ACL_GROUP: ace->ae_id = cpu_to_be32( from_kgid(&init_user_ns, acl_e->e_gid)); break; default: ace->ae_id = cpu_to_be32(ACL_UNDEFINED_ID); break; } ace->ae_perm = cpu_to_be16(acl_e->e_perm); } } struct posix_acl * xfs_get_acl(struct inode *inode, int type, bool rcu) { struct xfs_inode *ip = XFS_I(inode); struct xfs_mount *mp = ip->i_mount; struct posix_acl *acl = NULL; struct xfs_da_args args = { .dp = ip, .attr_filter = XFS_ATTR_ROOT, .valuelen = XFS_ACL_MAX_SIZE(mp), }; int error; if (rcu) return ERR_PTR(-ECHILD); trace_xfs_get_acl(ip); switch (type) { case ACL_TYPE_ACCESS: args.name = SGI_ACL_FILE; break; case ACL_TYPE_DEFAULT: args.name = SGI_ACL_DEFAULT; break; default: BUG(); } args.namelen = strlen(args.name); /* * If the attribute doesn't exist make sure we have a negative cache * entry, for any other error assume it is transient. */ error = xfs_attr_get(&args); if (!error) { acl = xfs_acl_from_disk(mp, args.value, args.valuelen, XFS_ACL_MAX_ENTRIES(mp)); } else if (error != -ENOATTR) { acl = ERR_PTR(error); } kmem_free(args.value); return acl; } int __xfs_set_acl(struct inode *inode, struct posix_acl *acl, int type) { struct xfs_inode *ip = XFS_I(inode); struct xfs_da_args args = { .dp = ip, .attr_filter = XFS_ATTR_ROOT, }; int error; switch (type) { case ACL_TYPE_ACCESS: args.name = SGI_ACL_FILE; break; case ACL_TYPE_DEFAULT: if (!S_ISDIR(inode->i_mode)) return acl ? -EACCES : 0; args.name = SGI_ACL_DEFAULT; break; default: return -EINVAL; } args.namelen = strlen(args.name); if (acl) { args.valuelen = XFS_ACL_SIZE(acl->a_count); args.value = kvzalloc(args.valuelen, GFP_KERNEL); if (!args.value) return -ENOMEM; xfs_acl_to_disk(args.value, acl); } error = xfs_attr_change(&args); kmem_free(args.value); /* * If the attribute didn't exist to start with that's fine. */ if (!acl && error == -ENOATTR) error = 0; if (!error) set_cached_acl(inode, type, acl); return error; } static int xfs_acl_set_mode( struct inode *inode, umode_t mode) { struct xfs_inode *ip = XFS_I(inode); struct xfs_mount *mp = ip->i_mount; struct xfs_trans *tp; int error; error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ichange, 0, 0, 0, &tp); if (error) return error; xfs_ilock(ip, XFS_ILOCK_EXCL); xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); inode->i_mode = mode; inode_set_ctime_current(inode); xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); if (xfs_has_wsync(mp)) xfs_trans_set_sync(tp); return xfs_trans_commit(tp); } int xfs_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, struct posix_acl *acl, int type) { umode_t mode; bool set_mode = false; int error = 0; struct inode *inode = d_inode(dentry); if (!acl) goto set_acl; error = -E2BIG; if (acl->a_count > XFS_ACL_MAX_ENTRIES(XFS_M(inode->i_sb))) return error; if (type == ACL_TYPE_ACCESS) { error = posix_acl_update_mode(idmap, inode, &mode, &acl); if (error) return error; set_mode = true; } set_acl: /* * We set the mode after successfully updating the ACL xattr because the * xattr update can fail at ENOSPC and we don't want to change the mode * if the ACL update hasn't been applied. */ error = __xfs_set_acl(inode, acl, type); if (!error && set_mode && mode != inode->i_mode) error = xfs_acl_set_mode(inode, mode); return error; } /* * Invalidate any cached ACLs if the user has bypassed the ACL interface. * We don't validate the content whatsoever so it is caller responsibility to * provide data in valid format and ensure i_mode is consistent. */ void xfs_forget_acl( struct inode *inode, const char *name) { if (!strcmp(name, SGI_ACL_FILE)) forget_cached_acl(inode, ACL_TYPE_ACCESS); else if (!strcmp(name, SGI_ACL_DEFAULT)) forget_cached_acl(inode, ACL_TYPE_DEFAULT); } |
1 1 1 1 1 1 1 1 1 1 1 1 3 1 2 1 1 1 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 | // SPDX-License-Identifier: GPL-2.0-only /* CAN driver for Geschwister Schneider USB/CAN devices * and bytewerk.org candleLight USB CAN interfaces. * * Copyright (C) 2013-2016 Geschwister Schneider Technologie-, * Entwicklungs- und Vertriebs UG (Haftungsbeschränkt). * Copyright (C) 2016 Hubert Denkmair * Copyright (c) 2023 Pengutronix, Marc Kleine-Budde <kernel@pengutronix.de> * * Many thanks to all socketcan devs! */ #include <linux/bitfield.h> #include <linux/clocksource.h> #include <linux/ethtool.h> #include <linux/init.h> #include <linux/module.h> #include <linux/netdevice.h> #include <linux/signal.h> #include <linux/timecounter.h> #include <linux/units.h> #include <linux/usb.h> #include <linux/workqueue.h> #include <linux/can.h> #include <linux/can/dev.h> #include <linux/can/error.h> #include <linux/can/rx-offload.h> /* Device specific constants */ #define USB_GS_USB_1_VENDOR_ID 0x1d50 #define USB_GS_USB_1_PRODUCT_ID 0x606f #define USB_CANDLELIGHT_VENDOR_ID 0x1209 #define USB_CANDLELIGHT_PRODUCT_ID 0x2323 #define USB_CES_CANEXT_FD_VENDOR_ID 0x1cd2 #define USB_CES_CANEXT_FD_PRODUCT_ID 0x606f #define USB_ABE_CANDEBUGGER_FD_VENDOR_ID 0x16d0 #define USB_ABE_CANDEBUGGER_FD_PRODUCT_ID 0x10b8 #define GS_USB_ENDPOINT_IN 1 #define GS_USB_ENDPOINT_OUT 2 /* Timestamp 32 bit timer runs at 1 MHz (1 µs tick). Worker accounts * for timer overflow (will be after ~71 minutes) */ #define GS_USB_TIMESTAMP_TIMER_HZ (1 * HZ_PER_MHZ) #define GS_USB_TIMESTAMP_WORK_DELAY_SEC 1800 static_assert(GS_USB_TIMESTAMP_WORK_DELAY_SEC < CYCLECOUNTER_MASK(32) / GS_USB_TIMESTAMP_TIMER_HZ / 2); /* Device specific constants */ enum gs_usb_breq { GS_USB_BREQ_HOST_FORMAT = 0, GS_USB_BREQ_BITTIMING, GS_USB_BREQ_MODE, GS_USB_BREQ_BERR, GS_USB_BREQ_BT_CONST, GS_USB_BREQ_DEVICE_CONFIG, GS_USB_BREQ_TIMESTAMP, GS_USB_BREQ_IDENTIFY, GS_USB_BREQ_GET_USER_ID, GS_USB_BREQ_QUIRK_CANTACT_PRO_DATA_BITTIMING = GS_USB_BREQ_GET_USER_ID, GS_USB_BREQ_SET_USER_ID, GS_USB_BREQ_DATA_BITTIMING, GS_USB_BREQ_BT_CONST_EXT, GS_USB_BREQ_SET_TERMINATION, GS_USB_BREQ_GET_TERMINATION, GS_USB_BREQ_GET_STATE, }; enum gs_can_mode { /* reset a channel. turns it off */ GS_CAN_MODE_RESET = 0, /* starts a channel */ GS_CAN_MODE_START }; enum gs_can_state { GS_CAN_STATE_ERROR_ACTIVE = 0, GS_CAN_STATE_ERROR_WARNING, GS_CAN_STATE_ERROR_PASSIVE, GS_CAN_STATE_BUS_OFF, GS_CAN_STATE_STOPPED, GS_CAN_STATE_SLEEPING }; enum gs_can_identify_mode { GS_CAN_IDENTIFY_OFF = 0, GS_CAN_IDENTIFY_ON }; enum gs_can_termination_state { GS_CAN_TERMINATION_STATE_OFF = 0, GS_CAN_TERMINATION_STATE_ON }; #define GS_USB_TERMINATION_DISABLED CAN_TERMINATION_DISABLED #define GS_USB_TERMINATION_ENABLED 120 /* data types passed between host and device */ /* The firmware on the original USB2CAN by Geschwister Schneider * Technologie Entwicklungs- und Vertriebs UG exchanges all data * between the host and the device in host byte order. This is done * with the struct gs_host_config::byte_order member, which is sent * first to indicate the desired byte order. * * The widely used open source firmware candleLight doesn't support * this feature and exchanges the data in little endian byte order. */ struct gs_host_config { __le32 byte_order; } __packed; struct gs_device_config { u8 reserved1; u8 reserved2; u8 reserved3; u8 icount; __le32 sw_version; __le32 hw_version; } __packed; #define GS_CAN_MODE_NORMAL 0 #define GS_CAN_MODE_LISTEN_ONLY BIT(0) #define GS_CAN_MODE_LOOP_BACK BIT(1) #define GS_CAN_MODE_TRIPLE_SAMPLE BIT(2) #define GS_CAN_MODE_ONE_SHOT BIT(3) #define GS_CAN_MODE_HW_TIMESTAMP BIT(4) /* GS_CAN_FEATURE_IDENTIFY BIT(5) */ /* GS_CAN_FEATURE_USER_ID BIT(6) */ #define GS_CAN_MODE_PAD_PKTS_TO_MAX_PKT_SIZE BIT(7) #define GS_CAN_MODE_FD BIT(8) /* GS_CAN_FEATURE_REQ_USB_QUIRK_LPC546XX BIT(9) */ /* GS_CAN_FEATURE_BT_CONST_EXT BIT(10) */ /* GS_CAN_FEATURE_TERMINATION BIT(11) */ #define GS_CAN_MODE_BERR_REPORTING BIT(12) /* GS_CAN_FEATURE_GET_STATE BIT(13) */ struct gs_device_mode { __le32 mode; __le32 flags; } __packed; struct gs_device_state { __le32 state; __le32 rxerr; __le32 txerr; } __packed; struct gs_device_bittiming { __le32 prop_seg; __le32 phase_seg1; __le32 phase_seg2; __le32 sjw; __le32 brp; } __packed; struct gs_identify_mode { __le32 mode; } __packed; struct gs_device_termination_state { __le32 state; } __packed; #define GS_CAN_FEATURE_LISTEN_ONLY BIT(0) #define GS_CAN_FEATURE_LOOP_BACK BIT(1) #define GS_CAN_FEATURE_TRIPLE_SAMPLE BIT(2) #define GS_CAN_FEATURE_ONE_SHOT BIT(3) #define GS_CAN_FEATURE_HW_TIMESTAMP BIT(4) #define GS_CAN_FEATURE_IDENTIFY BIT(5) #define GS_CAN_FEATURE_USER_ID BIT(6) #define GS_CAN_FEATURE_PAD_PKTS_TO_MAX_PKT_SIZE BIT(7) #define GS_CAN_FEATURE_FD BIT(8) #define GS_CAN_FEATURE_REQ_USB_QUIRK_LPC546XX BIT(9) #define GS_CAN_FEATURE_BT_CONST_EXT BIT(10) #define GS_CAN_FEATURE_TERMINATION BIT(11) #define GS_CAN_FEATURE_BERR_REPORTING BIT(12) #define GS_CAN_FEATURE_GET_STATE BIT(13) #define GS_CAN_FEATURE_MASK GENMASK(13, 0) /* internal quirks - keep in GS_CAN_FEATURE space for now */ /* CANtact Pro original firmware: * BREQ DATA_BITTIMING overlaps with GET_USER_ID */ #define GS_CAN_FEATURE_QUIRK_BREQ_CANTACT_PRO BIT(31) struct gs_device_bt_const { __le32 feature; __le32 fclk_can; __le32 tseg1_min; __le32 tseg1_max; __le32 tseg2_min; __le32 tseg2_max; __le32 sjw_max; __le32 brp_min; __le32 brp_max; __le32 brp_inc; } __packed; struct gs_device_bt_const_extended { __le32 feature; __le32 fclk_can; __le32 tseg1_min; __le32 tseg1_max; __le32 tseg2_min; __le32 tseg2_max; __le32 sjw_max; __le32 brp_min; __le32 brp_max; __le32 brp_inc; __le32 dtseg1_min; __le32 dtseg1_max; __le32 dtseg2_min; __le32 dtseg2_max; __le32 dsjw_max; __le32 dbrp_min; __le32 dbrp_max; __le32 dbrp_inc; } __packed; #define GS_CAN_FLAG_OVERFLOW BIT(0) #define GS_CAN_FLAG_FD BIT(1) #define GS_CAN_FLAG_BRS BIT(2) #define GS_CAN_FLAG_ESI BIT(3) struct classic_can { u8 data[8]; } __packed; struct classic_can_ts { u8 data[8]; __le32 timestamp_us; } __packed; struct classic_can_quirk { u8 data[8]; u8 quirk; } __packed; struct canfd { u8 data[64]; } __packed; struct canfd_ts { u8 data[64]; __le32 timestamp_us; } __packed; struct canfd_quirk { u8 data[64]; u8 quirk; } __packed; struct gs_host_frame { u32 echo_id; __le32 can_id; u8 can_dlc; u8 channel; u8 flags; u8 reserved; union { DECLARE_FLEX_ARRAY(struct classic_can, classic_can); DECLARE_FLEX_ARRAY(struct classic_can_ts, classic_can_ts); DECLARE_FLEX_ARRAY(struct classic_can_quirk, classic_can_quirk); DECLARE_FLEX_ARRAY(struct canfd, canfd); DECLARE_FLEX_ARRAY(struct canfd_ts, canfd_ts); DECLARE_FLEX_ARRAY(struct canfd_quirk, canfd_quirk); }; } __packed; /* The GS USB devices make use of the same flags and masks as in * linux/can.h and linux/can/error.h, and no additional mapping is necessary. */ /* Only send a max of GS_MAX_TX_URBS frames per channel at a time. */ #define GS_MAX_TX_URBS 10 /* Only launch a max of GS_MAX_RX_URBS usb requests at a time. */ #define GS_MAX_RX_URBS 30 #define GS_NAPI_WEIGHT 32 /* Maximum number of interfaces the driver supports per device. * Current hardware only supports 3 interfaces. The future may vary. */ #define GS_MAX_INTF 3 struct gs_tx_context { struct gs_can *dev; unsigned int echo_id; }; struct gs_can { struct can_priv can; /* must be the first member */ struct can_rx_offload offload; struct gs_usb *parent; struct net_device *netdev; struct usb_device *udev; struct can_bittiming_const bt_const, data_bt_const; unsigned int channel; /* channel number */ u32 feature; unsigned int hf_size_tx; /* This lock prevents a race condition between xmit and receive. */ spinlock_t tx_ctx_lock; struct gs_tx_context tx_context[GS_MAX_TX_URBS]; struct usb_anchor tx_submitted; atomic_t active_tx_urbs; }; /* usb interface struct */ struct gs_usb { struct gs_can *canch[GS_MAX_INTF]; struct usb_anchor rx_submitted; struct usb_device *udev; /* time counter for hardware timestamps */ struct cyclecounter cc; struct timecounter tc; spinlock_t tc_lock; /* spinlock to guard access tc->cycle_last */ struct delayed_work timestamp; unsigned int hf_size_rx; u8 active_channels; }; /* 'allocate' a tx context. * returns a valid tx context or NULL if there is no space. */ static struct gs_tx_context *gs_alloc_tx_context(struct gs_can *dev) { int i = 0; unsigned long flags; spin_lock_irqsave(&dev->tx_ctx_lock, flags); for (; i < GS_MAX_TX_URBS; i++) { if (dev->tx_context[i].echo_id == GS_MAX_TX_URBS) { dev->tx_context[i].echo_id = i; spin_unlock_irqrestore(&dev->tx_ctx_lock, flags); return &dev->tx_context[i]; } } spin_unlock_irqrestore(&dev->tx_ctx_lock, flags); return NULL; } /* releases a tx context */ static void gs_free_tx_context(struct gs_tx_context *txc) { txc->echo_id = GS_MAX_TX_URBS; } /* Get a tx context by id. */ static struct gs_tx_context *gs_get_tx_context(struct gs_can *dev, unsigned int id) { unsigned long flags; if (id < GS_MAX_TX_URBS) { spin_lock_irqsave(&dev->tx_ctx_lock, flags); if (dev->tx_context[id].echo_id == id) { spin_unlock_irqrestore(&dev->tx_ctx_lock, flags); return &dev->tx_context[id]; } spin_unlock_irqrestore(&dev->tx_ctx_lock, flags); } return NULL; } static int gs_cmd_reset(struct gs_can *dev) { struct gs_device_mode dm = { .mode = GS_CAN_MODE_RESET, }; return usb_control_msg_send(dev->udev, 0, GS_USB_BREQ_MODE, USB_DIR_OUT | USB_TYPE_VENDOR | USB_RECIP_INTERFACE, dev->channel, 0, &dm, sizeof(dm), 1000, GFP_KERNEL); } static inline int gs_usb_get_timestamp(const struct gs_usb *parent, u32 *timestamp_p) { __le32 timestamp; int rc; rc = usb_control_msg_recv(parent->udev, 0, GS_USB_BREQ_TIMESTAMP, USB_DIR_IN | USB_TYPE_VENDOR | USB_RECIP_INTERFACE, 0, 0, ×tamp, sizeof(timestamp), USB_CTRL_GET_TIMEOUT, GFP_KERNEL); if (rc) return rc; *timestamp_p = le32_to_cpu(timestamp); return 0; } static u64 gs_usb_timestamp_read(const struct cyclecounter *cc) __must_hold(&dev->tc_lock) { struct gs_usb *parent = container_of(cc, struct gs_usb, cc); u32 timestamp = 0; int err; lockdep_assert_held(&parent->tc_lock); /* drop lock for synchronous USB transfer */ spin_unlock_bh(&parent->tc_lock); err = gs_usb_get_timestamp(parent, ×tamp); spin_lock_bh(&parent->tc_lock); if (err) dev_err(&parent->udev->dev, "Error %d while reading timestamp. HW timestamps may be inaccurate.", err); return timestamp; } static void gs_usb_timestamp_work(struct work_struct *work) { struct delayed_work *delayed_work = to_delayed_work(work); struct gs_usb *parent; parent = container_of(delayed_work, struct gs_usb, timestamp); spin_lock_bh(&parent->tc_lock); timecounter_read(&parent->tc); spin_unlock_bh(&parent->tc_lock); schedule_delayed_work(&parent->timestamp, GS_USB_TIMESTAMP_WORK_DELAY_SEC * HZ); } static void gs_usb_skb_set_timestamp(struct gs_can *dev, struct sk_buff *skb, u32 timestamp) { struct skb_shared_hwtstamps *hwtstamps = skb_hwtstamps(skb); struct gs_usb *parent = dev->parent; u64 ns; spin_lock_bh(&parent->tc_lock); ns = timecounter_cyc2time(&parent->tc, timestamp); spin_unlock_bh(&parent->tc_lock); hwtstamps->hwtstamp = ns_to_ktime(ns); } static void gs_usb_timestamp_init(struct gs_usb *parent) { struct cyclecounter *cc = &parent->cc; cc->read = gs_usb_timestamp_read; cc->mask = CYCLECOUNTER_MASK(32); cc->shift = 32 - bits_per(NSEC_PER_SEC / GS_USB_TIMESTAMP_TIMER_HZ); cc->mult = clocksource_hz2mult(GS_USB_TIMESTAMP_TIMER_HZ, cc->shift); spin_lock_init(&parent->tc_lock); spin_lock_bh(&parent->tc_lock); timecounter_init(&parent->tc, &parent->cc, ktime_get_real_ns()); spin_unlock_bh(&parent->tc_lock); INIT_DELAYED_WORK(&parent->timestamp, gs_usb_timestamp_work); schedule_delayed_work(&parent->timestamp, GS_USB_TIMESTAMP_WORK_DELAY_SEC * HZ); } static void gs_usb_timestamp_stop(struct gs_usb *parent) { cancel_delayed_work_sync(&parent->timestamp); } static void gs_update_state(struct gs_can *dev, struct can_frame *cf) { struct can_device_stats *can_stats = &dev->can.can_stats; if (cf->can_id & CAN_ERR_RESTARTED) { dev->can.state = CAN_STATE_ERROR_ACTIVE; can_stats->restarts++; } else if (cf->can_id & CAN_ERR_BUSOFF) { dev->can.state = CAN_STATE_BUS_OFF; can_stats->bus_off++; } else if (cf->can_id & CAN_ERR_CRTL) { if ((cf->data[1] & CAN_ERR_CRTL_TX_WARNING) || (cf->data[1] & CAN_ERR_CRTL_RX_WARNING)) { dev->can.state = CAN_STATE_ERROR_WARNING; can_stats->error_warning++; } else if ((cf->data[1] & CAN_ERR_CRTL_TX_PASSIVE) || (cf->data[1] & CAN_ERR_CRTL_RX_PASSIVE)) { dev->can.state = CAN_STATE_ERROR_PASSIVE; can_stats->error_passive++; } else { dev->can.state = CAN_STATE_ERROR_ACTIVE; } } } static u32 gs_usb_set_timestamp(struct gs_can *dev, struct sk_buff *skb, const struct gs_host_frame *hf) { u32 timestamp; if (hf->flags & GS_CAN_FLAG_FD) timestamp = le32_to_cpu(hf->canfd_ts->timestamp_us); else timestamp = le32_to_cpu(hf->classic_can_ts->timestamp_us); if (skb) gs_usb_skb_set_timestamp(dev, skb, timestamp); return timestamp; } static void gs_usb_rx_offload(struct gs_can *dev, struct sk_buff *skb, const struct gs_host_frame *hf) { struct can_rx_offload *offload = &dev->offload; int rc; if (dev->feature & GS_CAN_FEATURE_HW_TIMESTAMP) { const u32 ts = gs_usb_set_timestamp(dev, skb, hf); rc = can_rx_offload_queue_timestamp(offload, skb, ts); } else { rc = can_rx_offload_queue_tail(offload, skb); } if (rc) dev->netdev->stats.rx_fifo_errors++; } static unsigned int gs_usb_get_echo_skb(struct gs_can *dev, struct sk_buff *skb, const struct gs_host_frame *hf) { struct can_rx_offload *offload = &dev->offload; const u32 echo_id = hf->echo_id; unsigned int len; if (dev->feature & GS_CAN_FEATURE_HW_TIMESTAMP) { const u32 ts = gs_usb_set_timestamp(dev, skb, hf); len = can_rx_offload_get_echo_skb_queue_timestamp(offload, echo_id, ts, NULL); } else { len = can_rx_offload_get_echo_skb_queue_tail(offload, echo_id, NULL); } return len; } static void gs_usb_receive_bulk_callback(struct urb *urb) { struct gs_usb *parent = urb->context; struct gs_can *dev; struct net_device *netdev; int rc; struct net_device_stats *stats; struct gs_host_frame *hf = urb->transfer_buffer; struct gs_tx_context *txc; struct can_frame *cf; struct canfd_frame *cfd; struct sk_buff *skb; BUG_ON(!parent); switch (urb->status) { case 0: /* success */ break; case -ENOENT: case -ESHUTDOWN: return; default: /* do not resubmit aborted urbs. eg: when device goes down */ return; } /* device reports out of range channel id */ if (hf->channel >= GS_MAX_INTF) goto device_detach; dev = parent->canch[hf->channel]; netdev = dev->netdev; stats = &netdev->stats; if (!netif_device_present(netdev)) return; if (!netif_running(netdev)) goto resubmit_urb; if (hf->echo_id == -1) { /* normal rx */ if (hf->flags & GS_CAN_FLAG_FD) { skb = alloc_canfd_skb(netdev, &cfd); if (!skb) return; cfd->can_id = le32_to_cpu(hf->can_id); cfd->len = can_fd_dlc2len(hf->can_dlc); if (hf->flags & GS_CAN_FLAG_BRS) cfd->flags |= CANFD_BRS; if (hf->flags & GS_CAN_FLAG_ESI) cfd->flags |= CANFD_ESI; memcpy(cfd->data, hf->canfd->data, cfd->len); } else { skb = alloc_can_skb(netdev, &cf); if (!skb) return; cf->can_id = le32_to_cpu(hf->can_id); can_frame_set_cc_len(cf, hf->can_dlc, dev->can.ctrlmode); memcpy(cf->data, hf->classic_can->data, 8); /* ERROR frames tell us information about the controller */ if (le32_to_cpu(hf->can_id) & CAN_ERR_FLAG) gs_update_state(dev, cf); } gs_usb_rx_offload(dev, skb, hf); } else { /* echo_id == hf->echo_id */ if (hf->echo_id >= GS_MAX_TX_URBS) { netdev_err(netdev, "Unexpected out of range echo id %u\n", hf->echo_id); goto resubmit_urb; } txc = gs_get_tx_context(dev, hf->echo_id); /* bad devices send bad echo_ids. */ if (!txc) { netdev_err(netdev, "Unexpected unused echo id %u\n", hf->echo_id); goto resubmit_urb; } skb = dev->can.echo_skb[hf->echo_id]; stats->tx_packets++; stats->tx_bytes += gs_usb_get_echo_skb(dev, skb, hf); gs_free_tx_context(txc); atomic_dec(&dev->active_tx_urbs); netif_wake_queue(netdev); } if (hf->flags & GS_CAN_FLAG_OVERFLOW) { stats->rx_over_errors++; stats->rx_errors++; skb = alloc_can_err_skb(netdev, &cf); if (!skb) goto resubmit_urb; cf->can_id |= CAN_ERR_CRTL; cf->len = CAN_ERR_DLC; cf->data[1] = CAN_ERR_CRTL_RX_OVERFLOW; gs_usb_rx_offload(dev, skb, hf); } can_rx_offload_irq_finish(&dev->offload); resubmit_urb: usb_fill_bulk_urb(urb, parent->udev, usb_rcvbulkpipe(parent->udev, GS_USB_ENDPOINT_IN), hf, dev->parent->hf_size_rx, gs_usb_receive_bulk_callback, parent); rc = usb_submit_urb(urb, GFP_ATOMIC); /* USB failure take down all interfaces */ if (rc == -ENODEV) { device_detach: for (rc = 0; rc < GS_MAX_INTF; rc++) { if (parent->canch[rc]) netif_device_detach(parent->canch[rc]->netdev); } } } static int gs_usb_set_bittiming(struct net_device *netdev) { struct gs_can *dev = netdev_priv(netdev); struct can_bittiming *bt = &dev->can.bittiming; struct gs_device_bittiming dbt = { .prop_seg = cpu_to_le32(bt->prop_seg), .phase_seg1 = cpu_to_le32(bt->phase_seg1), .phase_seg2 = cpu_to_le32(bt->phase_seg2), .sjw = cpu_to_le32(bt->sjw), .brp = cpu_to_le32(bt->brp), }; /* request bit timings */ return usb_control_msg_send(dev->udev, 0, GS_USB_BREQ_BITTIMING, USB_DIR_OUT | USB_TYPE_VENDOR | USB_RECIP_INTERFACE, dev->channel, 0, &dbt, sizeof(dbt), 1000, GFP_KERNEL); } static int gs_usb_set_data_bittiming(struct net_device *netdev) { struct gs_can *dev = netdev_priv(netdev); struct can_bittiming *bt = &dev->can.data_bittiming; struct gs_device_bittiming dbt = { .prop_seg = cpu_to_le32(bt->prop_seg), .phase_seg1 = cpu_to_le32(bt->phase_seg1), .phase_seg2 = cpu_to_le32(bt->phase_seg2), .sjw = cpu_to_le32(bt->sjw), .brp = cpu_to_le32(bt->brp), }; u8 request = GS_USB_BREQ_DATA_BITTIMING; if (dev->feature & GS_CAN_FEATURE_QUIRK_BREQ_CANTACT_PRO) request = GS_USB_BREQ_QUIRK_CANTACT_PRO_DATA_BITTIMING; /* request data bit timings */ return usb_control_msg_send(dev->udev, 0, request, USB_DIR_OUT | USB_TYPE_VENDOR | USB_RECIP_INTERFACE, dev->channel, 0, &dbt, sizeof(dbt), 1000, GFP_KERNEL); } static void gs_usb_xmit_callback(struct urb *urb) { struct gs_tx_context *txc = urb->context; struct gs_can *dev = txc->dev; struct net_device *netdev = dev->netdev; if (urb->status) netdev_info(netdev, "usb xmit fail %u\n", txc->echo_id); } static netdev_tx_t gs_can_start_xmit(struct sk_buff *skb, struct net_device *netdev) { struct gs_can *dev = netdev_priv(netdev); struct net_device_stats *stats = &dev->netdev->stats; struct urb *urb; struct gs_host_frame *hf; struct can_frame *cf; struct canfd_frame *cfd; int rc; unsigned int idx; struct gs_tx_context *txc; if (can_dev_dropped_skb(netdev, skb)) return NETDEV_TX_OK; /* find an empty context to keep track of transmission */ txc = gs_alloc_tx_context(dev); if (!txc) return NETDEV_TX_BUSY; /* create a URB, and a buffer for it */ urb = usb_alloc_urb(0, GFP_ATOMIC); if (!urb) goto nomem_urb; hf = kmalloc(dev->hf_size_tx, GFP_ATOMIC); if (!hf) goto nomem_hf; idx = txc->echo_id; if (idx >= GS_MAX_TX_URBS) { netdev_err(netdev, "Invalid tx context %u\n", idx); goto badidx; } hf->echo_id = idx; hf->channel = dev->channel; hf->flags = 0; hf->reserved = 0; if (can_is_canfd_skb(skb)) { cfd = (struct canfd_frame *)skb->data; hf->can_id = cpu_to_le32(cfd->can_id); hf->can_dlc = can_fd_len2dlc(cfd->len); hf->flags |= GS_CAN_FLAG_FD; if (cfd->flags & CANFD_BRS) hf->flags |= GS_CAN_FLAG_BRS; if (cfd->flags & CANFD_ESI) hf->flags |= GS_CAN_FLAG_ESI; memcpy(hf->canfd->data, cfd->data, cfd->len); } else { cf = (struct can_frame *)skb->data; hf->can_id = cpu_to_le32(cf->can_id); hf->can_dlc = can_get_cc_dlc(cf, dev->can.ctrlmode); memcpy(hf->classic_can->data, cf->data, cf->len); } usb_fill_bulk_urb(urb, dev->udev, usb_sndbulkpipe(dev->udev, GS_USB_ENDPOINT_OUT), hf, dev->hf_size_tx, gs_usb_xmit_callback, txc); urb->transfer_flags |= URB_FREE_BUFFER; usb_anchor_urb(urb, &dev->tx_submitted); can_put_echo_skb(skb, netdev, idx, 0); atomic_inc(&dev->active_tx_urbs); rc = usb_submit_urb(urb, GFP_ATOMIC); if (unlikely(rc)) { /* usb send failed */ atomic_dec(&dev->active_tx_urbs); can_free_echo_skb(netdev, idx, NULL); gs_free_tx_context(txc); usb_unanchor_urb(urb); if (rc == -ENODEV) { netif_device_detach(netdev); } else { netdev_err(netdev, "usb_submit failed (err=%d)\n", rc); stats->tx_dropped++; } } else { /* Slow down tx path */ if (atomic_read(&dev->active_tx_urbs) >= GS_MAX_TX_URBS) netif_stop_queue(netdev); } /* let usb core take care of this urb */ usb_free_urb(urb); return NETDEV_TX_OK; badidx: kfree(hf); nomem_hf: usb_free_urb(urb); nomem_urb: gs_free_tx_context(txc); dev_kfree_skb(skb); stats->tx_dropped++; return NETDEV_TX_OK; } static int gs_can_open(struct net_device *netdev) { struct gs_can *dev = netdev_priv(netdev); struct gs_usb *parent = dev->parent; struct gs_device_mode dm = { .mode = cpu_to_le32(GS_CAN_MODE_START), }; struct gs_host_frame *hf; struct urb *urb = NULL; u32 ctrlmode; u32 flags = 0; int rc, i; rc = open_candev(netdev); if (rc) return rc; ctrlmode = dev->can.ctrlmode; if (ctrlmode & CAN_CTRLMODE_FD) { if (dev->feature & GS_CAN_FEATURE_REQ_USB_QUIRK_LPC546XX) dev->hf_size_tx = struct_size(hf, canfd_quirk, 1); else dev->hf_size_tx = struct_size(hf, canfd, 1); } else { if (dev->feature & GS_CAN_FEATURE_REQ_USB_QUIRK_LPC546XX) dev->hf_size_tx = struct_size(hf, classic_can_quirk, 1); else dev->hf_size_tx = struct_size(hf, classic_can, 1); } can_rx_offload_enable(&dev->offload); if (!parent->active_channels) { if (dev->feature & GS_CAN_FEATURE_HW_TIMESTAMP) gs_usb_timestamp_init(parent); for (i = 0; i < GS_MAX_RX_URBS; i++) { u8 *buf; /* alloc rx urb */ urb = usb_alloc_urb(0, GFP_KERNEL); if (!urb) { rc = -ENOMEM; goto out_usb_kill_anchored_urbs; } /* alloc rx buffer */ buf = kmalloc(dev->parent->hf_size_rx, GFP_KERNEL); if (!buf) { rc = -ENOMEM; goto out_usb_free_urb; } /* fill, anchor, and submit rx urb */ usb_fill_bulk_urb(urb, dev->udev, usb_rcvbulkpipe(dev->udev, GS_USB_ENDPOINT_IN), buf, dev->parent->hf_size_rx, gs_usb_receive_bulk_callback, parent); urb->transfer_flags |= URB_FREE_BUFFER; usb_anchor_urb(urb, &parent->rx_submitted); rc = usb_submit_urb(urb, GFP_KERNEL); if (rc) { if (rc == -ENODEV) netif_device_detach(dev->netdev); netdev_err(netdev, "usb_submit_urb() failed, error %pe\n", ERR_PTR(rc)); goto out_usb_unanchor_urb; } /* Drop reference, * USB core will take care of freeing it */ usb_free_urb(urb); } } /* flags */ if (ctrlmode & CAN_CTRLMODE_LOOPBACK) flags |= GS_CAN_MODE_LOOP_BACK; if (ctrlmode & CAN_CTRLMODE_LISTENONLY) flags |= GS_CAN_MODE_LISTEN_ONLY; if (ctrlmode & CAN_CTRLMODE_3_SAMPLES) flags |= GS_CAN_MODE_TRIPLE_SAMPLE; if (ctrlmode & CAN_CTRLMODE_ONE_SHOT) flags |= GS_CAN_MODE_ONE_SHOT; if (ctrlmode & CAN_CTRLMODE_BERR_REPORTING) flags |= GS_CAN_MODE_BERR_REPORTING; if (ctrlmode & CAN_CTRLMODE_FD) flags |= GS_CAN_MODE_FD; /* if hardware supports timestamps, enable it */ if (dev->feature & GS_CAN_FEATURE_HW_TIMESTAMP) flags |= GS_CAN_MODE_HW_TIMESTAMP; /* finally start device */ dev->can.state = CAN_STATE_ERROR_ACTIVE; dm.flags = cpu_to_le32(flags); rc = usb_control_msg_send(dev->udev, 0, GS_USB_BREQ_MODE, USB_DIR_OUT | USB_TYPE_VENDOR | USB_RECIP_INTERFACE, dev->channel, 0, &dm, sizeof(dm), 1000, GFP_KERNEL); if (rc) { netdev_err(netdev, "Couldn't start device (err=%d)\n", rc); dev->can.state = CAN_STATE_STOPPED; goto out_usb_kill_anchored_urbs; } parent->active_channels++; if (!(dev->can.ctrlmode & CAN_CTRLMODE_LISTENONLY)) netif_start_queue(netdev); return 0; out_usb_unanchor_urb: usb_unanchor_urb(urb); out_usb_free_urb: usb_free_urb(urb); out_usb_kill_anchored_urbs: if (!parent->active_channels) { usb_kill_anchored_urbs(&dev->tx_submitted); if (dev->feature & GS_CAN_FEATURE_HW_TIMESTAMP) gs_usb_timestamp_stop(parent); } can_rx_offload_disable(&dev->offload); close_candev(netdev); return rc; } static int gs_usb_get_state(const struct net_device *netdev, struct can_berr_counter *bec, enum can_state *state) { struct gs_can *dev = netdev_priv(netdev); struct gs_device_state ds; int rc; rc = usb_control_msg_recv(dev->udev, 0, GS_USB_BREQ_GET_STATE, USB_DIR_IN | USB_TYPE_VENDOR | USB_RECIP_INTERFACE, dev->channel, 0, &ds, sizeof(ds), USB_CTRL_GET_TIMEOUT, GFP_KERNEL); if (rc) return rc; if (le32_to_cpu(ds.state) >= CAN_STATE_MAX) return -EOPNOTSUPP; *state = le32_to_cpu(ds.state); bec->txerr = le32_to_cpu(ds.txerr); bec->rxerr = le32_to_cpu(ds.rxerr); return 0; } static int gs_usb_can_get_berr_counter(const struct net_device *netdev, struct can_berr_counter *bec) { enum can_state state; return gs_usb_get_state(netdev, bec, &state); } static int gs_can_close(struct net_device *netdev) { int rc; struct gs_can *dev = netdev_priv(netdev); struct gs_usb *parent = dev->parent; netif_stop_queue(netdev); /* Stop polling */ parent->active_channels--; if (!parent->active_channels) { usb_kill_anchored_urbs(&parent->rx_submitted); if (dev->feature & GS_CAN_FEATURE_HW_TIMESTAMP) gs_usb_timestamp_stop(parent); } /* Stop sending URBs */ usb_kill_anchored_urbs(&dev->tx_submitted); atomic_set(&dev->active_tx_urbs, 0); dev->can.state = CAN_STATE_STOPPED; /* reset the device */ gs_cmd_reset(dev); /* reset tx contexts */ for (rc = 0; rc < GS_MAX_TX_URBS; rc++) { dev->tx_context[rc].dev = dev; dev->tx_context[rc].echo_id = GS_MAX_TX_URBS; } can_rx_offload_disable(&dev->offload); /* close the netdev */ close_candev(netdev); return 0; } static int gs_can_eth_ioctl(struct net_device *netdev, struct ifreq *ifr, int cmd) { const struct gs_can *dev = netdev_priv(netdev); if (dev->feature & GS_CAN_FEATURE_HW_TIMESTAMP) return can_eth_ioctl_hwts(netdev, ifr, cmd); return -EOPNOTSUPP; } static const struct net_device_ops gs_usb_netdev_ops = { .ndo_open = gs_can_open, .ndo_stop = gs_can_close, .ndo_start_xmit = gs_can_start_xmit, .ndo_change_mtu = can_change_mtu, .ndo_eth_ioctl = gs_can_eth_ioctl, }; static int gs_usb_set_identify(struct net_device *netdev, bool do_identify) { struct gs_can *dev = netdev_priv(netdev); struct gs_identify_mode imode; if (do_identify) imode.mode = cpu_to_le32(GS_CAN_IDENTIFY_ON); else imode.mode = cpu_to_le32(GS_CAN_IDENTIFY_OFF); return usb_control_msg_send(dev->udev, 0, GS_USB_BREQ_IDENTIFY, USB_DIR_OUT | USB_TYPE_VENDOR | USB_RECIP_INTERFACE, dev->channel, 0, &imode, sizeof(imode), 100, GFP_KERNEL); } /* blink LED's for finding the this interface */ static int gs_usb_set_phys_id(struct net_device *netdev, enum ethtool_phys_id_state state) { const struct gs_can *dev = netdev_priv(netdev); int rc = 0; if (!(dev->feature & GS_CAN_FEATURE_IDENTIFY)) return -EOPNOTSUPP; switch (state) { case ETHTOOL_ID_ACTIVE: rc = gs_usb_set_identify(netdev, GS_CAN_IDENTIFY_ON); break; case ETHTOOL_ID_INACTIVE: rc = gs_usb_set_identify(netdev, GS_CAN_IDENTIFY_OFF); break; default: break; } return rc; } static int gs_usb_get_ts_info(struct net_device *netdev, struct ethtool_ts_info *info) { struct gs_can *dev = netdev_priv(netdev); /* report if device supports HW timestamps */ if (dev->feature & GS_CAN_FEATURE_HW_TIMESTAMP) return can_ethtool_op_get_ts_info_hwts(netdev, info); return ethtool_op_get_ts_info(netdev, info); } static const struct ethtool_ops gs_usb_ethtool_ops = { .set_phys_id = gs_usb_set_phys_id, .get_ts_info = gs_usb_get_ts_info, }; static int gs_usb_get_termination(struct net_device *netdev, u16 *term) { struct gs_can *dev = netdev_priv(netdev); struct gs_device_termination_state term_state; int rc; rc = usb_control_msg_recv(dev->udev, 0, GS_USB_BREQ_GET_TERMINATION, USB_DIR_IN | USB_TYPE_VENDOR | USB_RECIP_INTERFACE, dev->channel, 0, &term_state, sizeof(term_state), 1000, GFP_KERNEL); if (rc) return rc; if (term_state.state == cpu_to_le32(GS_CAN_TERMINATION_STATE_ON)) *term = GS_USB_TERMINATION_ENABLED; else *term = GS_USB_TERMINATION_DISABLED; return 0; } static int gs_usb_set_termination(struct net_device *netdev, u16 term) { struct gs_can *dev = netdev_priv(netdev); struct gs_device_termination_state term_state; if (term == GS_USB_TERMINATION_ENABLED) term_state.state = cpu_to_le32(GS_CAN_TERMINATION_STATE_ON); else term_state.state = cpu_to_le32(GS_CAN_TERMINATION_STATE_OFF); return usb_control_msg_send(dev->udev, 0, GS_USB_BREQ_SET_TERMINATION, USB_DIR_OUT | USB_TYPE_VENDOR | USB_RECIP_INTERFACE, dev->channel, 0, &term_state, sizeof(term_state), 1000, GFP_KERNEL); } static const u16 gs_usb_termination_const[] = { GS_USB_TERMINATION_DISABLED, GS_USB_TERMINATION_ENABLED }; static struct gs_can *gs_make_candev(unsigned int channel, struct usb_interface *intf, struct gs_device_config *dconf) { struct gs_can *dev; struct net_device *netdev; int rc; struct gs_device_bt_const_extended bt_const_extended; struct gs_device_bt_const bt_const; u32 feature; /* fetch bit timing constants */ rc = usb_control_msg_recv(interface_to_usbdev(intf), 0, GS_USB_BREQ_BT_CONST, USB_DIR_IN | USB_TYPE_VENDOR | USB_RECIP_INTERFACE, channel, 0, &bt_const, sizeof(bt_const), 1000, GFP_KERNEL); if (rc) { dev_err(&intf->dev, "Couldn't get bit timing const for channel %d (%pe)\n", channel, ERR_PTR(rc)); return ERR_PTR(rc); } /* create netdev */ netdev = alloc_candev(sizeof(struct gs_can), GS_MAX_TX_URBS); if (!netdev) { dev_err(&intf->dev, "Couldn't allocate candev\n"); return ERR_PTR(-ENOMEM); } dev = netdev_priv(netdev); netdev->netdev_ops = &gs_usb_netdev_ops; netdev->ethtool_ops = &gs_usb_ethtool_ops; netdev->flags |= IFF_ECHO; /* we support full roundtrip echo */ netdev->dev_id = channel; /* dev setup */ strcpy(dev->bt_const.name, KBUILD_MODNAME); dev->bt_const.tseg1_min = le32_to_cpu(bt_const.tseg1_min); dev->bt_const.tseg1_max = le32_to_cpu(bt_const.tseg1_max); dev->bt_const.tseg2_min = le32_to_cpu(bt_const.tseg2_min); dev->bt_const.tseg2_max = le32_to_cpu(bt_const.tseg2_max); dev->bt_const.sjw_max = le32_to_cpu(bt_const.sjw_max); dev->bt_const.brp_min = le32_to_cpu(bt_const.brp_min); dev->bt_const.brp_max = le32_to_cpu(bt_const.brp_max); dev->bt_const.brp_inc = le32_to_cpu(bt_const.brp_inc); dev->udev = interface_to_usbdev(intf); dev->netdev = netdev; dev->channel = channel; init_usb_anchor(&dev->tx_submitted); atomic_set(&dev->active_tx_urbs, 0); spin_lock_init(&dev->tx_ctx_lock); for (rc = 0; rc < GS_MAX_TX_URBS; rc++) { dev->tx_context[rc].dev = dev; dev->tx_context[rc].echo_id = GS_MAX_TX_URBS; } /* can setup */ dev->can.state = CAN_STATE_STOPPED; dev->can.clock.freq = le32_to_cpu(bt_const.fclk_can); dev->can.bittiming_const = &dev->bt_const; dev->can.do_set_bittiming = gs_usb_set_bittiming; dev->can.ctrlmode_supported = CAN_CTRLMODE_CC_LEN8_DLC; feature = le32_to_cpu(bt_const.feature); dev->feature = FIELD_GET(GS_CAN_FEATURE_MASK, feature); if (feature & GS_CAN_FEATURE_LISTEN_ONLY) dev->can.ctrlmode_supported |= CAN_CTRLMODE_LISTENONLY; if (feature & GS_CAN_FEATURE_LOOP_BACK) dev->can.ctrlmode_supported |= CAN_CTRLMODE_LOOPBACK; if (feature & GS_CAN_FEATURE_TRIPLE_SAMPLE) dev->can.ctrlmode_supported |= CAN_CTRLMODE_3_SAMPLES; if (feature & GS_CAN_FEATURE_ONE_SHOT) dev->can.ctrlmode_supported |= CAN_CTRLMODE_ONE_SHOT; if (feature & GS_CAN_FEATURE_FD) { dev->can.ctrlmode_supported |= CAN_CTRLMODE_FD; /* The data bit timing will be overwritten, if * GS_CAN_FEATURE_BT_CONST_EXT is set. */ dev->can.data_bittiming_const = &dev->bt_const; dev->can.do_set_data_bittiming = gs_usb_set_data_bittiming; } if (feature & GS_CAN_FEATURE_TERMINATION) { rc = gs_usb_get_termination(netdev, &dev->can.termination); if (rc) { dev->feature &= ~GS_CAN_FEATURE_TERMINATION; dev_info(&intf->dev, "Disabling termination support for channel %d (%pe)\n", channel, ERR_PTR(rc)); } else { dev->can.termination_const = gs_usb_termination_const; dev->can.termination_const_cnt = ARRAY_SIZE(gs_usb_termination_const); dev->can.do_set_termination = gs_usb_set_termination; } } if (feature & GS_CAN_FEATURE_BERR_REPORTING) dev->can.ctrlmode_supported |= CAN_CTRLMODE_BERR_REPORTING; if (feature & GS_CAN_FEATURE_GET_STATE) dev->can.do_get_berr_counter = gs_usb_can_get_berr_counter; /* The CANtact Pro from LinkLayer Labs is based on the * LPC54616 µC, which is affected by the NXP LPC USB transfer * erratum. However, the current firmware (version 2) doesn't * set the GS_CAN_FEATURE_REQ_USB_QUIRK_LPC546XX bit. Set the * feature GS_CAN_FEATURE_REQ_USB_QUIRK_LPC546XX to workaround * this issue. * * For the GS_USB_BREQ_DATA_BITTIMING USB control message the * CANtact Pro firmware uses a request value, which is already * used by the candleLight firmware for a different purpose * (GS_USB_BREQ_GET_USER_ID). Set the feature * GS_CAN_FEATURE_QUIRK_BREQ_CANTACT_PRO to workaround this * issue. */ if (dev->udev->descriptor.idVendor == cpu_to_le16(USB_GS_USB_1_VENDOR_ID) && dev->udev->descriptor.idProduct == cpu_to_le16(USB_GS_USB_1_PRODUCT_ID) && dev->udev->manufacturer && dev->udev->product && !strcmp(dev->udev->manufacturer, "LinkLayer Labs") && !strcmp(dev->udev->product, "CANtact Pro") && (le32_to_cpu(dconf->sw_version) <= 2)) dev->feature |= GS_CAN_FEATURE_REQ_USB_QUIRK_LPC546XX | GS_CAN_FEATURE_QUIRK_BREQ_CANTACT_PRO; /* GS_CAN_FEATURE_IDENTIFY is only supported for sw_version > 1 */ if (!(le32_to_cpu(dconf->sw_version) > 1 && feature & GS_CAN_FEATURE_IDENTIFY)) dev->feature &= ~GS_CAN_FEATURE_IDENTIFY; /* fetch extended bit timing constants if device has feature * GS_CAN_FEATURE_FD and GS_CAN_FEATURE_BT_CONST_EXT */ if (feature & GS_CAN_FEATURE_FD && feature & GS_CAN_FEATURE_BT_CONST_EXT) { rc = usb_control_msg_recv(interface_to_usbdev(intf), 0, GS_USB_BREQ_BT_CONST_EXT, USB_DIR_IN | USB_TYPE_VENDOR | USB_RECIP_INTERFACE, channel, 0, &bt_const_extended, sizeof(bt_const_extended), 1000, GFP_KERNEL); if (rc) { dev_err(&intf->dev, "Couldn't get extended bit timing const for channel %d (%pe)\n", channel, ERR_PTR(rc)); goto out_free_candev; } strcpy(dev->data_bt_const.name, KBUILD_MODNAME); dev->data_bt_const.tseg1_min = le32_to_cpu(bt_const_extended.dtseg1_min); dev->data_bt_const.tseg1_max = le32_to_cpu(bt_const_extended.dtseg1_max); dev->data_bt_const.tseg2_min = le32_to_cpu(bt_const_extended.dtseg2_min); dev->data_bt_const.tseg2_max = le32_to_cpu(bt_const_extended.dtseg2_max); dev->data_bt_const.sjw_max = le32_to_cpu(bt_const_extended.dsjw_max); dev->data_bt_const.brp_min = le32_to_cpu(bt_const_extended.dbrp_min); dev->data_bt_const.brp_max = le32_to_cpu(bt_const_extended.dbrp_max); dev->data_bt_const.brp_inc = le32_to_cpu(bt_const_extended.dbrp_inc); dev->can.data_bittiming_const = &dev->data_bt_const; } can_rx_offload_add_manual(netdev, &dev->offload, GS_NAPI_WEIGHT); SET_NETDEV_DEV(netdev, &intf->dev); rc = register_candev(dev->netdev); if (rc) { dev_err(&intf->dev, "Couldn't register candev for channel %d (%pe)\n", channel, ERR_PTR(rc)); goto out_can_rx_offload_del; } return dev; out_can_rx_offload_del: can_rx_offload_del(&dev->offload); out_free_candev: free_candev(dev->netdev); return ERR_PTR(rc); } static void gs_destroy_candev(struct gs_can *dev) { unregister_candev(dev->netdev); can_rx_offload_del(&dev->offload); free_candev(dev->netdev); } static int gs_usb_probe(struct usb_interface *intf, const struct usb_device_id *id) { struct usb_device *udev = interface_to_usbdev(intf); struct gs_host_frame *hf; struct gs_usb *parent; struct gs_host_config hconf = { .byte_order = cpu_to_le32(0x0000beef), }; struct gs_device_config dconf; unsigned int icount, i; int rc; /* send host config */ rc = usb_control_msg_send(udev, 0, GS_USB_BREQ_HOST_FORMAT, USB_DIR_OUT | USB_TYPE_VENDOR | USB_RECIP_INTERFACE, 1, intf->cur_altsetting->desc.bInterfaceNumber, &hconf, sizeof(hconf), 1000, GFP_KERNEL); if (rc) { dev_err(&intf->dev, "Couldn't send data format (err=%d)\n", rc); return rc; } /* read device config */ rc = usb_control_msg_recv(udev, 0, GS_USB_BREQ_DEVICE_CONFIG, USB_DIR_IN | USB_TYPE_VENDOR | USB_RECIP_INTERFACE, 1, intf->cur_altsetting->desc.bInterfaceNumber, &dconf, sizeof(dconf), 1000, GFP_KERNEL); if (rc) { dev_err(&intf->dev, "Couldn't get device config: (err=%d)\n", rc); return rc; } icount = dconf.icount + 1; dev_info(&intf->dev, "Configuring for %u interfaces\n", icount); if (icount > GS_MAX_INTF) { dev_err(&intf->dev, "Driver cannot handle more that %u CAN interfaces\n", GS_MAX_INTF); return -EINVAL; } parent = kzalloc(sizeof(*parent), GFP_KERNEL); if (!parent) return -ENOMEM; init_usb_anchor(&parent->rx_submitted); usb_set_intfdata(intf, parent); parent->udev = udev; for (i = 0; i < icount; i++) { unsigned int hf_size_rx = 0; parent->canch[i] = gs_make_candev(i, intf, &dconf); if (IS_ERR_OR_NULL(parent->canch[i])) { /* save error code to return later */ rc = PTR_ERR(parent->canch[i]); /* on failure destroy previously created candevs */ icount = i; for (i = 0; i < icount; i++) gs_destroy_candev(parent->canch[i]); usb_kill_anchored_urbs(&parent->rx_submitted); kfree(parent); return rc; } parent->canch[i]->parent = parent; /* set RX packet size based on FD and if hardware * timestamps are supported. */ if (parent->canch[i]->can.ctrlmode_supported & CAN_CTRLMODE_FD) { if (parent->canch[i]->feature & GS_CAN_FEATURE_HW_TIMESTAMP) hf_size_rx = struct_size(hf, canfd_ts, 1); else hf_size_rx = struct_size(hf, canfd, 1); } else { if (parent->canch[i]->feature & GS_CAN_FEATURE_HW_TIMESTAMP) hf_size_rx = struct_size(hf, classic_can_ts, 1); else hf_size_rx = struct_size(hf, classic_can, 1); } parent->hf_size_rx = max(parent->hf_size_rx, hf_size_rx); } return 0; } static void gs_usb_disconnect(struct usb_interface *intf) { struct gs_usb *parent = usb_get_intfdata(intf); unsigned int i; usb_set_intfdata(intf, NULL); if (!parent) { dev_err(&intf->dev, "Disconnect (nodata)\n"); return; } for (i = 0; i < GS_MAX_INTF; i++) if (parent->canch[i]) gs_destroy_candev(parent->canch[i]); kfree(parent); } static const struct usb_device_id gs_usb_table[] = { { USB_DEVICE_INTERFACE_NUMBER(USB_GS_USB_1_VENDOR_ID, USB_GS_USB_1_PRODUCT_ID, 0) }, { USB_DEVICE_INTERFACE_NUMBER(USB_CANDLELIGHT_VENDOR_ID, USB_CANDLELIGHT_PRODUCT_ID, 0) }, { USB_DEVICE_INTERFACE_NUMBER(USB_CES_CANEXT_FD_VENDOR_ID, USB_CES_CANEXT_FD_PRODUCT_ID, 0) }, { USB_DEVICE_INTERFACE_NUMBER(USB_ABE_CANDEBUGGER_FD_VENDOR_ID, USB_ABE_CANDEBUGGER_FD_PRODUCT_ID, 0) }, {} /* Terminating entry */ }; MODULE_DEVICE_TABLE(usb, gs_usb_table); static struct usb_driver gs_usb_driver = { .name = KBUILD_MODNAME, .probe = gs_usb_probe, .disconnect = gs_usb_disconnect, .id_table = gs_usb_table, }; module_usb_driver(gs_usb_driver); MODULE_AUTHOR("Maximilian Schneider <mws@schneidersoft.net>"); MODULE_DESCRIPTION( "Socket CAN device driver for Geschwister Schneider Technologie-, " "Entwicklungs- und Vertriebs UG. USB2.0 to CAN interfaces\n" "and bytewerk.org candleLight USB CAN interfaces."); MODULE_LICENSE("GPL v2"); |
5 48 48 48 39 35 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 | /* * include/linux/ktime.h * * ktime_t - nanosecond-resolution time format. * * Copyright(C) 2005, Thomas Gleixner <tglx@linutronix.de> * Copyright(C) 2005, Red Hat, Inc., Ingo Molnar * * data type definitions, declarations, prototypes and macros. * * Started by: Thomas Gleixner and Ingo Molnar * * Credits: * * Roman Zippel provided the ideas and primary code snippets of * the ktime_t union and further simplifications of the original * code. * * For licencing details see kernel-base/COPYING */ #ifndef _LINUX_KTIME_H #define _LINUX_KTIME_H #include <asm/bug.h> #include <linux/jiffies.h> #include <linux/time.h> #include <linux/types.h> /** * ktime_set - Set a ktime_t variable from a seconds/nanoseconds value * @secs: seconds to set * @nsecs: nanoseconds to set * * Return: The ktime_t representation of the value. */ static inline ktime_t ktime_set(const s64 secs, const unsigned long nsecs) { if (unlikely(secs >= KTIME_SEC_MAX)) return KTIME_MAX; return secs * NSEC_PER_SEC + (s64)nsecs; } /* Subtract two ktime_t variables. rem = lhs -rhs: */ #define ktime_sub(lhs, rhs) ((lhs) - (rhs)) /* Add two ktime_t variables. res = lhs + rhs: */ #define ktime_add(lhs, rhs) ((lhs) + (rhs)) /* * Same as ktime_add(), but avoids undefined behaviour on overflow; however, * this means that you must check the result for overflow yourself. */ #define ktime_add_unsafe(lhs, rhs) ((u64) (lhs) + (rhs)) /* * Add a ktime_t variable and a scalar nanosecond value. * res = kt + nsval: */ #define ktime_add_ns(kt, nsval) ((kt) + (nsval)) /* * Subtract a scalar nanosecod from a ktime_t variable * res = kt - nsval: */ #define ktime_sub_ns(kt, nsval) ((kt) - (nsval)) /* convert a timespec64 to ktime_t format: */ static inline ktime_t timespec64_to_ktime(struct timespec64 ts) { return ktime_set(ts.tv_sec, ts.tv_nsec); } /* Map the ktime_t to timespec conversion to ns_to_timespec function */ #define ktime_to_timespec64(kt) ns_to_timespec64((kt)) /* Convert ktime_t to nanoseconds */ static inline s64 ktime_to_ns(const ktime_t kt) { return kt; } /** * ktime_compare - Compares two ktime_t variables for less, greater or equal * @cmp1: comparable1 * @cmp2: comparable2 * * Return: ... * cmp1 < cmp2: return <0 * cmp1 == cmp2: return 0 * cmp1 > cmp2: return >0 */ static inline int ktime_compare(const ktime_t cmp1, const ktime_t cmp2) { if (cmp1 < cmp2) return -1; if (cmp1 > cmp2) return 1; return 0; } /** * ktime_after - Compare if a ktime_t value is bigger than another one. * @cmp1: comparable1 * @cmp2: comparable2 * * Return: true if cmp1 happened after cmp2. */ static inline bool ktime_after(const ktime_t cmp1, const ktime_t cmp2) { return ktime_compare(cmp1, cmp2) > 0; } /** * ktime_before - Compare if a ktime_t value is smaller than another one. * @cmp1: comparable1 * @cmp2: comparable2 * * Return: true if cmp1 happened before cmp2. */ static inline bool ktime_before(const ktime_t cmp1, const ktime_t cmp2) { return ktime_compare(cmp1, cmp2) < 0; } #if BITS_PER_LONG < 64 extern s64 __ktime_divns(const ktime_t kt, s64 div); static inline s64 ktime_divns(const ktime_t kt, s64 div) { /* * Negative divisors could cause an inf loop, * so bug out here. */ BUG_ON(div < 0); if (__builtin_constant_p(div) && !(div >> 32)) { s64 ns = kt; u64 tmp = ns < 0 ? -ns : ns; do_div(tmp, div); return ns < 0 ? -tmp : tmp; } else { return __ktime_divns(kt, div); } } #else /* BITS_PER_LONG < 64 */ static inline s64 ktime_divns(const ktime_t kt, s64 div) { /* * 32-bit implementation cannot handle negative divisors, * so catch them on 64bit as well. */ WARN_ON(div < 0); return kt / div; } #endif static inline s64 ktime_to_us(const ktime_t kt) { return ktime_divns(kt, NSEC_PER_USEC); } static inline s64 ktime_to_ms(const ktime_t kt) { return ktime_divns(kt, NSEC_PER_MSEC); } static inline s64 ktime_us_delta(const ktime_t later, const ktime_t earlier) { return ktime_to_us(ktime_sub(later, earlier)); } static inline s64 ktime_ms_delta(const ktime_t later, const ktime_t earlier) { return ktime_to_ms(ktime_sub(later, earlier)); } static inline ktime_t ktime_add_us(const ktime_t kt, const u64 usec) { return ktime_add_ns(kt, usec * NSEC_PER_USEC); } static inline ktime_t ktime_add_ms(const ktime_t kt, const u64 msec) { return ktime_add_ns(kt, msec * NSEC_PER_MSEC); } static inline ktime_t ktime_sub_us(const ktime_t kt, const u64 usec) { return ktime_sub_ns(kt, usec * NSEC_PER_USEC); } static inline ktime_t ktime_sub_ms(const ktime_t kt, const u64 msec) { return ktime_sub_ns(kt, msec * NSEC_PER_MSEC); } extern ktime_t ktime_add_safe(const ktime_t lhs, const ktime_t rhs); /** * ktime_to_timespec64_cond - convert a ktime_t variable to timespec64 * format only if the variable contains data * @kt: the ktime_t variable to convert * @ts: the timespec variable to store the result in * * Return: %true if there was a successful conversion, %false if kt was 0. */ static inline __must_check bool ktime_to_timespec64_cond(const ktime_t kt, struct timespec64 *ts) { if (kt) { *ts = ktime_to_timespec64(kt); return true; } else { return false; } } #include <vdso/ktime.h> static inline ktime_t ns_to_ktime(u64 ns) { return ns; } static inline ktime_t ms_to_ktime(u64 ms) { return ms * NSEC_PER_MSEC; } # include <linux/timekeeping.h> #endif |
311 184 1638 355 3 3 6 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_RCULIST_NULLS_H #define _LINUX_RCULIST_NULLS_H #ifdef __KERNEL__ /* * RCU-protected list version */ #include <linux/list_nulls.h> #include <linux/rcupdate.h> /** * hlist_nulls_del_init_rcu - deletes entry from hash list with re-initialization * @n: the element to delete from the hash list. * * Note: hlist_nulls_unhashed() on the node return true after this. It is * useful for RCU based read lockfree traversal if the writer side * must know if the list entry is still hashed or already unhashed. * * In particular, it means that we can not poison the forward pointers * that may still be used for walking the hash list and we can only * zero the pprev pointer so list_unhashed() will return true after * this. * * The caller must take whatever precautions are necessary (such as * holding appropriate locks) to avoid racing with another * list-mutation primitive, such as hlist_nulls_add_head_rcu() or * hlist_nulls_del_rcu(), running on this same list. However, it is * perfectly legal to run concurrently with the _rcu list-traversal * primitives, such as hlist_nulls_for_each_entry_rcu(). */ static inline void hlist_nulls_del_init_rcu(struct hlist_nulls_node *n) { if (!hlist_nulls_unhashed(n)) { __hlist_nulls_del(n); WRITE_ONCE(n->pprev, NULL); } } /** * hlist_nulls_first_rcu - returns the first element of the hash list. * @head: the head of the list. */ #define hlist_nulls_first_rcu(head) \ (*((struct hlist_nulls_node __rcu __force **)&(head)->first)) /** * hlist_nulls_next_rcu - returns the element of the list after @node. * @node: element of the list. */ #define hlist_nulls_next_rcu(node) \ (*((struct hlist_nulls_node __rcu __force **)&(node)->next)) /** * hlist_nulls_del_rcu - deletes entry from hash list without re-initialization * @n: the element to delete from the hash list. * * Note: hlist_nulls_unhashed() on entry does not return true after this, * the entry is in an undefined state. It is useful for RCU based * lockfree traversal. * * In particular, it means that we can not poison the forward * pointers that may still be used for walking the hash list. * * The caller must take whatever precautions are necessary * (such as holding appropriate locks) to avoid racing * with another list-mutation primitive, such as hlist_nulls_add_head_rcu() * or hlist_nulls_del_rcu(), running on this same list. * However, it is perfectly legal to run concurrently with * the _rcu list-traversal primitives, such as * hlist_nulls_for_each_entry(). */ static inline void hlist_nulls_del_rcu(struct hlist_nulls_node *n) { __hlist_nulls_del(n); WRITE_ONCE(n->pprev, LIST_POISON2); } /** * hlist_nulls_add_head_rcu * @n: the element to add to the hash list. * @h: the list to add to. * * Description: * Adds the specified element to the specified hlist_nulls, * while permitting racing traversals. * * The caller must take whatever precautions are necessary * (such as holding appropriate locks) to avoid racing * with another list-mutation primitive, such as hlist_nulls_add_head_rcu() * or hlist_nulls_del_rcu(), running on this same list. * However, it is perfectly legal to run concurrently with * the _rcu list-traversal primitives, such as * hlist_nulls_for_each_entry_rcu(), used to prevent memory-consistency * problems on Alpha CPUs. Regardless of the type of CPU, the * list-traversal primitive must be guarded by rcu_read_lock(). */ static inline void hlist_nulls_add_head_rcu(struct hlist_nulls_node *n, struct hlist_nulls_head *h) { struct hlist_nulls_node *first = h->first; WRITE_ONCE(n->next, first); WRITE_ONCE(n->pprev, &h->first); rcu_assign_pointer(hlist_nulls_first_rcu(h), n); if (!is_a_nulls(first)) WRITE_ONCE(first->pprev, &n->next); } /** * hlist_nulls_add_tail_rcu * @n: the element to add to the hash list. * @h: the list to add to. * * Description: * Adds the specified element to the specified hlist_nulls, * while permitting racing traversals. * * The caller must take whatever precautions are necessary * (such as holding appropriate locks) to avoid racing * with another list-mutation primitive, such as hlist_nulls_add_head_rcu() * or hlist_nulls_del_rcu(), running on this same list. * However, it is perfectly legal to run concurrently with * the _rcu list-traversal primitives, such as * hlist_nulls_for_each_entry_rcu(), used to prevent memory-consistency * problems on Alpha CPUs. Regardless of the type of CPU, the * list-traversal primitive must be guarded by rcu_read_lock(). */ static inline void hlist_nulls_add_tail_rcu(struct hlist_nulls_node *n, struct hlist_nulls_head *h) { struct hlist_nulls_node *i, *last = NULL; /* Note: write side code, so rcu accessors are not needed. */ for (i = h->first; !is_a_nulls(i); i = i->next) last = i; if (last) { WRITE_ONCE(n->next, last->next); n->pprev = &last->next; rcu_assign_pointer(hlist_nulls_next_rcu(last), n); } else { hlist_nulls_add_head_rcu(n, h); } } /* after that hlist_nulls_del will work */ static inline void hlist_nulls_add_fake(struct hlist_nulls_node *n) { n->pprev = &n->next; n->next = (struct hlist_nulls_node *)NULLS_MARKER(NULL); } /** * hlist_nulls_for_each_entry_rcu - iterate over rcu list of given type * @tpos: the type * to use as a loop cursor. * @pos: the &struct hlist_nulls_node to use as a loop cursor. * @head: the head of the list. * @member: the name of the hlist_nulls_node within the struct. * * The barrier() is needed to make sure compiler doesn't cache first element [1], * as this loop can be restarted [2] * [1] Documentation/memory-barriers.txt around line 1533 * [2] Documentation/RCU/rculist_nulls.rst around line 146 */ #define hlist_nulls_for_each_entry_rcu(tpos, pos, head, member) \ for (({barrier();}), \ pos = rcu_dereference_raw(hlist_nulls_first_rcu(head)); \ (!is_a_nulls(pos)) && \ ({ tpos = hlist_nulls_entry(pos, typeof(*tpos), member); 1; }); \ pos = rcu_dereference_raw(hlist_nulls_next_rcu(pos))) /** * hlist_nulls_for_each_entry_safe - * iterate over list of given type safe against removal of list entry * @tpos: the type * to use as a loop cursor. * @pos: the &struct hlist_nulls_node to use as a loop cursor. * @head: the head of the list. * @member: the name of the hlist_nulls_node within the struct. */ #define hlist_nulls_for_each_entry_safe(tpos, pos, head, member) \ for (({barrier();}), \ pos = rcu_dereference_raw(hlist_nulls_first_rcu(head)); \ (!is_a_nulls(pos)) && \ ({ tpos = hlist_nulls_entry(pos, typeof(*tpos), member); \ pos = rcu_dereference_raw(hlist_nulls_next_rcu(pos)); 1; });) #endif #endif |
2 2 11 11 11 11 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 | // SPDX-License-Identifier: GPL-2.0-or-later /* PKCS#8 Private Key parser [RFC 5208]. * * Copyright (C) 2016 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) */ #define pr_fmt(fmt) "PKCS8: "fmt #include <linux/module.h> #include <linux/kernel.h> #include <linux/export.h> #include <linux/slab.h> #include <linux/err.h> #include <linux/oid_registry.h> #include <keys/asymmetric-subtype.h> #include <keys/asymmetric-parser.h> #include <crypto/public_key.h> #include "pkcs8.asn1.h" struct pkcs8_parse_context { struct public_key *pub; unsigned long data; /* Start of data */ enum OID last_oid; /* Last OID encountered */ enum OID algo_oid; /* Algorithm OID */ u32 key_size; const void *key; }; /* * Note an OID when we find one for later processing when we know how to * interpret it. */ int pkcs8_note_OID(void *context, size_t hdrlen, unsigned char tag, const void *value, size_t vlen) { struct pkcs8_parse_context *ctx = context; ctx->last_oid = look_up_OID(value, vlen); if (ctx->last_oid == OID__NR) { char buffer[50]; sprint_oid(value, vlen, buffer, sizeof(buffer)); pr_info("Unknown OID: [%lu] %s\n", (unsigned long)value - ctx->data, buffer); } return 0; } /* * Note the version number of the ASN.1 blob. */ int pkcs8_note_version(void *context, size_t hdrlen, unsigned char tag, const void *value, size_t vlen) { if (vlen != 1 || ((const u8 *)value)[0] != 0) { pr_warn("Unsupported PKCS#8 version\n"); return -EBADMSG; } return 0; } /* * Note the public algorithm. */ int pkcs8_note_algo(void *context, size_t hdrlen, unsigned char tag, const void *value, size_t vlen) { struct pkcs8_parse_context *ctx = context; if (ctx->last_oid != OID_rsaEncryption) return -ENOPKG; ctx->pub->pkey_algo = "rsa"; return 0; } /* * Note the key data of the ASN.1 blob. */ int pkcs8_note_key(void *context, size_t hdrlen, unsigned char tag, const void *value, size_t vlen) { struct pkcs8_parse_context *ctx = context; ctx->key = value; ctx->key_size = vlen; return 0; } /* * Parse a PKCS#8 private key blob. */ static struct public_key *pkcs8_parse(const void *data, size_t datalen) { struct pkcs8_parse_context ctx; struct public_key *pub; long ret; memset(&ctx, 0, sizeof(ctx)); ret = -ENOMEM; ctx.pub = kzalloc(sizeof(struct public_key), GFP_KERNEL); if (!ctx.pub) goto error; ctx.data = (unsigned long)data; /* Attempt to decode the private key */ ret = asn1_ber_decoder(&pkcs8_decoder, &ctx, data, datalen); if (ret < 0) goto error_decode; ret = -ENOMEM; pub = ctx.pub; pub->key = kmemdup(ctx.key, ctx.key_size, GFP_KERNEL); if (!pub->key) goto error_decode; pub->keylen = ctx.key_size; pub->key_is_private = true; return pub; error_decode: kfree(ctx.pub); error: return ERR_PTR(ret); } /* * Attempt to parse a data blob for a key as a PKCS#8 private key. */ static int pkcs8_key_preparse(struct key_preparsed_payload *prep) { struct public_key *pub; pub = pkcs8_parse(prep->data, prep->datalen); if (IS_ERR(pub)) return PTR_ERR(pub); pr_devel("Cert Key Algo: %s\n", pub->pkey_algo); pub->id_type = "PKCS8"; /* We're pinning the module by being linked against it */ __module_get(public_key_subtype.owner); prep->payload.data[asym_subtype] = &public_key_subtype; prep->payload.data[asym_key_ids] = NULL; prep->payload.data[asym_crypto] = pub; prep->payload.data[asym_auth] = NULL; prep->quotalen = 100; return 0; } static struct asymmetric_key_parser pkcs8_key_parser = { .owner = THIS_MODULE, .name = "pkcs8", .parse = pkcs8_key_preparse, }; /* * Module stuff */ static int __init pkcs8_key_init(void) { return register_asymmetric_key_parser(&pkcs8_key_parser); } static void __exit pkcs8_key_exit(void) { unregister_asymmetric_key_parser(&pkcs8_key_parser); } module_init(pkcs8_key_init); module_exit(pkcs8_key_exit); MODULE_DESCRIPTION("PKCS#8 certificate parser"); MODULE_LICENSE("GPL"); |
1401 2730 79 1191 2547 5 1074 6605 19468 4948 79 133 79 6 51 288 288 59 97 324 597 16 19915 5 20 3 971 13 5 20 13 28 14 52 16 2692 62 2183 265 3013 273 7 25 81 143 66 105 63 8 5 390 19 2 44 15 13 68 25 174 241 1 4628 541 105 81 32 80 792 46 735 749 748 649 67 19835 10328 7 28 97 14555 14548 7017 136 1 68 7042 10 444 6 12393 2384 234 689 35 27 30 426 588 100 957 142 347 2193 2162 36 58 4 549 2 3 33 34 574 1858 4954 12442 6 1826 1490 8 1461 638 3 371 217 217 658 4 73 148 79 2 78 1 300 2 189 10 163 1 2 3 519 333 261 9 210 24 312 10 69 9 2 11 13 11 522 1 1 2 2 45 13068 87 153 406 347 631 9934 530 1 770 617 633 1419 24 1 293 675 21 1 18 22 6 3 3 6 8 280 23 36 15 209 12 2 19683 66 12574 59 167 169 2 12 10805 220 44 50 904 12608 146 146 10388 4 4 79 98 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 4716 4717 4718 4719 4720 4721 4722 4723 4724 4725 4726 4727 4728 4729 4730 4731 4732 4733 4734 4735 4736 4737 4738 4739 4740 4741 4742 4743 4744 4745 4746 4747 4748 4749 4750 4751 4752 4753 4754 4755 4756 4757 4758 4759 4760 4761 4762 4763 4764 4765 4766 4767 4768 4769 4770 4771 4772 4773 4774 4775 4776 4777 4778 4779 4780 4781 4782 4783 4784 4785 4786 4787 4788 4789 4790 4791 4792 4793 4794 4795 4796 4797 4798 4799 4800 4801 4802 4803 4804 4805 4806 4807 4808 4809 4810 4811 4812 4813 4814 4815 4816 4817 4818 4819 4820 4821 4822 4823 4824 4825 4826 4827 4828 4829 4830 4831 4832 4833 4834 4835 4836 4837 4838 4839 4840 4841 4842 4843 4844 4845 4846 4847 4848 4849 4850 4851 4852 4853 4854 4855 4856 4857 4858 4859 4860 4861 4862 4863 4864 4865 4866 4867 4868 4869 4870 4871 4872 4873 4874 4875 4876 4877 4878 4879 4880 4881 4882 4883 4884 4885 4886 4887 4888 4889 4890 4891 4892 4893 4894 4895 4896 4897 4898 4899 4900 4901 4902 4903 4904 4905 4906 4907 4908 4909 4910 4911 4912 4913 4914 4915 4916 4917 4918 4919 4920 4921 4922 4923 4924 4925 4926 4927 4928 4929 4930 4931 4932 4933 4934 4935 4936 4937 4938 4939 4940 4941 4942 4943 4944 4945 4946 4947 4948 4949 4950 4951 4952 4953 4954 4955 4956 4957 4958 4959 4960 4961 4962 4963 4964 4965 4966 4967 4968 4969 4970 4971 4972 4973 4974 4975 4976 4977 4978 4979 4980 4981 4982 4983 4984 4985 4986 4987 4988 4989 4990 4991 4992 4993 4994 4995 4996 4997 4998 4999 5000 5001 5002 5003 5004 5005 5006 5007 5008 5009 5010 5011 5012 5013 5014 5015 5016 5017 5018 5019 5020 5021 5022 5023 5024 5025 5026 5027 5028 5029 5030 5031 5032 5033 5034 5035 5036 5037 5038 5039 5040 5041 5042 5043 5044 5045 5046 5047 5048 5049 5050 5051 5052 5053 5054 5055 5056 5057 5058 5059 5060 5061 5062 5063 5064 5065 5066 5067 5068 5069 5070 5071 5072 5073 5074 5075 5076 5077 5078 5079 5080 5081 5082 5083 5084 | /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * Definitions for the 'struct sk_buff' memory handlers. * * Authors: * Alan Cox, <gw4pts@gw4pts.ampr.org> * Florian La Roche, <rzsfl@rz.uni-sb.de> */ #ifndef _LINUX_SKBUFF_H #define _LINUX_SKBUFF_H #include <linux/kernel.h> #include <linux/compiler.h> #include <linux/time.h> #include <linux/bug.h> #include <linux/bvec.h> #include <linux/cache.h> #include <linux/rbtree.h> #include <linux/socket.h> #include <linux/refcount.h> #include <linux/atomic.h> #include <asm/types.h> #include <linux/spinlock.h> #include <net/checksum.h> #include <linux/rcupdate.h> #include <linux/dma-mapping.h> #include <linux/netdev_features.h> #include <net/flow_dissector.h> #include <linux/in6.h> #include <linux/if_packet.h> #include <linux/llist.h> #include <net/flow.h> #if IS_ENABLED(CONFIG_NF_CONNTRACK) #include <linux/netfilter/nf_conntrack_common.h> #endif #include <net/net_debug.h> #include <net/dropreason-core.h> /** * DOC: skb checksums * * The interface for checksum offload between the stack and networking drivers * is as follows... * * IP checksum related features * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~ * * Drivers advertise checksum offload capabilities in the features of a device. * From the stack's point of view these are capabilities offered by the driver. * A driver typically only advertises features that it is capable of offloading * to its device. * * .. flat-table:: Checksum related device features * :widths: 1 10 * * * - %NETIF_F_HW_CSUM * - The driver (or its device) is able to compute one * IP (one's complement) checksum for any combination * of protocols or protocol layering. The checksum is * computed and set in a packet per the CHECKSUM_PARTIAL * interface (see below). * * * - %NETIF_F_IP_CSUM * - Driver (device) is only able to checksum plain * TCP or UDP packets over IPv4. These are specifically * unencapsulated packets of the form IPv4|TCP or * IPv4|UDP where the Protocol field in the IPv4 header * is TCP or UDP. The IPv4 header may contain IP options. * This feature cannot be set in features for a device * with NETIF_F_HW_CSUM also set. This feature is being * DEPRECATED (see below). * * * - %NETIF_F_IPV6_CSUM * - Driver (device) is only able to checksum plain * TCP or UDP packets over IPv6. These are specifically * unencapsulated packets of the form IPv6|TCP or * IPv6|UDP where the Next Header field in the IPv6 * header is either TCP or UDP. IPv6 extension headers * are not supported with this feature. This feature * cannot be set in features for a device with * NETIF_F_HW_CSUM also set. This feature is being * DEPRECATED (see below). * * * - %NETIF_F_RXCSUM * - Driver (device) performs receive checksum offload. * This flag is only used to disable the RX checksum * feature for a device. The stack will accept receive * checksum indication in packets received on a device * regardless of whether NETIF_F_RXCSUM is set. * * Checksumming of received packets by device * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ * * Indication of checksum verification is set in &sk_buff.ip_summed. * Possible values are: * * - %CHECKSUM_NONE * * Device did not checksum this packet e.g. due to lack of capabilities. * The packet contains full (though not verified) checksum in packet but * not in skb->csum. Thus, skb->csum is undefined in this case. * * - %CHECKSUM_UNNECESSARY * * The hardware you're dealing with doesn't calculate the full checksum * (as in %CHECKSUM_COMPLETE), but it does parse headers and verify checksums * for specific protocols. For such packets it will set %CHECKSUM_UNNECESSARY * if their checksums are okay. &sk_buff.csum is still undefined in this case * though. A driver or device must never modify the checksum field in the * packet even if checksum is verified. * * %CHECKSUM_UNNECESSARY is applicable to following protocols: * * - TCP: IPv6 and IPv4. * - UDP: IPv4 and IPv6. A device may apply CHECKSUM_UNNECESSARY to a * zero UDP checksum for either IPv4 or IPv6, the networking stack * may perform further validation in this case. * - GRE: only if the checksum is present in the header. * - SCTP: indicates the CRC in SCTP header has been validated. * - FCOE: indicates the CRC in FC frame has been validated. * * &sk_buff.csum_level indicates the number of consecutive checksums found in * the packet minus one that have been verified as %CHECKSUM_UNNECESSARY. * For instance if a device receives an IPv6->UDP->GRE->IPv4->TCP packet * and a device is able to verify the checksums for UDP (possibly zero), * GRE (checksum flag is set) and TCP, &sk_buff.csum_level would be set to * two. If the device were only able to verify the UDP checksum and not * GRE, either because it doesn't support GRE checksum or because GRE * checksum is bad, skb->csum_level would be set to zero (TCP checksum is * not considered in this case). * * - %CHECKSUM_COMPLETE * * This is the most generic way. The device supplied checksum of the _whole_ * packet as seen by netif_rx() and fills in &sk_buff.csum. This means the * hardware doesn't need to parse L3/L4 headers to implement this. * * Notes: * * - Even if device supports only some protocols, but is able to produce * skb->csum, it MUST use CHECKSUM_COMPLETE, not CHECKSUM_UNNECESSARY. * - CHECKSUM_COMPLETE is not applicable to SCTP and FCoE protocols. * * - %CHECKSUM_PARTIAL * * A checksum is set up to be offloaded to a device as described in the * output description for CHECKSUM_PARTIAL. This may occur on a packet * received directly from another Linux OS, e.g., a virtualized Linux kernel * on the same host, or it may be set in the input path in GRO or remote * checksum offload. For the purposes of checksum verification, the checksum * referred to by skb->csum_start + skb->csum_offset and any preceding * checksums in the packet are considered verified. Any checksums in the * packet that are after the checksum being offloaded are not considered to * be verified. * * Checksumming on transmit for non-GSO * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ * * The stack requests checksum offload in the &sk_buff.ip_summed for a packet. * Values are: * * - %CHECKSUM_PARTIAL * * The driver is required to checksum the packet as seen by hard_start_xmit() * from &sk_buff.csum_start up to the end, and to record/write the checksum at * offset &sk_buff.csum_start + &sk_buff.csum_offset. * A driver may verify that the * csum_start and csum_offset values are valid values given the length and * offset of the packet, but it should not attempt to validate that the * checksum refers to a legitimate transport layer checksum -- it is the * purview of the stack to validate that csum_start and csum_offset are set * correctly. * * When the stack requests checksum offload for a packet, the driver MUST * ensure that the checksum is set correctly. A driver can either offload the * checksum calculation to the device, or call skb_checksum_help (in the case * that the device does not support offload for a particular checksum). * * %NETIF_F_IP_CSUM and %NETIF_F_IPV6_CSUM are being deprecated in favor of * %NETIF_F_HW_CSUM. New devices should use %NETIF_F_HW_CSUM to indicate * checksum offload capability. * skb_csum_hwoffload_help() can be called to resolve %CHECKSUM_PARTIAL based * on network device checksumming capabilities: if a packet does not match * them, skb_checksum_help() or skb_crc32c_help() (depending on the value of * &sk_buff.csum_not_inet, see :ref:`crc`) * is called to resolve the checksum. * * - %CHECKSUM_NONE * * The skb was already checksummed by the protocol, or a checksum is not * required. * * - %CHECKSUM_UNNECESSARY * * This has the same meaning as CHECKSUM_NONE for checksum offload on * output. * * - %CHECKSUM_COMPLETE * * Not used in checksum output. If a driver observes a packet with this value * set in skbuff, it should treat the packet as if %CHECKSUM_NONE were set. * * .. _crc: * * Non-IP checksum (CRC) offloads * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ * * .. flat-table:: * :widths: 1 10 * * * - %NETIF_F_SCTP_CRC * - This feature indicates that a device is capable of * offloading the SCTP CRC in a packet. To perform this offload the stack * will set csum_start and csum_offset accordingly, set ip_summed to * %CHECKSUM_PARTIAL and set csum_not_inet to 1, to provide an indication * in the skbuff that the %CHECKSUM_PARTIAL refers to CRC32c. * A driver that supports both IP checksum offload and SCTP CRC32c offload * must verify which offload is configured for a packet by testing the * value of &sk_buff.csum_not_inet; skb_crc32c_csum_help() is provided to * resolve %CHECKSUM_PARTIAL on skbs where csum_not_inet is set to 1. * * * - %NETIF_F_FCOE_CRC * - This feature indicates that a device is capable of offloading the FCOE * CRC in a packet. To perform this offload the stack will set ip_summed * to %CHECKSUM_PARTIAL and set csum_start and csum_offset * accordingly. Note that there is no indication in the skbuff that the * %CHECKSUM_PARTIAL refers to an FCOE checksum, so a driver that supports * both IP checksum offload and FCOE CRC offload must verify which offload * is configured for a packet, presumably by inspecting packet headers. * * Checksumming on output with GSO * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ * * In the case of a GSO packet (skb_is_gso() is true), checksum offload * is implied by the SKB_GSO_* flags in gso_type. Most obviously, if the * gso_type is %SKB_GSO_TCPV4 or %SKB_GSO_TCPV6, TCP checksum offload as * part of the GSO operation is implied. If a checksum is being offloaded * with GSO then ip_summed is %CHECKSUM_PARTIAL, and both csum_start and * csum_offset are set to refer to the outermost checksum being offloaded * (two offloaded checksums are possible with UDP encapsulation). */ /* Don't change this without changing skb_csum_unnecessary! */ #define CHECKSUM_NONE 0 #define CHECKSUM_UNNECESSARY 1 #define CHECKSUM_COMPLETE 2 #define CHECKSUM_PARTIAL 3 /* Maximum value in skb->csum_level */ #define SKB_MAX_CSUM_LEVEL 3 #define SKB_DATA_ALIGN(X) ALIGN(X, SMP_CACHE_BYTES) #define SKB_WITH_OVERHEAD(X) \ ((X) - SKB_DATA_ALIGN(sizeof(struct skb_shared_info))) /* For X bytes available in skb->head, what is the minimal * allocation needed, knowing struct skb_shared_info needs * to be aligned. */ #define SKB_HEAD_ALIGN(X) (SKB_DATA_ALIGN(X) + \ SKB_DATA_ALIGN(sizeof(struct skb_shared_info))) #define SKB_MAX_ORDER(X, ORDER) \ SKB_WITH_OVERHEAD((PAGE_SIZE << (ORDER)) - (X)) #define SKB_MAX_HEAD(X) (SKB_MAX_ORDER((X), 0)) #define SKB_MAX_ALLOC (SKB_MAX_ORDER(0, 2)) /* return minimum truesize of one skb containing X bytes of data */ #define SKB_TRUESIZE(X) ((X) + \ SKB_DATA_ALIGN(sizeof(struct sk_buff)) + \ SKB_DATA_ALIGN(sizeof(struct skb_shared_info))) struct ahash_request; struct net_device; struct scatterlist; struct pipe_inode_info; struct iov_iter; struct napi_struct; struct bpf_prog; union bpf_attr; struct skb_ext; struct ts_config; #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) struct nf_bridge_info { enum { BRNF_PROTO_UNCHANGED, BRNF_PROTO_8021Q, BRNF_PROTO_PPPOE } orig_proto:8; u8 pkt_otherhost:1; u8 in_prerouting:1; u8 bridged_dnat:1; u8 sabotage_in_done:1; __u16 frag_max_size; int physinif; /* always valid & non-NULL from FORWARD on, for physdev match */ struct net_device *physoutdev; union { /* prerouting: detect dnat in orig/reply direction */ __be32 ipv4_daddr; struct in6_addr ipv6_daddr; /* after prerouting + nat detected: store original source * mac since neigh resolution overwrites it, only used while * skb is out in neigh layer. */ char neigh_header[8]; }; }; #endif #if IS_ENABLED(CONFIG_NET_TC_SKB_EXT) /* Chain in tc_skb_ext will be used to share the tc chain with * ovs recirc_id. It will be set to the current chain by tc * and read by ovs to recirc_id. */ struct tc_skb_ext { union { u64 act_miss_cookie; __u32 chain; }; __u16 mru; __u16 zone; u8 post_ct:1; u8 post_ct_snat:1; u8 post_ct_dnat:1; u8 act_miss:1; /* Set if act_miss_cookie is used */ u8 l2_miss:1; /* Set by bridge upon FDB or MDB miss */ }; #endif struct sk_buff_head { /* These two members must be first to match sk_buff. */ struct_group_tagged(sk_buff_list, list, struct sk_buff *next; struct sk_buff *prev; ); __u32 qlen; spinlock_t lock; }; struct sk_buff; #ifndef CONFIG_MAX_SKB_FRAGS # define CONFIG_MAX_SKB_FRAGS 17 #endif #define MAX_SKB_FRAGS CONFIG_MAX_SKB_FRAGS extern int sysctl_max_skb_frags; /* Set skb_shinfo(skb)->gso_size to this in case you want skb_segment to * segment using its current segmentation instead. */ #define GSO_BY_FRAGS 0xFFFF typedef struct bio_vec skb_frag_t; /** * skb_frag_size() - Returns the size of a skb fragment * @frag: skb fragment */ static inline unsigned int skb_frag_size(const skb_frag_t *frag) { return frag->bv_len; } /** * skb_frag_size_set() - Sets the size of a skb fragment * @frag: skb fragment * @size: size of fragment */ static inline void skb_frag_size_set(skb_frag_t *frag, unsigned int size) { frag->bv_len = size; } /** * skb_frag_size_add() - Increments the size of a skb fragment by @delta * @frag: skb fragment * @delta: value to add */ static inline void skb_frag_size_add(skb_frag_t *frag, int delta) { frag->bv_len += delta; } /** * skb_frag_size_sub() - Decrements the size of a skb fragment by @delta * @frag: skb fragment * @delta: value to subtract */ static inline void skb_frag_size_sub(skb_frag_t *frag, int delta) { frag->bv_len -= delta; } /** * skb_frag_must_loop - Test if %p is a high memory page * @p: fragment's page */ static inline bool skb_frag_must_loop(struct page *p) { #if defined(CONFIG_HIGHMEM) if (IS_ENABLED(CONFIG_DEBUG_KMAP_LOCAL_FORCE_MAP) || PageHighMem(p)) return true; #endif return false; } /** * skb_frag_foreach_page - loop over pages in a fragment * * @f: skb frag to operate on * @f_off: offset from start of f->bv_page * @f_len: length from f_off to loop over * @p: (temp var) current page * @p_off: (temp var) offset from start of current page, * non-zero only on first page. * @p_len: (temp var) length in current page, * < PAGE_SIZE only on first and last page. * @copied: (temp var) length so far, excluding current p_len. * * A fragment can hold a compound page, in which case per-page * operations, notably kmap_atomic, must be called for each * regular page. */ #define skb_frag_foreach_page(f, f_off, f_len, p, p_off, p_len, copied) \ for (p = skb_frag_page(f) + ((f_off) >> PAGE_SHIFT), \ p_off = (f_off) & (PAGE_SIZE - 1), \ p_len = skb_frag_must_loop(p) ? \ min_t(u32, f_len, PAGE_SIZE - p_off) : f_len, \ copied = 0; \ copied < f_len; \ copied += p_len, p++, p_off = 0, \ p_len = min_t(u32, f_len - copied, PAGE_SIZE)) \ /** * struct skb_shared_hwtstamps - hardware time stamps * @hwtstamp: hardware time stamp transformed into duration * since arbitrary point in time * @netdev_data: address/cookie of network device driver used as * reference to actual hardware time stamp * * Software time stamps generated by ktime_get_real() are stored in * skb->tstamp. * * hwtstamps can only be compared against other hwtstamps from * the same device. * * This structure is attached to packets as part of the * &skb_shared_info. Use skb_hwtstamps() to get a pointer. */ struct skb_shared_hwtstamps { union { ktime_t hwtstamp; void *netdev_data; }; }; /* Definitions for tx_flags in struct skb_shared_info */ enum { /* generate hardware time stamp */ SKBTX_HW_TSTAMP = 1 << 0, /* generate software time stamp when queueing packet to NIC */ SKBTX_SW_TSTAMP = 1 << 1, /* device driver is going to provide hardware time stamp */ SKBTX_IN_PROGRESS = 1 << 2, /* generate hardware time stamp based on cycles if supported */ SKBTX_HW_TSTAMP_USE_CYCLES = 1 << 3, /* generate wifi status information (where possible) */ SKBTX_WIFI_STATUS = 1 << 4, /* determine hardware time stamp based on time or cycles */ SKBTX_HW_TSTAMP_NETDEV = 1 << 5, /* generate software time stamp when entering packet scheduling */ SKBTX_SCHED_TSTAMP = 1 << 6, }; #define SKBTX_ANY_SW_TSTAMP (SKBTX_SW_TSTAMP | \ SKBTX_SCHED_TSTAMP) #define SKBTX_ANY_TSTAMP (SKBTX_HW_TSTAMP | \ SKBTX_HW_TSTAMP_USE_CYCLES | \ SKBTX_ANY_SW_TSTAMP) /* Definitions for flags in struct skb_shared_info */ enum { /* use zcopy routines */ SKBFL_ZEROCOPY_ENABLE = BIT(0), /* This indicates at least one fragment might be overwritten * (as in vmsplice(), sendfile() ...) * If we need to compute a TX checksum, we'll need to copy * all frags to avoid possible bad checksum */ SKBFL_SHARED_FRAG = BIT(1), /* segment contains only zerocopy data and should not be * charged to the kernel memory. */ SKBFL_PURE_ZEROCOPY = BIT(2), SKBFL_DONT_ORPHAN = BIT(3), /* page references are managed by the ubuf_info, so it's safe to * use frags only up until ubuf_info is released */ SKBFL_MANAGED_FRAG_REFS = BIT(4), }; #define SKBFL_ZEROCOPY_FRAG (SKBFL_ZEROCOPY_ENABLE | SKBFL_SHARED_FRAG) #define SKBFL_ALL_ZEROCOPY (SKBFL_ZEROCOPY_FRAG | SKBFL_PURE_ZEROCOPY | \ SKBFL_DONT_ORPHAN | SKBFL_MANAGED_FRAG_REFS) /* * The callback notifies userspace to release buffers when skb DMA is done in * lower device, the skb last reference should be 0 when calling this. * The zerocopy_success argument is true if zero copy transmit occurred, * false on data copy or out of memory error caused by data copy attempt. * The ctx field is used to track device context. * The desc field is used to track userspace buffer index. */ struct ubuf_info { void (*callback)(struct sk_buff *, struct ubuf_info *, bool zerocopy_success); refcount_t refcnt; u8 flags; }; struct ubuf_info_msgzc { struct ubuf_info ubuf; union { struct { unsigned long desc; void *ctx; }; struct { u32 id; u16 len; u16 zerocopy:1; u32 bytelen; }; }; struct mmpin { struct user_struct *user; unsigned int num_pg; } mmp; }; #define skb_uarg(SKB) ((struct ubuf_info *)(skb_shinfo(SKB)->destructor_arg)) #define uarg_to_msgzc(ubuf_ptr) container_of((ubuf_ptr), struct ubuf_info_msgzc, \ ubuf) int mm_account_pinned_pages(struct mmpin *mmp, size_t size); void mm_unaccount_pinned_pages(struct mmpin *mmp); /* Preserve some data across TX submission and completion. * * Note, this state is stored in the driver. Extending the layout * might need some special care. */ struct xsk_tx_metadata_compl { __u64 *tx_timestamp; }; /* This data is invariant across clones and lives at * the end of the header data, ie. at skb->end. */ struct skb_shared_info { __u8 flags; __u8 meta_len; __u8 nr_frags; __u8 tx_flags; unsigned short gso_size; /* Warning: this field is not always filled in (UFO)! */ unsigned short gso_segs; struct sk_buff *frag_list; union { struct skb_shared_hwtstamps hwtstamps; struct xsk_tx_metadata_compl xsk_meta; }; unsigned int gso_type; u32 tskey; /* * Warning : all fields before dataref are cleared in __alloc_skb() */ atomic_t dataref; unsigned int xdp_frags_size; /* Intermediate layers must ensure that destructor_arg * remains valid until skb destructor */ void * destructor_arg; /* must be last field, see pskb_expand_head() */ skb_frag_t frags[MAX_SKB_FRAGS]; }; /** * DOC: dataref and headerless skbs * * Transport layers send out clones of payload skbs they hold for * retransmissions. To allow lower layers of the stack to prepend their headers * we split &skb_shared_info.dataref into two halves. * The lower 16 bits count the overall number of references. * The higher 16 bits indicate how many of the references are payload-only. * skb_header_cloned() checks if skb is allowed to add / write the headers. * * The creator of the skb (e.g. TCP) marks its skb as &sk_buff.nohdr * (via __skb_header_release()). Any clone created from marked skb will get * &sk_buff.hdr_len populated with the available headroom. * If there's the only clone in existence it's able to modify the headroom * at will. The sequence of calls inside the transport layer is:: * * <alloc skb> * skb_reserve() * __skb_header_release() * skb_clone() * // send the clone down the stack * * This is not a very generic construct and it depends on the transport layers * doing the right thing. In practice there's usually only one payload-only skb. * Having multiple payload-only skbs with different lengths of hdr_len is not * possible. The payload-only skbs should never leave their owner. */ #define SKB_DATAREF_SHIFT 16 #define SKB_DATAREF_MASK ((1 << SKB_DATAREF_SHIFT) - 1) enum { SKB_FCLONE_UNAVAILABLE, /* skb has no fclone (from head_cache) */ SKB_FCLONE_ORIG, /* orig skb (from fclone_cache) */ SKB_FCLONE_CLONE, /* companion fclone skb (from fclone_cache) */ }; enum { SKB_GSO_TCPV4 = 1 << 0, /* This indicates the skb is from an untrusted source. */ SKB_GSO_DODGY = 1 << 1, /* This indicates the tcp segment has CWR set. */ SKB_GSO_TCP_ECN = 1 << 2, SKB_GSO_TCP_FIXEDID = 1 << 3, SKB_GSO_TCPV6 = 1 << 4, SKB_GSO_FCOE = 1 << 5, SKB_GSO_GRE = 1 << 6, SKB_GSO_GRE_CSUM = 1 << 7, SKB_GSO_IPXIP4 = 1 << 8, SKB_GSO_IPXIP6 = 1 << 9, SKB_GSO_UDP_TUNNEL = 1 << 10, SKB_GSO_UDP_TUNNEL_CSUM = 1 << 11, SKB_GSO_PARTIAL = 1 << 12, SKB_GSO_TUNNEL_REMCSUM = 1 << 13, SKB_GSO_SCTP = 1 << 14, SKB_GSO_ESP = 1 << 15, SKB_GSO_UDP = 1 << 16, SKB_GSO_UDP_L4 = 1 << 17, SKB_GSO_FRAGLIST = 1 << 18, }; #if BITS_PER_LONG > 32 #define NET_SKBUFF_DATA_USES_OFFSET 1 #endif #ifdef NET_SKBUFF_DATA_USES_OFFSET typedef unsigned int sk_buff_data_t; #else typedef unsigned char *sk_buff_data_t; #endif /** * DOC: Basic sk_buff geometry * * struct sk_buff itself is a metadata structure and does not hold any packet * data. All the data is held in associated buffers. * * &sk_buff.head points to the main "head" buffer. The head buffer is divided * into two parts: * * - data buffer, containing headers and sometimes payload; * this is the part of the skb operated on by the common helpers * such as skb_put() or skb_pull(); * - shared info (struct skb_shared_info) which holds an array of pointers * to read-only data in the (page, offset, length) format. * * Optionally &skb_shared_info.frag_list may point to another skb. * * Basic diagram may look like this:: * * --------------- * | sk_buff | * --------------- * ,--------------------------- + head * / ,----------------- + data * / / ,----------- + tail * | | | , + end * | | | | * v v v v * ----------------------------------------------- * | headroom | data | tailroom | skb_shared_info | * ----------------------------------------------- * + [page frag] * + [page frag] * + [page frag] * + [page frag] --------- * + frag_list --> | sk_buff | * --------- * */ /** * struct sk_buff - socket buffer * @next: Next buffer in list * @prev: Previous buffer in list * @tstamp: Time we arrived/left * @skb_mstamp_ns: (aka @tstamp) earliest departure time; start point * for retransmit timer * @rbnode: RB tree node, alternative to next/prev for netem/tcp * @list: queue head * @ll_node: anchor in an llist (eg socket defer_list) * @sk: Socket we are owned by * @ip_defrag_offset: (aka @sk) alternate use of @sk, used in * fragmentation management * @dev: Device we arrived on/are leaving by * @dev_scratch: (aka @dev) alternate use of @dev when @dev would be %NULL * @cb: Control buffer. Free for use by every layer. Put private vars here * @_skb_refdst: destination entry (with norefcount bit) * @len: Length of actual data * @data_len: Data length * @mac_len: Length of link layer header * @hdr_len: writable header length of cloned skb * @csum: Checksum (must include start/offset pair) * @csum_start: Offset from skb->head where checksumming should start * @csum_offset: Offset from csum_start where checksum should be stored * @priority: Packet queueing priority * @ignore_df: allow local fragmentation * @cloned: Head may be cloned (check refcnt to be sure) * @ip_summed: Driver fed us an IP checksum * @nohdr: Payload reference only, must not modify header * @pkt_type: Packet class * @fclone: skbuff clone status * @ipvs_property: skbuff is owned by ipvs * @inner_protocol_type: whether the inner protocol is * ENCAP_TYPE_ETHER or ENCAP_TYPE_IPPROTO * @remcsum_offload: remote checksum offload is enabled * @offload_fwd_mark: Packet was L2-forwarded in hardware * @offload_l3_fwd_mark: Packet was L3-forwarded in hardware * @tc_skip_classify: do not classify packet. set by IFB device * @tc_at_ingress: used within tc_classify to distinguish in/egress * @redirected: packet was redirected by packet classifier * @from_ingress: packet was redirected from the ingress path * @nf_skip_egress: packet shall skip nf egress - see netfilter_netdev.h * @peeked: this packet has been seen already, so stats have been * done for it, don't do them again * @nf_trace: netfilter packet trace flag * @protocol: Packet protocol from driver * @destructor: Destruct function * @tcp_tsorted_anchor: list structure for TCP (tp->tsorted_sent_queue) * @_sk_redir: socket redirection information for skmsg * @_nfct: Associated connection, if any (with nfctinfo bits) * @skb_iif: ifindex of device we arrived on * @tc_index: Traffic control index * @hash: the packet hash * @queue_mapping: Queue mapping for multiqueue devices * @head_frag: skb was allocated from page fragments, * not allocated by kmalloc() or vmalloc(). * @pfmemalloc: skbuff was allocated from PFMEMALLOC reserves * @pp_recycle: mark the packet for recycling instead of freeing (implies * page_pool support on driver) * @active_extensions: active extensions (skb_ext_id types) * @ndisc_nodetype: router type (from link layer) * @ooo_okay: allow the mapping of a socket to a queue to be changed * @l4_hash: indicate hash is a canonical 4-tuple hash over transport * ports. * @sw_hash: indicates hash was computed in software stack * @wifi_acked_valid: wifi_acked was set * @wifi_acked: whether frame was acked on wifi or not * @no_fcs: Request NIC to treat last 4 bytes as Ethernet FCS * @encapsulation: indicates the inner headers in the skbuff are valid * @encap_hdr_csum: software checksum is needed * @csum_valid: checksum is already valid * @csum_not_inet: use CRC32c to resolve CHECKSUM_PARTIAL * @csum_complete_sw: checksum was completed by software * @csum_level: indicates the number of consecutive checksums found in * the packet minus one that have been verified as * CHECKSUM_UNNECESSARY (max 3) * @dst_pending_confirm: need to confirm neighbour * @decrypted: Decrypted SKB * @slow_gro: state present at GRO time, slower prepare step required * @mono_delivery_time: When set, skb->tstamp has the * delivery_time in mono clock base (i.e. EDT). Otherwise, the * skb->tstamp has the (rcv) timestamp at ingress and * delivery_time at egress. * @napi_id: id of the NAPI struct this skb came from * @sender_cpu: (aka @napi_id) source CPU in XPS * @alloc_cpu: CPU which did the skb allocation. * @secmark: security marking * @mark: Generic packet mark * @reserved_tailroom: (aka @mark) number of bytes of free space available * at the tail of an sk_buff * @vlan_all: vlan fields (proto & tci) * @vlan_proto: vlan encapsulation protocol * @vlan_tci: vlan tag control information * @inner_protocol: Protocol (encapsulation) * @inner_ipproto: (aka @inner_protocol) stores ipproto when * skb->inner_protocol_type == ENCAP_TYPE_IPPROTO; * @inner_transport_header: Inner transport layer header (encapsulation) * @inner_network_header: Network layer header (encapsulation) * @inner_mac_header: Link layer header (encapsulation) * @transport_header: Transport layer header * @network_header: Network layer header * @mac_header: Link layer header * @kcov_handle: KCOV remote handle for remote coverage collection * @tail: Tail pointer * @end: End pointer * @head: Head of buffer * @data: Data head pointer * @truesize: Buffer size * @users: User count - see {datagram,tcp}.c * @extensions: allocated extensions, valid if active_extensions is nonzero */ struct sk_buff { union { struct { /* These two members must be first to match sk_buff_head. */ struct sk_buff *next; struct sk_buff *prev; union { struct net_device *dev; /* Some protocols might use this space to store information, * while device pointer would be NULL. * UDP receive path is one user. */ unsigned long dev_scratch; }; }; struct rb_node rbnode; /* used in netem, ip4 defrag, and tcp stack */ struct list_head list; struct llist_node ll_node; }; union { struct sock *sk; int ip_defrag_offset; }; union { ktime_t tstamp; u64 skb_mstamp_ns; /* earliest departure time */ }; /* * This is the control buffer. It is free to use for every * layer. Please put your private variables there. If you * want to keep them across layers you have to do a skb_clone() * first. This is owned by whoever has the skb queued ATM. */ char cb[48] __aligned(8); union { struct { unsigned long _skb_refdst; void (*destructor)(struct sk_buff *skb); }; struct list_head tcp_tsorted_anchor; #ifdef CONFIG_NET_SOCK_MSG unsigned long _sk_redir; #endif }; #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) unsigned long _nfct; #endif unsigned int len, data_len; __u16 mac_len, hdr_len; /* Following fields are _not_ copied in __copy_skb_header() * Note that queue_mapping is here mostly to fill a hole. */ __u16 queue_mapping; /* if you move cloned around you also must adapt those constants */ #ifdef __BIG_ENDIAN_BITFIELD #define CLONED_MASK (1 << 7) #else #define CLONED_MASK 1 #endif #define CLONED_OFFSET offsetof(struct sk_buff, __cloned_offset) /* private: */ __u8 __cloned_offset[0]; /* public: */ __u8 cloned:1, nohdr:1, fclone:2, peeked:1, head_frag:1, pfmemalloc:1, pp_recycle:1; /* page_pool recycle indicator */ #ifdef CONFIG_SKB_EXTENSIONS __u8 active_extensions; #endif /* Fields enclosed in headers group are copied * using a single memcpy() in __copy_skb_header() */ struct_group(headers, /* private: */ __u8 __pkt_type_offset[0]; /* public: */ __u8 pkt_type:3; /* see PKT_TYPE_MAX */ __u8 ignore_df:1; __u8 dst_pending_confirm:1; __u8 ip_summed:2; __u8 ooo_okay:1; /* private: */ __u8 __mono_tc_offset[0]; /* public: */ __u8 mono_delivery_time:1; /* See SKB_MONO_DELIVERY_TIME_MASK */ #ifdef CONFIG_NET_XGRESS __u8 tc_at_ingress:1; /* See TC_AT_INGRESS_MASK */ __u8 tc_skip_classify:1; #endif __u8 remcsum_offload:1; __u8 csum_complete_sw:1; __u8 csum_level:2; __u8 inner_protocol_type:1; __u8 l4_hash:1; __u8 sw_hash:1; #ifdef CONFIG_WIRELESS __u8 wifi_acked_valid:1; __u8 wifi_acked:1; #endif __u8 no_fcs:1; /* Indicates the inner headers are valid in the skbuff. */ __u8 encapsulation:1; __u8 encap_hdr_csum:1; __u8 csum_valid:1; #ifdef CONFIG_IPV6_NDISC_NODETYPE __u8 ndisc_nodetype:2; #endif #if IS_ENABLED(CONFIG_IP_VS) __u8 ipvs_property:1; #endif #if IS_ENABLED(CONFIG_NETFILTER_XT_TARGET_TRACE) || IS_ENABLED(CONFIG_NF_TABLES) __u8 nf_trace:1; #endif #ifdef CONFIG_NET_SWITCHDEV __u8 offload_fwd_mark:1; __u8 offload_l3_fwd_mark:1; #endif __u8 redirected:1; #ifdef CONFIG_NET_REDIRECT __u8 from_ingress:1; #endif #ifdef CONFIG_NETFILTER_SKIP_EGRESS __u8 nf_skip_egress:1; #endif #ifdef CONFIG_TLS_DEVICE __u8 decrypted:1; #endif __u8 slow_gro:1; #if IS_ENABLED(CONFIG_IP_SCTP) __u8 csum_not_inet:1; #endif #if defined(CONFIG_NET_SCHED) || defined(CONFIG_NET_XGRESS) __u16 tc_index; /* traffic control index */ #endif u16 alloc_cpu; union { __wsum csum; struct { __u16 csum_start; __u16 csum_offset; }; }; __u32 priority; int skb_iif; __u32 hash; union { u32 vlan_all; struct { __be16 vlan_proto; __u16 vlan_tci; }; }; #if defined(CONFIG_NET_RX_BUSY_POLL) || defined(CONFIG_XPS) union { unsigned int napi_id; unsigned int sender_cpu; }; #endif #ifdef CONFIG_NETWORK_SECMARK __u32 secmark; #endif union { __u32 mark; __u32 reserved_tailroom; }; union { __be16 inner_protocol; __u8 inner_ipproto; }; __u16 inner_transport_header; __u16 inner_network_header; __u16 inner_mac_header; __be16 protocol; __u16 transport_header; __u16 network_header; __u16 mac_header; #ifdef CONFIG_KCOV u64 kcov_handle; #endif ); /* end headers group */ /* These elements must be at the end, see alloc_skb() for details. */ sk_buff_data_t tail; sk_buff_data_t end; unsigned char *head, *data; unsigned int truesize; refcount_t users; #ifdef CONFIG_SKB_EXTENSIONS /* only usable after checking ->active_extensions != 0 */ struct skb_ext *extensions; #endif }; /* if you move pkt_type around you also must adapt those constants */ #ifdef __BIG_ENDIAN_BITFIELD #define PKT_TYPE_MAX (7 << 5) #else #define PKT_TYPE_MAX 7 #endif #define PKT_TYPE_OFFSET offsetof(struct sk_buff, __pkt_type_offset) /* if you move tc_at_ingress or mono_delivery_time * around, you also must adapt these constants. */ #ifdef __BIG_ENDIAN_BITFIELD #define SKB_MONO_DELIVERY_TIME_MASK (1 << 7) #define TC_AT_INGRESS_MASK (1 << 6) #else #define SKB_MONO_DELIVERY_TIME_MASK (1 << 0) #define TC_AT_INGRESS_MASK (1 << 1) #endif #define SKB_BF_MONO_TC_OFFSET offsetof(struct sk_buff, __mono_tc_offset) #ifdef __KERNEL__ /* * Handling routines are only of interest to the kernel */ #define SKB_ALLOC_FCLONE 0x01 #define SKB_ALLOC_RX 0x02 #define SKB_ALLOC_NAPI 0x04 /** * skb_pfmemalloc - Test if the skb was allocated from PFMEMALLOC reserves * @skb: buffer */ static inline bool skb_pfmemalloc(const struct sk_buff *skb) { return unlikely(skb->pfmemalloc); } /* * skb might have a dst pointer attached, refcounted or not. * _skb_refdst low order bit is set if refcount was _not_ taken */ #define SKB_DST_NOREF 1UL #define SKB_DST_PTRMASK ~(SKB_DST_NOREF) /** * skb_dst - returns skb dst_entry * @skb: buffer * * Returns skb dst_entry, regardless of reference taken or not. */ static inline struct dst_entry *skb_dst(const struct sk_buff *skb) { /* If refdst was not refcounted, check we still are in a * rcu_read_lock section */ WARN_ON((skb->_skb_refdst & SKB_DST_NOREF) && !rcu_read_lock_held() && !rcu_read_lock_bh_held()); return (struct dst_entry *)(skb->_skb_refdst & SKB_DST_PTRMASK); } /** * skb_dst_set - sets skb dst * @skb: buffer * @dst: dst entry * * Sets skb dst, assuming a reference was taken on dst and should * be released by skb_dst_drop() */ static inline void skb_dst_set(struct sk_buff *skb, struct dst_entry *dst) { skb->slow_gro |= !!dst; skb->_skb_refdst = (unsigned long)dst; } /** * skb_dst_set_noref - sets skb dst, hopefully, without taking reference * @skb: buffer * @dst: dst entry * * Sets skb dst, assuming a reference was not taken on dst. * If dst entry is cached, we do not take reference and dst_release * will be avoided by refdst_drop. If dst entry is not cached, we take * reference, so that last dst_release can destroy the dst immediately. */ static inline void skb_dst_set_noref(struct sk_buff *skb, struct dst_entry *dst) { WARN_ON(!rcu_read_lock_held() && !rcu_read_lock_bh_held()); skb->slow_gro |= !!dst; skb->_skb_refdst = (unsigned long)dst | SKB_DST_NOREF; } /** * skb_dst_is_noref - Test if skb dst isn't refcounted * @skb: buffer */ static inline bool skb_dst_is_noref(const struct sk_buff *skb) { return (skb->_skb_refdst & SKB_DST_NOREF) && skb_dst(skb); } /** * skb_rtable - Returns the skb &rtable * @skb: buffer */ static inline struct rtable *skb_rtable(const struct sk_buff *skb) { return (struct rtable *)skb_dst(skb); } /* For mangling skb->pkt_type from user space side from applications * such as nft, tc, etc, we only allow a conservative subset of * possible pkt_types to be set. */ static inline bool skb_pkt_type_ok(u32 ptype) { return ptype <= PACKET_OTHERHOST; } /** * skb_napi_id - Returns the skb's NAPI id * @skb: buffer */ static inline unsigned int skb_napi_id(const struct sk_buff *skb) { #ifdef CONFIG_NET_RX_BUSY_POLL return skb->napi_id; #else return 0; #endif } static inline bool skb_wifi_acked_valid(const struct sk_buff *skb) { #ifdef CONFIG_WIRELESS return skb->wifi_acked_valid; #else return 0; #endif } /** * skb_unref - decrement the skb's reference count * @skb: buffer * * Returns true if we can free the skb. */ static inline bool skb_unref(struct sk_buff *skb) { if (unlikely(!skb)) return false; if (likely(refcount_read(&skb->users) == 1)) smp_rmb(); else if (likely(!refcount_dec_and_test(&skb->users))) return false; return true; } void __fix_address kfree_skb_reason(struct sk_buff *skb, enum skb_drop_reason reason); /** * kfree_skb - free an sk_buff with 'NOT_SPECIFIED' reason * @skb: buffer to free */ static inline void kfree_skb(struct sk_buff *skb) { kfree_skb_reason(skb, SKB_DROP_REASON_NOT_SPECIFIED); } void skb_release_head_state(struct sk_buff *skb); void kfree_skb_list_reason(struct sk_buff *segs, enum skb_drop_reason reason); void skb_dump(const char *level, const struct sk_buff *skb, bool full_pkt); void skb_tx_error(struct sk_buff *skb); static inline void kfree_skb_list(struct sk_buff *segs) { kfree_skb_list_reason(segs, SKB_DROP_REASON_NOT_SPECIFIED); } #ifdef CONFIG_TRACEPOINTS void consume_skb(struct sk_buff *skb); #else static inline void consume_skb(struct sk_buff *skb) { return kfree_skb(skb); } #endif void __consume_stateless_skb(struct sk_buff *skb); void __kfree_skb(struct sk_buff *skb); extern struct kmem_cache *skbuff_cache; void kfree_skb_partial(struct sk_buff *skb, bool head_stolen); bool skb_try_coalesce(struct sk_buff *to, struct sk_buff *from, bool *fragstolen, int *delta_truesize); struct sk_buff *__alloc_skb(unsigned int size, gfp_t priority, int flags, int node); struct sk_buff *__build_skb(void *data, unsigned int frag_size); struct sk_buff *build_skb(void *data, unsigned int frag_size); struct sk_buff *build_skb_around(struct sk_buff *skb, void *data, unsigned int frag_size); void skb_attempt_defer_free(struct sk_buff *skb); struct sk_buff *napi_build_skb(void *data, unsigned int frag_size); struct sk_buff *slab_build_skb(void *data); /** * alloc_skb - allocate a network buffer * @size: size to allocate * @priority: allocation mask * * This function is a convenient wrapper around __alloc_skb(). */ static inline struct sk_buff *alloc_skb(unsigned int size, gfp_t priority) { return __alloc_skb(size, priority, 0, NUMA_NO_NODE); } struct sk_buff *alloc_skb_with_frags(unsigned long header_len, unsigned long data_len, int max_page_order, int *errcode, gfp_t gfp_mask); struct sk_buff *alloc_skb_for_msg(struct sk_buff *first); /* Layout of fast clones : [skb1][skb2][fclone_ref] */ struct sk_buff_fclones { struct sk_buff skb1; struct sk_buff skb2; refcount_t fclone_ref; }; /** * skb_fclone_busy - check if fclone is busy * @sk: socket * @skb: buffer * * Returns true if skb is a fast clone, and its clone is not freed. * Some drivers call skb_orphan() in their ndo_start_xmit(), * so we also check that didn't happen. */ static inline bool skb_fclone_busy(const struct sock *sk, const struct sk_buff *skb) { const struct sk_buff_fclones *fclones; fclones = container_of(skb, struct sk_buff_fclones, skb1); return skb->fclone == SKB_FCLONE_ORIG && refcount_read(&fclones->fclone_ref) > 1 && READ_ONCE(fclones->skb2.sk) == sk; } /** * alloc_skb_fclone - allocate a network buffer from fclone cache * @size: size to allocate * @priority: allocation mask * * This function is a convenient wrapper around __alloc_skb(). */ static inline struct sk_buff *alloc_skb_fclone(unsigned int size, gfp_t priority) { return __alloc_skb(size, priority, SKB_ALLOC_FCLONE, NUMA_NO_NODE); } struct sk_buff *skb_morph(struct sk_buff *dst, struct sk_buff *src); void skb_headers_offset_update(struct sk_buff *skb, int off); int skb_copy_ubufs(struct sk_buff *skb, gfp_t gfp_mask); struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t priority); void skb_copy_header(struct sk_buff *new, const struct sk_buff *old); struct sk_buff *skb_copy(const struct sk_buff *skb, gfp_t priority); struct sk_buff *__pskb_copy_fclone(struct sk_buff *skb, int headroom, gfp_t gfp_mask, bool fclone); static inline struct sk_buff *__pskb_copy(struct sk_buff *skb, int headroom, gfp_t gfp_mask) { return __pskb_copy_fclone(skb, headroom, gfp_mask, false); } int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail, gfp_t gfp_mask); struct sk_buff *skb_realloc_headroom(struct sk_buff *skb, unsigned int headroom); struct sk_buff *skb_expand_head(struct sk_buff *skb, unsigned int headroom); struct sk_buff *skb_copy_expand(const struct sk_buff *skb, int newheadroom, int newtailroom, gfp_t priority); int __must_check skb_to_sgvec_nomark(struct sk_buff *skb, struct scatterlist *sg, int offset, int len); int __must_check skb_to_sgvec(struct sk_buff *skb, struct scatterlist *sg, int offset, int len); int skb_cow_data(struct sk_buff *skb, int tailbits, struct sk_buff **trailer); int __skb_pad(struct sk_buff *skb, int pad, bool free_on_error); /** * skb_pad - zero pad the tail of an skb * @skb: buffer to pad * @pad: space to pad * * Ensure that a buffer is followed by a padding area that is zero * filled. Used by network drivers which may DMA or transfer data * beyond the buffer end onto the wire. * * May return error in out of memory cases. The skb is freed on error. */ static inline int skb_pad(struct sk_buff *skb, int pad) { return __skb_pad(skb, pad, true); } #define dev_kfree_skb(a) consume_skb(a) int skb_append_pagefrags(struct sk_buff *skb, struct page *page, int offset, size_t size, size_t max_frags); struct skb_seq_state { __u32 lower_offset; __u32 upper_offset; __u32 frag_idx; __u32 stepped_offset; struct sk_buff *root_skb; struct sk_buff *cur_skb; __u8 *frag_data; __u32 frag_off; }; void skb_prepare_seq_read(struct sk_buff *skb, unsigned int from, unsigned int to, struct skb_seq_state *st); unsigned int skb_seq_read(unsigned int consumed, const u8 **data, struct skb_seq_state *st); void skb_abort_seq_read(struct skb_seq_state *st); unsigned int skb_find_text(struct sk_buff *skb, unsigned int from, unsigned int to, struct ts_config *config); /* * Packet hash types specify the type of hash in skb_set_hash. * * Hash types refer to the protocol layer addresses which are used to * construct a packet's hash. The hashes are used to differentiate or identify * flows of the protocol layer for the hash type. Hash types are either * layer-2 (L2), layer-3 (L3), or layer-4 (L4). * * Properties of hashes: * * 1) Two packets in different flows have different hash values * 2) Two packets in the same flow should have the same hash value * * A hash at a higher layer is considered to be more specific. A driver should * set the most specific hash possible. * * A driver cannot indicate a more specific hash than the layer at which a hash * was computed. For instance an L3 hash cannot be set as an L4 hash. * * A driver may indicate a hash level which is less specific than the * actual layer the hash was computed on. For instance, a hash computed * at L4 may be considered an L3 hash. This should only be done if the * driver can't unambiguously determine that the HW computed the hash at * the higher layer. Note that the "should" in the second property above * permits this. */ enum pkt_hash_types { PKT_HASH_TYPE_NONE, /* Undefined type */ PKT_HASH_TYPE_L2, /* Input: src_MAC, dest_MAC */ PKT_HASH_TYPE_L3, /* Input: src_IP, dst_IP */ PKT_HASH_TYPE_L4, /* Input: src_IP, dst_IP, src_port, dst_port */ }; static inline void skb_clear_hash(struct sk_buff *skb) { skb->hash = 0; skb->sw_hash = 0; skb->l4_hash = 0; } static inline void skb_clear_hash_if_not_l4(struct sk_buff *skb) { if (!skb->l4_hash) skb_clear_hash(skb); } static inline void __skb_set_hash(struct sk_buff *skb, __u32 hash, bool is_sw, bool is_l4) { skb->l4_hash = is_l4; skb->sw_hash = is_sw; skb->hash = hash; } static inline void skb_set_hash(struct sk_buff *skb, __u32 hash, enum pkt_hash_types type) { /* Used by drivers to set hash from HW */ __skb_set_hash(skb, hash, false, type == PKT_HASH_TYPE_L4); } static inline void __skb_set_sw_hash(struct sk_buff *skb, __u32 hash, bool is_l4) { __skb_set_hash(skb, hash, true, is_l4); } void __skb_get_hash(struct sk_buff *skb); u32 __skb_get_hash_symmetric(const struct sk_buff *skb); u32 skb_get_poff(const struct sk_buff *skb); u32 __skb_get_poff(const struct sk_buff *skb, const void *data, const struct flow_keys_basic *keys, int hlen); __be32 __skb_flow_get_ports(const struct sk_buff *skb, int thoff, u8 ip_proto, const void *data, int hlen_proto); static inline __be32 skb_flow_get_ports(const struct sk_buff *skb, int thoff, u8 ip_proto) { return __skb_flow_get_ports(skb, thoff, ip_proto, NULL, 0); } void skb_flow_dissector_init(struct flow_dissector *flow_dissector, const struct flow_dissector_key *key, unsigned int key_count); struct bpf_flow_dissector; u32 bpf_flow_dissect(struct bpf_prog *prog, struct bpf_flow_dissector *ctx, __be16 proto, int nhoff, int hlen, unsigned int flags); bool __skb_flow_dissect(const struct net *net, const struct sk_buff *skb, struct flow_dissector *flow_dissector, void *target_container, const void *data, __be16 proto, int nhoff, int hlen, unsigned int flags); static inline bool skb_flow_dissect(const struct sk_buff *skb, struct flow_dissector *flow_dissector, void *target_container, unsigned int flags) { return __skb_flow_dissect(NULL, skb, flow_dissector, target_container, NULL, 0, 0, 0, flags); } static inline bool skb_flow_dissect_flow_keys(const struct sk_buff *skb, struct flow_keys *flow, unsigned int flags) { memset(flow, 0, sizeof(*flow)); return __skb_flow_dissect(NULL, skb, &flow_keys_dissector, flow, NULL, 0, 0, 0, flags); } static inline bool skb_flow_dissect_flow_keys_basic(const struct net *net, const struct sk_buff *skb, struct flow_keys_basic *flow, const void *data, __be16 proto, int nhoff, int hlen, unsigned int flags) { memset(flow, 0, sizeof(*flow)); return __skb_flow_dissect(net, skb, &flow_keys_basic_dissector, flow, data, proto, nhoff, hlen, flags); } void skb_flow_dissect_meta(const struct sk_buff *skb, struct flow_dissector *flow_dissector, void *target_container); /* Gets a skb connection tracking info, ctinfo map should be a * map of mapsize to translate enum ip_conntrack_info states * to user states. */ void skb_flow_dissect_ct(const struct sk_buff *skb, struct flow_dissector *flow_dissector, void *target_container, u16 *ctinfo_map, size_t mapsize, bool post_ct, u16 zone); void skb_flow_dissect_tunnel_info(const struct sk_buff *skb, struct flow_dissector *flow_dissector, void *target_container); void skb_flow_dissect_hash(const struct sk_buff *skb, struct flow_dissector *flow_dissector, void *target_container); static inline __u32 skb_get_hash(struct sk_buff *skb) { if (!skb->l4_hash && !skb->sw_hash) __skb_get_hash(skb); return skb->hash; } static inline __u32 skb_get_hash_flowi6(struct sk_buff *skb, const struct flowi6 *fl6) { if (!skb->l4_hash && !skb->sw_hash) { struct flow_keys keys; __u32 hash = __get_hash_from_flowi6(fl6, &keys); __skb_set_sw_hash(skb, hash, flow_keys_have_l4(&keys)); } return skb->hash; } __u32 skb_get_hash_perturb(const struct sk_buff *skb, const siphash_key_t *perturb); static inline __u32 skb_get_hash_raw(const struct sk_buff *skb) { return skb->hash; } static inline void skb_copy_hash(struct sk_buff *to, const struct sk_buff *from) { to->hash = from->hash; to->sw_hash = from->sw_hash; to->l4_hash = from->l4_hash; }; static inline int skb_cmp_decrypted(const struct sk_buff *skb1, const struct sk_buff *skb2) { #ifdef CONFIG_TLS_DEVICE return skb2->decrypted - skb1->decrypted; #else return 0; #endif } static inline void skb_copy_decrypted(struct sk_buff *to, const struct sk_buff *from) { #ifdef CONFIG_TLS_DEVICE to->decrypted = from->decrypted; #endif } #ifdef NET_SKBUFF_DATA_USES_OFFSET static inline unsigned char *skb_end_pointer(const struct sk_buff *skb) { return skb->head + skb->end; } static inline unsigned int skb_end_offset(const struct sk_buff *skb) { return skb->end; } static inline void skb_set_end_offset(struct sk_buff *skb, unsigned int offset) { skb->end = offset; } #else static inline unsigned char *skb_end_pointer(const struct sk_buff *skb) { return skb->end; } static inline unsigned int skb_end_offset(const struct sk_buff *skb) { return skb->end - skb->head; } static inline void skb_set_end_offset(struct sk_buff *skb, unsigned int offset) { skb->end = skb->head + offset; } #endif struct ubuf_info *msg_zerocopy_realloc(struct sock *sk, size_t size, struct ubuf_info *uarg); void msg_zerocopy_put_abort(struct ubuf_info *uarg, bool have_uref); void msg_zerocopy_callback(struct sk_buff *skb, struct ubuf_info *uarg, bool success); int __zerocopy_sg_from_iter(struct msghdr *msg, struct sock *sk, struct sk_buff *skb, struct iov_iter *from, size_t length); static inline int skb_zerocopy_iter_dgram(struct sk_buff *skb, struct msghdr *msg, int len) { return __zerocopy_sg_from_iter(msg, skb->sk, skb, &msg->msg_iter, len); } int skb_zerocopy_iter_stream(struct sock *sk, struct sk_buff *skb, struct msghdr *msg, int len, struct ubuf_info *uarg); /* Internal */ #define skb_shinfo(SKB) ((struct skb_shared_info *)(skb_end_pointer(SKB))) static inline struct skb_shared_hwtstamps *skb_hwtstamps(struct sk_buff *skb) { return &skb_shinfo(skb)->hwtstamps; } static inline struct ubuf_info *skb_zcopy(struct sk_buff *skb) { bool is_zcopy = skb && skb_shinfo(skb)->flags & SKBFL_ZEROCOPY_ENABLE; return is_zcopy ? skb_uarg(skb) : NULL; } static inline bool skb_zcopy_pure(const struct sk_buff *skb) { return skb_shinfo(skb)->flags & SKBFL_PURE_ZEROCOPY; } static inline bool skb_zcopy_managed(const struct sk_buff *skb) { return skb_shinfo(skb)->flags & SKBFL_MANAGED_FRAG_REFS; } static inline bool skb_pure_zcopy_same(const struct sk_buff *skb1, const struct sk_buff *skb2) { return skb_zcopy_pure(skb1) == skb_zcopy_pure(skb2); } static inline void net_zcopy_get(struct ubuf_info *uarg) { refcount_inc(&uarg->refcnt); } static inline void skb_zcopy_init(struct sk_buff *skb, struct ubuf_info *uarg) { skb_shinfo(skb)->destructor_arg = uarg; skb_shinfo(skb)->flags |= uarg->flags; } static inline void skb_zcopy_set(struct sk_buff *skb, struct ubuf_info *uarg, bool *have_ref) { if (skb && uarg && !skb_zcopy(skb)) { if (unlikely(have_ref && *have_ref)) *have_ref = false; else net_zcopy_get(uarg); skb_zcopy_init(skb, uarg); } } static inline void skb_zcopy_set_nouarg(struct sk_buff *skb, void *val) { skb_shinfo(skb)->destructor_arg = (void *)((uintptr_t) val | 0x1UL); skb_shinfo(skb)->flags |= SKBFL_ZEROCOPY_FRAG; } static inline bool skb_zcopy_is_nouarg(struct sk_buff *skb) { return (uintptr_t) skb_shinfo(skb)->destructor_arg & 0x1UL; } static inline void *skb_zcopy_get_nouarg(struct sk_buff *skb) { return (void *)((uintptr_t) skb_shinfo(skb)->destructor_arg & ~0x1UL); } static inline void net_zcopy_put(struct ubuf_info *uarg) { if (uarg) uarg->callback(NULL, uarg, true); } static inline void net_zcopy_put_abort(struct ubuf_info *uarg, bool have_uref) { if (uarg) { if (uarg->callback == msg_zerocopy_callback) msg_zerocopy_put_abort(uarg, have_uref); else if (have_uref) net_zcopy_put(uarg); } } /* Release a reference on a zerocopy structure */ static inline void skb_zcopy_clear(struct sk_buff *skb, bool zerocopy_success) { struct ubuf_info *uarg = skb_zcopy(skb); if (uarg) { if (!skb_zcopy_is_nouarg(skb)) uarg->callback(skb, uarg, zerocopy_success); skb_shinfo(skb)->flags &= ~SKBFL_ALL_ZEROCOPY; } } void __skb_zcopy_downgrade_managed(struct sk_buff *skb); static inline void skb_zcopy_downgrade_managed(struct sk_buff *skb) { if (unlikely(skb_zcopy_managed(skb))) __skb_zcopy_downgrade_managed(skb); } static inline void skb_mark_not_on_list(struct sk_buff *skb) { skb->next = NULL; } static inline void skb_poison_list(struct sk_buff *skb) { #ifdef CONFIG_DEBUG_NET skb->next = SKB_LIST_POISON_NEXT; #endif } /* Iterate through singly-linked GSO fragments of an skb. */ #define skb_list_walk_safe(first, skb, next_skb) \ for ((skb) = (first), (next_skb) = (skb) ? (skb)->next : NULL; (skb); \ (skb) = (next_skb), (next_skb) = (skb) ? (skb)->next : NULL) static inline void skb_list_del_init(struct sk_buff *skb) { __list_del_entry(&skb->list); skb_mark_not_on_list(skb); } /** * skb_queue_empty - check if a queue is empty * @list: queue head * * Returns true if the queue is empty, false otherwise. */ static inline int skb_queue_empty(const struct sk_buff_head *list) { return list->next == (const struct sk_buff *) list; } /** * skb_queue_empty_lockless - check if a queue is empty * @list: queue head * * Returns true if the queue is empty, false otherwise. * This variant can be used in lockless contexts. */ static inline bool skb_queue_empty_lockless(const struct sk_buff_head *list) { return READ_ONCE(list->next) == (const struct sk_buff *) list; } /** * skb_queue_is_last - check if skb is the last entry in the queue * @list: queue head * @skb: buffer * * Returns true if @skb is the last buffer on the list. */ static inline bool skb_queue_is_last(const struct sk_buff_head *list, const struct sk_buff *skb) { return skb->next == (const struct sk_buff *) list; } /** * skb_queue_is_first - check if skb is the first entry in the queue * @list: queue head * @skb: buffer * * Returns true if @skb is the first buffer on the list. */ static inline bool skb_queue_is_first(const struct sk_buff_head *list, const struct sk_buff *skb) { return skb->prev == (const struct sk_buff *) list; } /** * skb_queue_next - return the next packet in the queue * @list: queue head * @skb: current buffer * * Return the next packet in @list after @skb. It is only valid to * call this if skb_queue_is_last() evaluates to false. */ static inline struct sk_buff *skb_queue_next(const struct sk_buff_head *list, const struct sk_buff *skb) { /* This BUG_ON may seem severe, but if we just return then we * are going to dereference garbage. */ BUG_ON(skb_queue_is_last(list, skb)); return skb->next; } /** * skb_queue_prev - return the prev packet in the queue * @list: queue head * @skb: current buffer * * Return the prev packet in @list before @skb. It is only valid to * call this if skb_queue_is_first() evaluates to false. */ static inline struct sk_buff *skb_queue_prev(const struct sk_buff_head *list, const struct sk_buff *skb) { /* This BUG_ON may seem severe, but if we just return then we * are going to dereference garbage. */ BUG_ON(skb_queue_is_first(list, skb)); return skb->prev; } /** * skb_get - reference buffer * @skb: buffer to reference * * Makes another reference to a socket buffer and returns a pointer * to the buffer. */ static inline struct sk_buff *skb_get(struct sk_buff *skb) { refcount_inc(&skb->users); return skb; } /* * If users == 1, we are the only owner and can avoid redundant atomic changes. */ /** * skb_cloned - is the buffer a clone * @skb: buffer to check * * Returns true if the buffer was generated with skb_clone() and is * one of multiple shared copies of the buffer. Cloned buffers are * shared data so must not be written to under normal circumstances. */ static inline int skb_cloned(const struct sk_buff *skb) { return skb->cloned && (atomic_read(&skb_shinfo(skb)->dataref) & SKB_DATAREF_MASK) != 1; } static inline int skb_unclone(struct sk_buff *skb, gfp_t pri) { might_sleep_if(gfpflags_allow_blocking(pri)); if (skb_cloned(skb)) return pskb_expand_head(skb, 0, 0, pri); return 0; } /* This variant of skb_unclone() makes sure skb->truesize * and skb_end_offset() are not changed, whenever a new skb->head is needed. * * Indeed there is no guarantee that ksize(kmalloc(X)) == ksize(kmalloc(X)) * when various debugging features are in place. */ int __skb_unclone_keeptruesize(struct sk_buff *skb, gfp_t pri); static inline int skb_unclone_keeptruesize(struct sk_buff *skb, gfp_t pri) { might_sleep_if(gfpflags_allow_blocking(pri)); if (skb_cloned(skb)) return __skb_unclone_keeptruesize(skb, pri); return 0; } /** * skb_header_cloned - is the header a clone * @skb: buffer to check * * Returns true if modifying the header part of the buffer requires * the data to be copied. */ static inline int skb_header_cloned(const struct sk_buff *skb) { int dataref; if (!skb->cloned) return 0; dataref = atomic_read(&skb_shinfo(skb)->dataref); dataref = (dataref & SKB_DATAREF_MASK) - (dataref >> SKB_DATAREF_SHIFT); return dataref != 1; } static inline int skb_header_unclone(struct sk_buff *skb, gfp_t pri) { might_sleep_if(gfpflags_allow_blocking(pri)); if (skb_header_cloned(skb)) return pskb_expand_head(skb, 0, 0, pri); return 0; } /** * __skb_header_release() - allow clones to use the headroom * @skb: buffer to operate on * * See "DOC: dataref and headerless skbs". */ static inline void __skb_header_release(struct sk_buff *skb) { skb->nohdr = 1; atomic_set(&skb_shinfo(skb)->dataref, 1 + (1 << SKB_DATAREF_SHIFT)); } /** * skb_shared - is the buffer shared * @skb: buffer to check * * Returns true if more than one person has a reference to this * buffer. */ static inline int skb_shared(const struct sk_buff *skb) { return refcount_read(&skb->users) != 1; } /** * skb_share_check - check if buffer is shared and if so clone it * @skb: buffer to check * @pri: priority for memory allocation * * If the buffer is shared the buffer is cloned and the old copy * drops a reference. A new clone with a single reference is returned. * If the buffer is not shared the original buffer is returned. When * being called from interrupt status or with spinlocks held pri must * be GFP_ATOMIC. * * NULL is returned on a memory allocation failure. */ static inline struct sk_buff *skb_share_check(struct sk_buff *skb, gfp_t pri) { might_sleep_if(gfpflags_allow_blocking(pri)); if (skb_shared(skb)) { struct sk_buff *nskb = skb_clone(skb, pri); if (likely(nskb)) consume_skb(skb); else kfree_skb(skb); skb = nskb; } return skb; } /* * Copy shared buffers into a new sk_buff. We effectively do COW on * packets to handle cases where we have a local reader and forward * and a couple of other messy ones. The normal one is tcpdumping * a packet that's being forwarded. */ /** * skb_unshare - make a copy of a shared buffer * @skb: buffer to check * @pri: priority for memory allocation * * If the socket buffer is a clone then this function creates a new * copy of the data, drops a reference count on the old copy and returns * the new copy with the reference count at 1. If the buffer is not a clone * the original buffer is returned. When called with a spinlock held or * from interrupt state @pri must be %GFP_ATOMIC * * %NULL is returned on a memory allocation failure. */ static inline struct sk_buff *skb_unshare(struct sk_buff *skb, gfp_t pri) { might_sleep_if(gfpflags_allow_blocking(pri)); if (skb_cloned(skb)) { struct sk_buff *nskb = skb_copy(skb, pri); /* Free our shared copy */ if (likely(nskb)) consume_skb(skb); else kfree_skb(skb); skb = nskb; } return skb; } /** * skb_peek - peek at the head of an &sk_buff_head * @list_: list to peek at * * Peek an &sk_buff. Unlike most other operations you _MUST_ * be careful with this one. A peek leaves the buffer on the * list and someone else may run off with it. You must hold * the appropriate locks or have a private queue to do this. * * Returns %NULL for an empty list or a pointer to the head element. * The reference count is not incremented and the reference is therefore * volatile. Use with caution. */ static inline struct sk_buff *skb_peek(const struct sk_buff_head *list_) { struct sk_buff *skb = list_->next; if (skb == (struct sk_buff *)list_) skb = NULL; return skb; } /** * __skb_peek - peek at the head of a non-empty &sk_buff_head * @list_: list to peek at * * Like skb_peek(), but the caller knows that the list is not empty. */ static inline struct sk_buff *__skb_peek(const struct sk_buff_head *list_) { return list_->next; } /** * skb_peek_next - peek skb following the given one from a queue * @skb: skb to start from * @list_: list to peek at * * Returns %NULL when the end of the list is met or a pointer to the * next element. The reference count is not incremented and the * reference is therefore volatile. Use with caution. */ static inline struct sk_buff *skb_peek_next(struct sk_buff *skb, const struct sk_buff_head *list_) { struct sk_buff *next = skb->next; if (next == (struct sk_buff *)list_) next = NULL; return next; } /** * skb_peek_tail - peek at the tail of an &sk_buff_head * @list_: list to peek at * * Peek an &sk_buff. Unlike most other operations you _MUST_ * be careful with this one. A peek leaves the buffer on the * list and someone else may run off with it. You must hold * the appropriate locks or have a private queue to do this. * * Returns %NULL for an empty list or a pointer to the tail element. * The reference count is not incremented and the reference is therefore * volatile. Use with caution. */ static inline struct sk_buff *skb_peek_tail(const struct sk_buff_head *list_) { struct sk_buff *skb = READ_ONCE(list_->prev); if (skb == (struct sk_buff *)list_) skb = NULL; return skb; } /** * skb_queue_len - get queue length * @list_: list to measure * * Return the length of an &sk_buff queue. */ static inline __u32 skb_queue_len(const struct sk_buff_head *list_) { return list_->qlen; } /** * skb_queue_len_lockless - get queue length * @list_: list to measure * * Return the length of an &sk_buff queue. * This variant can be used in lockless contexts. */ static inline __u32 skb_queue_len_lockless(const struct sk_buff_head *list_) { return READ_ONCE(list_->qlen); } /** * __skb_queue_head_init - initialize non-spinlock portions of sk_buff_head * @list: queue to initialize * * This initializes only the list and queue length aspects of * an sk_buff_head object. This allows to initialize the list * aspects of an sk_buff_head without reinitializing things like * the spinlock. It can also be used for on-stack sk_buff_head * objects where the spinlock is known to not be used. */ static inline void __skb_queue_head_init(struct sk_buff_head *list) { list->prev = list->next = (struct sk_buff *)list; list->qlen = 0; } /* * This function creates a split out lock class for each invocation; * this is needed for now since a whole lot of users of the skb-queue * infrastructure in drivers have different locking usage (in hardirq) * than the networking core (in softirq only). In the long run either the * network layer or drivers should need annotation to consolidate the * main types of usage into 3 classes. */ static inline void skb_queue_head_init(struct sk_buff_head *list) { spin_lock_init(&list->lock); __skb_queue_head_init(list); } static inline void skb_queue_head_init_class(struct sk_buff_head *list, struct lock_class_key *class) { skb_queue_head_init(list); lockdep_set_class(&list->lock, class); } /* * Insert an sk_buff on a list. * * The "__skb_xxxx()" functions are the non-atomic ones that * can only be called with interrupts disabled. */ static inline void __skb_insert(struct sk_buff *newsk, struct sk_buff *prev, struct sk_buff *next, struct sk_buff_head *list) { /* See skb_queue_empty_lockless() and skb_peek_tail() * for the opposite READ_ONCE() */ WRITE_ONCE(newsk->next, next); WRITE_ONCE(newsk->prev, prev); WRITE_ONCE(((struct sk_buff_list *)next)->prev, newsk); WRITE_ONCE(((struct sk_buff_list *)prev)->next, newsk); WRITE_ONCE(list->qlen, list->qlen + 1); } static inline void __skb_queue_splice(const struct sk_buff_head *list, struct sk_buff *prev, struct sk_buff *next) { struct sk_buff *first = list->next; struct sk_buff *last = list->prev; WRITE_ONCE(first->prev, prev); WRITE_ONCE(prev->next, first); WRITE_ONCE(last->next, next); WRITE_ONCE(next->prev, last); } /** * skb_queue_splice - join two skb lists, this is designed for stacks * @list: the new list to add * @head: the place to add it in the first list */ static inline void skb_queue_splice(const struct sk_buff_head *list, struct sk_buff_head *head) { if (!skb_queue_empty(list)) { __skb_queue_splice(list, (struct sk_buff *) head, head->next); head->qlen += list->qlen; } } /** * skb_queue_splice_init - join two skb lists and reinitialise the emptied list * @list: the new list to add * @head: the place to add it in the first list * * The list at @list is reinitialised */ static inline void skb_queue_splice_init(struct sk_buff_head *list, struct sk_buff_head *head) { if (!skb_queue_empty(list)) { __skb_queue_splice(list, (struct sk_buff *) head, head->next); head->qlen += list->qlen; __skb_queue_head_init(list); } } /** * skb_queue_splice_tail - join two skb lists, each list being a queue * @list: the new list to add * @head: the place to add it in the first list */ static inline void skb_queue_splice_tail(const struct sk_buff_head *list, struct sk_buff_head *head) { if (!skb_queue_empty(list)) { __skb_queue_splice(list, head->prev, (struct sk_buff *) head); head->qlen += list->qlen; } } /** * skb_queue_splice_tail_init - join two skb lists and reinitialise the emptied list * @list: the new list to add * @head: the place to add it in the first list * * Each of the lists is a queue. * The list at @list is reinitialised */ static inline void skb_queue_splice_tail_init(struct sk_buff_head *list, struct sk_buff_head *head) { if (!skb_queue_empty(list)) { __skb_queue_splice(list, head->prev, (struct sk_buff *) head); head->qlen += list->qlen; __skb_queue_head_init(list); } } /** * __skb_queue_after - queue a buffer at the list head * @list: list to use * @prev: place after this buffer * @newsk: buffer to queue * * Queue a buffer int the middle of a list. This function takes no locks * and you must therefore hold required locks before calling it. * * A buffer cannot be placed on two lists at the same time. */ static inline void __skb_queue_after(struct sk_buff_head *list, struct sk_buff *prev, struct sk_buff *newsk) { __skb_insert(newsk, prev, ((struct sk_buff_list *)prev)->next, list); } void skb_append(struct sk_buff *old, struct sk_buff *newsk, struct sk_buff_head *list); static inline void __skb_queue_before(struct sk_buff_head *list, struct sk_buff *next, struct sk_buff *newsk) { __skb_insert(newsk, ((struct sk_buff_list *)next)->prev, next, list); } /** * __skb_queue_head - queue a buffer at the list head * @list: list to use * @newsk: buffer to queue * * Queue a buffer at the start of a list. This function takes no locks * and you must therefore hold required locks before calling it. * * A buffer cannot be placed on two lists at the same time. */ static inline void __skb_queue_head(struct sk_buff_head *list, struct sk_buff *newsk) { __skb_queue_after(list, (struct sk_buff *)list, newsk); } void skb_queue_head(struct sk_buff_head *list, struct sk_buff *newsk); /** * __skb_queue_tail - queue a buffer at the list tail * @list: list to use * @newsk: buffer to queue * * Queue a buffer at the end of a list. This function takes no locks * and you must therefore hold required locks before calling it. * * A buffer cannot be placed on two lists at the same time. */ static inline void __skb_queue_tail(struct sk_buff_head *list, struct sk_buff *newsk) { __skb_queue_before(list, (struct sk_buff *)list, newsk); } void skb_queue_tail(struct sk_buff_head *list, struct sk_buff *newsk); /* * remove sk_buff from list. _Must_ be called atomically, and with * the list known.. */ void skb_unlink(struct sk_buff *skb, struct sk_buff_head *list); static inline void __skb_unlink(struct sk_buff *skb, struct sk_buff_head *list) { struct sk_buff *next, *prev; WRITE_ONCE(list->qlen, list->qlen - 1); next = skb->next; prev = skb->prev; skb->next = skb->prev = NULL; WRITE_ONCE(next->prev, prev); WRITE_ONCE(prev->next, next); } /** * __skb_dequeue - remove from the head of the queue * @list: list to dequeue from * * Remove the head of the list. This function does not take any locks * so must be used with appropriate locks held only. The head item is * returned or %NULL if the list is empty. */ static inline struct sk_buff *__skb_dequeue(struct sk_buff_head *list) { struct sk_buff *skb = skb_peek(list); if (skb) __skb_unlink(skb, list); return skb; } struct sk_buff *skb_dequeue(struct sk_buff_head *list); /** * __skb_dequeue_tail - remove from the tail of the queue * @list: list to dequeue from * * Remove the tail of the list. This function does not take any locks * so must be used with appropriate locks held only. The tail item is * returned or %NULL if the list is empty. */ static inline struct sk_buff *__skb_dequeue_tail(struct sk_buff_head *list) { struct sk_buff *skb = skb_peek_tail(list); if (skb) __skb_unlink(skb, list); return skb; } struct sk_buff *skb_dequeue_tail(struct sk_buff_head *list); static inline bool skb_is_nonlinear(const struct sk_buff *skb) { return skb->data_len; } static inline unsigned int skb_headlen(const struct sk_buff *skb) { return skb->len - skb->data_len; } static inline unsigned int __skb_pagelen(const struct sk_buff *skb) { unsigned int i, len = 0; for (i = skb_shinfo(skb)->nr_frags - 1; (int)i >= 0; i--) len += skb_frag_size(&skb_shinfo(skb)->frags[i]); return len; } static inline unsigned int skb_pagelen(const struct sk_buff *skb) { return skb_headlen(skb) + __skb_pagelen(skb); } static inline void skb_frag_fill_page_desc(skb_frag_t *frag, struct page *page, int off, int size) { frag->bv_page = page; frag->bv_offset = off; skb_frag_size_set(frag, size); } static inline void __skb_fill_page_desc_noacc(struct skb_shared_info *shinfo, int i, struct page *page, int off, int size) { skb_frag_t *frag = &shinfo->frags[i]; skb_frag_fill_page_desc(frag, page, off, size); } /** * skb_len_add - adds a number to len fields of skb * @skb: buffer to add len to * @delta: number of bytes to add */ static inline void skb_len_add(struct sk_buff *skb, int delta) { skb->len += delta; skb->data_len += delta; skb->truesize += delta; } /** * __skb_fill_page_desc - initialise a paged fragment in an skb * @skb: buffer containing fragment to be initialised * @i: paged fragment index to initialise * @page: the page to use for this fragment * @off: the offset to the data with @page * @size: the length of the data * * Initialises the @i'th fragment of @skb to point to &size bytes at * offset @off within @page. * * Does not take any additional reference on the fragment. */ static inline void __skb_fill_page_desc(struct sk_buff *skb, int i, struct page *page, int off, int size) { __skb_fill_page_desc_noacc(skb_shinfo(skb), i, page, off, size); /* Propagate page pfmemalloc to the skb if we can. The problem is * that not all callers have unique ownership of the page but rely * on page_is_pfmemalloc doing the right thing(tm). */ page = compound_head(page); if (page_is_pfmemalloc(page)) skb->pfmemalloc = true; } /** * skb_fill_page_desc - initialise a paged fragment in an skb * @skb: buffer containing fragment to be initialised * @i: paged fragment index to initialise * @page: the page to use for this fragment * @off: the offset to the data with @page * @size: the length of the data * * As per __skb_fill_page_desc() -- initialises the @i'th fragment of * @skb to point to @size bytes at offset @off within @page. In * addition updates @skb such that @i is the last fragment. * * Does not take any additional reference on the fragment. */ static inline void skb_fill_page_desc(struct sk_buff *skb, int i, struct page *page, int off, int size) { __skb_fill_page_desc(skb, i, page, off, size); skb_shinfo(skb)->nr_frags = i + 1; } /** * skb_fill_page_desc_noacc - initialise a paged fragment in an skb * @skb: buffer containing fragment to be initialised * @i: paged fragment index to initialise * @page: the page to use for this fragment * @off: the offset to the data with @page * @size: the length of the data * * Variant of skb_fill_page_desc() which does not deal with * pfmemalloc, if page is not owned by us. */ static inline void skb_fill_page_desc_noacc(struct sk_buff *skb, int i, struct page *page, int off, int size) { struct skb_shared_info *shinfo = skb_shinfo(skb); __skb_fill_page_desc_noacc(shinfo, i, page, off, size); shinfo->nr_frags = i + 1; } void skb_add_rx_frag(struct sk_buff *skb, int i, struct page *page, int off, int size, unsigned int truesize); void skb_coalesce_rx_frag(struct sk_buff *skb, int i, int size, unsigned int truesize); #define SKB_LINEAR_ASSERT(skb) BUG_ON(skb_is_nonlinear(skb)) #ifdef NET_SKBUFF_DATA_USES_OFFSET static inline unsigned char *skb_tail_pointer(const struct sk_buff *skb) { return skb->head + skb->tail; } static inline void skb_reset_tail_pointer(struct sk_buff *skb) { skb->tail = skb->data - skb->head; } static inline void skb_set_tail_pointer(struct sk_buff *skb, const int offset) { skb_reset_tail_pointer(skb); skb->tail += offset; } #else /* NET_SKBUFF_DATA_USES_OFFSET */ static inline unsigned char *skb_tail_pointer(const struct sk_buff *skb) { return skb->tail; } static inline void skb_reset_tail_pointer(struct sk_buff *skb) { skb->tail = skb->data; } static inline void skb_set_tail_pointer(struct sk_buff *skb, const int offset) { skb->tail = skb->data + offset; } #endif /* NET_SKBUFF_DATA_USES_OFFSET */ static inline void skb_assert_len(struct sk_buff *skb) { #ifdef CONFIG_DEBUG_NET if (WARN_ONCE(!skb->len, "%s\n", __func__)) DO_ONCE_LITE(skb_dump, KERN_ERR, skb, false); #endif /* CONFIG_DEBUG_NET */ } /* * Add data to an sk_buff */ void *pskb_put(struct sk_buff *skb, struct sk_buff *tail, int len); void *skb_put(struct sk_buff *skb, unsigned int len); static inline void *__skb_put(struct sk_buff *skb, unsigned int len) { void *tmp = skb_tail_pointer(skb); SKB_LINEAR_ASSERT(skb); skb->tail += len; skb->len += len; return tmp; } static inline void *__skb_put_zero(struct sk_buff *skb, unsigned int len) { void *tmp = __skb_put(skb, len); memset(tmp, 0, len); return tmp; } static inline void *__skb_put_data(struct sk_buff *skb, const void *data, unsigned int len) { void *tmp = __skb_put(skb, len); memcpy(tmp, data, len); return tmp; } static inline void __skb_put_u8(struct sk_buff *skb, u8 val) { *(u8 *)__skb_put(skb, 1) = val; } static inline void *skb_put_zero(struct sk_buff *skb, unsigned int len) { void *tmp = skb_put(skb, len); memset(tmp, 0, len); return tmp; } static inline void *skb_put_data(struct sk_buff *skb, const void *data, unsigned int len) { void *tmp = skb_put(skb, len); memcpy(tmp, data, len); return tmp; } static inline void skb_put_u8(struct sk_buff *skb, u8 val) { *(u8 *)skb_put(skb, 1) = val; } void *skb_push(struct sk_buff *skb, unsigned int len); static inline void *__skb_push(struct sk_buff *skb, unsigned int len) { skb->data -= len; skb->len += len; return skb->data; } void *skb_pull(struct sk_buff *skb, unsigned int len); static inline void *__skb_pull(struct sk_buff *skb, unsigned int len) { skb->len -= len; if (unlikely(skb->len < skb->data_len)) { #if defined(CONFIG_DEBUG_NET) skb->len += len; pr_err("__skb_pull(len=%u)\n", len); skb_dump(KERN_ERR, skb, false); #endif BUG(); } return skb->data += len; } static inline void *skb_pull_inline(struct sk_buff *skb, unsigned int len) { return unlikely(len > skb->len) ? NULL : __skb_pull(skb, len); } void *skb_pull_data(struct sk_buff *skb, size_t len); void *__pskb_pull_tail(struct sk_buff *skb, int delta); static inline enum skb_drop_reason pskb_may_pull_reason(struct sk_buff *skb, unsigned int len) { if (likely(len <= skb_headlen(skb))) return SKB_NOT_DROPPED_YET; if (unlikely(len > skb->len)) return SKB_DROP_REASON_PKT_TOO_SMALL; if (unlikely(!__pskb_pull_tail(skb, len - skb_headlen(skb)))) return SKB_DROP_REASON_NOMEM; return SKB_NOT_DROPPED_YET; } static inline bool pskb_may_pull(struct sk_buff *skb, unsigned int len) { return pskb_may_pull_reason(skb, len) == SKB_NOT_DROPPED_YET; } static inline void *pskb_pull(struct sk_buff *skb, unsigned int len) { if (!pskb_may_pull(skb, len)) return NULL; skb->len -= len; return skb->data += len; } void skb_condense(struct sk_buff *skb); /** * skb_headroom - bytes at buffer head * @skb: buffer to check * * Return the number of bytes of free space at the head of an &sk_buff. */ static inline unsigned int skb_headroom(const struct sk_buff *skb) { return skb->data - skb->head; } /** * skb_tailroom - bytes at buffer end * @skb: buffer to check * * Return the number of bytes of free space at the tail of an sk_buff */ static inline int skb_tailroom(const struct sk_buff *skb) { return skb_is_nonlinear(skb) ? 0 : skb->end - skb->tail; } /** * skb_availroom - bytes at buffer end * @skb: buffer to check * * Return the number of bytes of free space at the tail of an sk_buff * allocated by sk_stream_alloc() */ static inline int skb_availroom(const struct sk_buff *skb) { if (skb_is_nonlinear(skb)) return 0; return skb->end - skb->tail - skb->reserved_tailroom; } /** * skb_reserve - adjust headroom * @skb: buffer to alter * @len: bytes to move * * Increase the headroom of an empty &sk_buff by reducing the tail * room. This is only allowed for an empty buffer. */ static inline void skb_reserve(struct sk_buff *skb, int len) { skb->data += len; skb->tail += len; } /** * skb_tailroom_reserve - adjust reserved_tailroom * @skb: buffer to alter * @mtu: maximum amount of headlen permitted * @needed_tailroom: minimum amount of reserved_tailroom * * Set reserved_tailroom so that headlen can be as large as possible but * not larger than mtu and tailroom cannot be smaller than * needed_tailroom. * The required headroom should already have been reserved before using * this function. */ static inline void skb_tailroom_reserve(struct sk_buff *skb, unsigned int mtu, unsigned int needed_tailroom) { SKB_LINEAR_ASSERT(skb); if (mtu < skb_tailroom(skb) - needed_tailroom) /* use at most mtu */ skb->reserved_tailroom = skb_tailroom(skb) - mtu; else /* use up to all available space */ skb->reserved_tailroom = needed_tailroom; } #define ENCAP_TYPE_ETHER 0 #define ENCAP_TYPE_IPPROTO 1 static inline void skb_set_inner_protocol(struct sk_buff *skb, __be16 protocol) { skb->inner_protocol = protocol; skb->inner_protocol_type = ENCAP_TYPE_ETHER; } static inline void skb_set_inner_ipproto(struct sk_buff *skb, __u8 ipproto) { skb->inner_ipproto = ipproto; skb->inner_protocol_type = ENCAP_TYPE_IPPROTO; } static inline void skb_reset_inner_headers(struct sk_buff *skb) { skb->inner_mac_header = skb->mac_header; skb->inner_network_header = skb->network_header; skb->inner_transport_header = skb->transport_header; } static inline void skb_reset_mac_len(struct sk_buff *skb) { skb->mac_len = skb->network_header - skb->mac_header; } static inline unsigned char *skb_inner_transport_header(const struct sk_buff *skb) { return skb->head + skb->inner_transport_header; } static inline int skb_inner_transport_offset(const struct sk_buff *skb) { return skb_inner_transport_header(skb) - skb->data; } static inline void skb_reset_inner_transport_header(struct sk_buff *skb) { skb->inner_transport_header = skb->data - skb->head; } static inline void skb_set_inner_transport_header(struct sk_buff *skb, const int offset) { skb_reset_inner_transport_header(skb); skb->inner_transport_header += offset; } static inline unsigned char *skb_inner_network_header(const struct sk_buff *skb) { return skb->head + skb->inner_network_header; } static inline void skb_reset_inner_network_header(struct sk_buff *skb) { skb->inner_network_header = skb->data - skb->head; } static inline void skb_set_inner_network_header(struct sk_buff *skb, const int offset) { skb_reset_inner_network_header(skb); skb->inner_network_header += offset; } static inline unsigned char *skb_inner_mac_header(const struct sk_buff *skb) { return skb->head + skb->inner_mac_header; } static inline void skb_reset_inner_mac_header(struct sk_buff *skb) { skb->inner_mac_header = skb->data - skb->head; } static inline void skb_set_inner_mac_header(struct sk_buff *skb, const int offset) { skb_reset_inner_mac_header(skb); skb->inner_mac_header += offset; } static inline bool skb_transport_header_was_set(const struct sk_buff *skb) { return skb->transport_header != (typeof(skb->transport_header))~0U; } static inline unsigned char *skb_transport_header(const struct sk_buff *skb) { DEBUG_NET_WARN_ON_ONCE(!skb_transport_header_was_set(skb)); return skb->head + skb->transport_header; } static inline void skb_reset_transport_header(struct sk_buff *skb) { skb->transport_header = skb->data - skb->head; } static inline void skb_set_transport_header(struct sk_buff *skb, const int offset) { skb_reset_transport_header(skb); skb->transport_header += offset; } static inline unsigned char *skb_network_header(const struct sk_buff *skb) { return skb->head + skb->network_header; } static inline void skb_reset_network_header(struct sk_buff *skb) { skb->network_header = skb->data - skb->head; } static inline void skb_set_network_header(struct sk_buff *skb, const int offset) { skb_reset_network_header(skb); skb->network_header += offset; } static inline int skb_mac_header_was_set(const struct sk_buff *skb) { return skb->mac_header != (typeof(skb->mac_header))~0U; } static inline unsigned char *skb_mac_header(const struct sk_buff *skb) { DEBUG_NET_WARN_ON_ONCE(!skb_mac_header_was_set(skb)); return skb->head + skb->mac_header; } static inline int skb_mac_offset(const struct sk_buff *skb) { return skb_mac_header(skb) - skb->data; } static inline u32 skb_mac_header_len(const struct sk_buff *skb) { DEBUG_NET_WARN_ON_ONCE(!skb_mac_header_was_set(skb)); return skb->network_header - skb->mac_header; } static inline void skb_unset_mac_header(struct sk_buff *skb) { skb->mac_header = (typeof(skb->mac_header))~0U; } static inline void skb_reset_mac_header(struct sk_buff *skb) { skb->mac_header = skb->data - skb->head; } static inline void skb_set_mac_header(struct sk_buff *skb, const int offset) { skb_reset_mac_header(skb); skb->mac_header += offset; } static inline void skb_pop_mac_header(struct sk_buff *skb) { skb->mac_header = skb->network_header; } static inline void skb_probe_transport_header(struct sk_buff *skb) { struct flow_keys_basic keys; if (skb_transport_header_was_set(skb)) return; if (skb_flow_dissect_flow_keys_basic(NULL, skb, &keys, NULL, 0, 0, 0, 0)) skb_set_transport_header(skb, keys.control.thoff); } static inline void skb_mac_header_rebuild(struct sk_buff *skb) { if (skb_mac_header_was_set(skb)) { const unsigned char *old_mac = skb_mac_header(skb); skb_set_mac_header(skb, -skb->mac_len); memmove(skb_mac_header(skb), old_mac, skb->mac_len); } } static inline int skb_checksum_start_offset(const struct sk_buff *skb) { return skb->csum_start - skb_headroom(skb); } static inline unsigned char *skb_checksum_start(const struct sk_buff *skb) { return skb->head + skb->csum_start; } static inline int skb_transport_offset(const struct sk_buff *skb) { return skb_transport_header(skb) - skb->data; } static inline u32 skb_network_header_len(const struct sk_buff *skb) { return skb->transport_header - skb->network_header; } static inline u32 skb_inner_network_header_len(const struct sk_buff *skb) { return skb->inner_transport_header - skb->inner_network_header; } static inline int skb_network_offset(const struct sk_buff *skb) { return skb_network_header(skb) - skb->data; } static inline int skb_inner_network_offset(const struct sk_buff *skb) { return skb_inner_network_header(skb) - skb->data; } static inline int pskb_network_may_pull(struct sk_buff *skb, unsigned int len) { return pskb_may_pull(skb, skb_network_offset(skb) + len); } /* * CPUs often take a performance hit when accessing unaligned memory * locations. The actual performance hit varies, it can be small if the * hardware handles it or large if we have to take an exception and fix it * in software. * * Since an ethernet header is 14 bytes network drivers often end up with * the IP header at an unaligned offset. The IP header can be aligned by * shifting the start of the packet by 2 bytes. Drivers should do this * with: * * skb_reserve(skb, NET_IP_ALIGN); * * The downside to this alignment of the IP header is that the DMA is now * unaligned. On some architectures the cost of an unaligned DMA is high * and this cost outweighs the gains made by aligning the IP header. * * Since this trade off varies between architectures, we allow NET_IP_ALIGN * to be overridden. */ #ifndef NET_IP_ALIGN #define NET_IP_ALIGN 2 #endif /* * The networking layer reserves some headroom in skb data (via * dev_alloc_skb). This is used to avoid having to reallocate skb data when * the header has to grow. In the default case, if the header has to grow * 32 bytes or less we avoid the reallocation. * * Unfortunately this headroom changes the DMA alignment of the resulting * network packet. As for NET_IP_ALIGN, this unaligned DMA is expensive * on some architectures. An architecture can override this value, * perhaps setting it to a cacheline in size (since that will maintain * cacheline alignment of the DMA). It must be a power of 2. * * Various parts of the networking layer expect at least 32 bytes of * headroom, you should not reduce this. * * Using max(32, L1_CACHE_BYTES) makes sense (especially with RPS) * to reduce average number of cache lines per packet. * get_rps_cpu() for example only access one 64 bytes aligned block : * NET_IP_ALIGN(2) + ethernet_header(14) + IP_header(20/40) + ports(8) */ #ifndef NET_SKB_PAD #define NET_SKB_PAD max(32, L1_CACHE_BYTES) #endif int ___pskb_trim(struct sk_buff *skb, unsigned int len); static inline void __skb_set_length(struct sk_buff *skb, unsigned int len) { if (WARN_ON(skb_is_nonlinear(skb))) return; skb->len = len; skb_set_tail_pointer(skb, len); } static inline void __skb_trim(struct sk_buff *skb, unsigned int len) { __skb_set_length(skb, len); } void skb_trim(struct sk_buff *skb, unsigned int len); static inline int __pskb_trim(struct sk_buff *skb, unsigned int len) { if (skb->data_len) return ___pskb_trim(skb, len); __skb_trim(skb, len); return 0; } static inline int pskb_trim(struct sk_buff *skb, unsigned int len) { return (len < skb->len) ? __pskb_trim(skb, len) : 0; } /** * pskb_trim_unique - remove end from a paged unique (not cloned) buffer * @skb: buffer to alter * @len: new length * * This is identical to pskb_trim except that the caller knows that * the skb is not cloned so we should never get an error due to out- * of-memory. */ static inline void pskb_trim_unique(struct sk_buff *skb, unsigned int len) { int err = pskb_trim(skb, len); BUG_ON(err); } static inline int __skb_grow(struct sk_buff *skb, unsigned int len) { unsigned int diff = len - skb->len; if (skb_tailroom(skb) < diff) { int ret = pskb_expand_head(skb, 0, diff - skb_tailroom(skb), GFP_ATOMIC); if (ret) return ret; } __skb_set_length(skb, len); return 0; } /** * skb_orphan - orphan a buffer * @skb: buffer to orphan * * If a buffer currently has an owner then we call the owner's * destructor function and make the @skb unowned. The buffer continues * to exist but is no longer charged to its former owner. */ static inline void skb_orphan(struct sk_buff *skb) { if (skb->destructor) { skb->destructor(skb); skb->destructor = NULL; skb->sk = NULL; } else { BUG_ON(skb->sk); } } /** * skb_orphan_frags - orphan the frags contained in a buffer * @skb: buffer to orphan frags from * @gfp_mask: allocation mask for replacement pages * * For each frag in the SKB which needs a destructor (i.e. has an * owner) create a copy of that frag and release the original * page by calling the destructor. */ static inline int skb_orphan_frags(struct sk_buff *skb, gfp_t gfp_mask) { if (likely(!skb_zcopy(skb))) return 0; if (skb_shinfo(skb)->flags & SKBFL_DONT_ORPHAN) return 0; return skb_copy_ubufs(skb, gfp_mask); } /* Frags must be orphaned, even if refcounted, if skb might loop to rx path */ static inline int skb_orphan_frags_rx(struct sk_buff *skb, gfp_t gfp_mask) { if (likely(!skb_zcopy(skb))) return 0; return skb_copy_ubufs(skb, gfp_mask); } /** * __skb_queue_purge_reason - empty a list * @list: list to empty * @reason: drop reason * * Delete all buffers on an &sk_buff list. Each buffer is removed from * the list and one reference dropped. This function does not take the * list lock and the caller must hold the relevant locks to use it. */ static inline void __skb_queue_purge_reason(struct sk_buff_head *list, enum skb_drop_reason reason) { struct sk_buff *skb; while ((skb = __skb_dequeue(list)) != NULL) kfree_skb_reason(skb, reason); } static inline void __skb_queue_purge(struct sk_buff_head *list) { __skb_queue_purge_reason(list, SKB_DROP_REASON_QUEUE_PURGE); } void skb_queue_purge_reason(struct sk_buff_head *list, enum skb_drop_reason reason); static inline void skb_queue_purge(struct sk_buff_head *list) { skb_queue_purge_reason(list, SKB_DROP_REASON_QUEUE_PURGE); } unsigned int skb_rbtree_purge(struct rb_root *root); void skb_errqueue_purge(struct sk_buff_head *list); void *__netdev_alloc_frag_align(unsigned int fragsz, unsigned int align_mask); /** * netdev_alloc_frag - allocate a page fragment * @fragsz: fragment size * * Allocates a frag from a page for receive buffer. * Uses GFP_ATOMIC allocations. */ static inline void *netdev_alloc_frag(unsigned int fragsz) { return __netdev_alloc_frag_align(fragsz, ~0u); } static inline void *netdev_alloc_frag_align(unsigned int fragsz, unsigned int align) { WARN_ON_ONCE(!is_power_of_2(align)); return __netdev_alloc_frag_align(fragsz, -align); } struct sk_buff *__netdev_alloc_skb(struct net_device *dev, unsigned int length, gfp_t gfp_mask); /** * netdev_alloc_skb - allocate an skbuff for rx on a specific device * @dev: network device to receive on * @length: length to allocate * * Allocate a new &sk_buff and assign it a usage count of one. The * buffer has unspecified headroom built in. Users should allocate * the headroom they think they need without accounting for the * built in space. The built in space is used for optimisations. * * %NULL is returned if there is no free memory. Although this function * allocates memory it can be called from an interrupt. */ static inline struct sk_buff *netdev_alloc_skb(struct net_device *dev, unsigned int length) { return __netdev_alloc_skb(dev, length, GFP_ATOMIC); } /* legacy helper around __netdev_alloc_skb() */ static inline struct sk_buff *__dev_alloc_skb(unsigned int length, gfp_t gfp_mask) { return __netdev_alloc_skb(NULL, length, gfp_mask); } /* legacy helper around netdev_alloc_skb() */ static inline struct sk_buff *dev_alloc_skb(unsigned int length) { return netdev_alloc_skb(NULL, length); } static inline struct sk_buff *__netdev_alloc_skb_ip_align(struct net_device *dev, unsigned int length, gfp_t gfp) { struct sk_buff *skb = __netdev_alloc_skb(dev, length + NET_IP_ALIGN, gfp); if (NET_IP_ALIGN && skb) skb_reserve(skb, NET_IP_ALIGN); return skb; } static inline struct sk_buff *netdev_alloc_skb_ip_align(struct net_device *dev, unsigned int length) { return __netdev_alloc_skb_ip_align(dev, length, GFP_ATOMIC); } static inline void skb_free_frag(void *addr) { page_frag_free(addr); } void *__napi_alloc_frag_align(unsigned int fragsz, unsigned int align_mask); static inline void *napi_alloc_frag(unsigned int fragsz) { return __napi_alloc_frag_align(fragsz, ~0u); } static inline void *napi_alloc_frag_align(unsigned int fragsz, unsigned int align) { WARN_ON_ONCE(!is_power_of_2(align)); return __napi_alloc_frag_align(fragsz, -align); } struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int length, gfp_t gfp_mask); static inline struct sk_buff *napi_alloc_skb(struct napi_struct *napi, unsigned int length) { return __napi_alloc_skb(napi, length, GFP_ATOMIC); } void napi_consume_skb(struct sk_buff *skb, int budget); void napi_skb_free_stolen_head(struct sk_buff *skb); void __napi_kfree_skb(struct sk_buff *skb, enum skb_drop_reason reason); /** * __dev_alloc_pages - allocate page for network Rx * @gfp_mask: allocation priority. Set __GFP_NOMEMALLOC if not for network Rx * @order: size of the allocation * * Allocate a new page. * * %NULL is returned if there is no free memory. */ static inline struct page *__dev_alloc_pages(gfp_t gfp_mask, unsigned int order) { /* This piece of code contains several assumptions. * 1. This is for device Rx, therefore a cold page is preferred. * 2. The expectation is the user wants a compound page. * 3. If requesting a order 0 page it will not be compound * due to the check to see if order has a value in prep_new_page * 4. __GFP_MEMALLOC is ignored if __GFP_NOMEMALLOC is set due to * code in gfp_to_alloc_flags that should be enforcing this. */ gfp_mask |= __GFP_COMP | __GFP_MEMALLOC; return alloc_pages_node(NUMA_NO_NODE, gfp_mask, order); } static inline struct page *dev_alloc_pages(unsigned int order) { return __dev_alloc_pages(GFP_ATOMIC | __GFP_NOWARN, order); } /** * __dev_alloc_page - allocate a page for network Rx * @gfp_mask: allocation priority. Set __GFP_NOMEMALLOC if not for network Rx * * Allocate a new page. * * %NULL is returned if there is no free memory. */ static inline struct page *__dev_alloc_page(gfp_t gfp_mask) { return __dev_alloc_pages(gfp_mask, 0); } static inline struct page *dev_alloc_page(void) { return dev_alloc_pages(0); } /** * dev_page_is_reusable - check whether a page can be reused for network Rx * @page: the page to test * * A page shouldn't be considered for reusing/recycling if it was allocated * under memory pressure or at a distant memory node. * * Returns false if this page should be returned to page allocator, true * otherwise. */ static inline bool dev_page_is_reusable(const struct page *page) { return likely(page_to_nid(page) == numa_mem_id() && !page_is_pfmemalloc(page)); } /** * skb_propagate_pfmemalloc - Propagate pfmemalloc if skb is allocated after RX page * @page: The page that was allocated from skb_alloc_page * @skb: The skb that may need pfmemalloc set */ static inline void skb_propagate_pfmemalloc(const struct page *page, struct sk_buff *skb) { if (page_is_pfmemalloc(page)) skb->pfmemalloc = true; } /** * skb_frag_off() - Returns the offset of a skb fragment * @frag: the paged fragment */ static inline unsigned int skb_frag_off(const skb_frag_t *frag) { return frag->bv_offset; } /** * skb_frag_off_add() - Increments the offset of a skb fragment by @delta * @frag: skb fragment * @delta: value to add */ static inline void skb_frag_off_add(skb_frag_t *frag, int delta) { frag->bv_offset += delta; } /** * skb_frag_off_set() - Sets the offset of a skb fragment * @frag: skb fragment * @offset: offset of fragment */ static inline void skb_frag_off_set(skb_frag_t *frag, unsigned int offset) { frag->bv_offset = offset; } /** * skb_frag_off_copy() - Sets the offset of a skb fragment from another fragment * @fragto: skb fragment where offset is set * @fragfrom: skb fragment offset is copied from */ static inline void skb_frag_off_copy(skb_frag_t *fragto, const skb_frag_t *fragfrom) { fragto->bv_offset = fragfrom->bv_offset; } /** * skb_frag_page - retrieve the page referred to by a paged fragment * @frag: the paged fragment * * Returns the &struct page associated with @frag. */ static inline struct page *skb_frag_page(const skb_frag_t *frag) { return frag->bv_page; } /** * __skb_frag_ref - take an addition reference on a paged fragment. * @frag: the paged fragment * * Takes an additional reference on the paged fragment @frag. */ static inline void __skb_frag_ref(skb_frag_t *frag) { get_page(skb_frag_page(frag)); } /** * skb_frag_ref - take an addition reference on a paged fragment of an skb. * @skb: the buffer * @f: the fragment offset. * * Takes an additional reference on the @f'th paged fragment of @skb. */ static inline void skb_frag_ref(struct sk_buff *skb, int f) { __skb_frag_ref(&skb_shinfo(skb)->frags[f]); } bool napi_pp_put_page(struct page *page, bool napi_safe); static inline void napi_frag_unref(skb_frag_t *frag, bool recycle, bool napi_safe) { struct page *page = skb_frag_page(frag); #ifdef CONFIG_PAGE_POOL if (recycle && napi_pp_put_page(page, napi_safe)) return; #endif put_page(page); } /** * __skb_frag_unref - release a reference on a paged fragment. * @frag: the paged fragment * @recycle: recycle the page if allocated via page_pool * * Releases a reference on the paged fragment @frag * or recycles the page via the page_pool API. */ static inline void __skb_frag_unref(skb_frag_t *frag, bool recycle) { napi_frag_unref(frag, recycle, false); } /** * skb_frag_unref - release a reference on a paged fragment of an skb. * @skb: the buffer * @f: the fragment offset * * Releases a reference on the @f'th paged fragment of @skb. */ static inline void skb_frag_unref(struct sk_buff *skb, int f) { struct skb_shared_info *shinfo = skb_shinfo(skb); if (!skb_zcopy_managed(skb)) __skb_frag_unref(&shinfo->frags[f], skb->pp_recycle); } /** * skb_frag_address - gets the address of the data contained in a paged fragment * @frag: the paged fragment buffer * * Returns the address of the data within @frag. The page must already * be mapped. */ static inline void *skb_frag_address(const skb_frag_t *frag) { return page_address(skb_frag_page(frag)) + skb_frag_off(frag); } /** * skb_frag_address_safe - gets the address of the data contained in a paged fragment * @frag: the paged fragment buffer * * Returns the address of the data within @frag. Checks that the page * is mapped and returns %NULL otherwise. */ static inline void *skb_frag_address_safe(const skb_frag_t *frag) { void *ptr = page_address(skb_frag_page(frag)); if (unlikely(!ptr)) return NULL; return ptr + skb_frag_off(frag); } /** * skb_frag_page_copy() - sets the page in a fragment from another fragment * @fragto: skb fragment where page is set * @fragfrom: skb fragment page is copied from */ static inline void skb_frag_page_copy(skb_frag_t *fragto, const skb_frag_t *fragfrom) { fragto->bv_page = fragfrom->bv_page; } bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t prio); /** * skb_frag_dma_map - maps a paged fragment via the DMA API * @dev: the device to map the fragment to * @frag: the paged fragment to map * @offset: the offset within the fragment (starting at the * fragment's own offset) * @size: the number of bytes to map * @dir: the direction of the mapping (``PCI_DMA_*``) * * Maps the page associated with @frag to @device. */ static inline dma_addr_t skb_frag_dma_map(struct device *dev, const skb_frag_t *frag, size_t offset, size_t size, enum dma_data_direction dir) { return dma_map_page(dev, skb_frag_page(frag), skb_frag_off(frag) + offset, size, dir); } static inline struct sk_buff *pskb_copy(struct sk_buff *skb, gfp_t gfp_mask) { return __pskb_copy(skb, skb_headroom(skb), gfp_mask); } static inline struct sk_buff *pskb_copy_for_clone(struct sk_buff *skb, gfp_t gfp_mask) { return __pskb_copy_fclone(skb, skb_headroom(skb), gfp_mask, true); } /** * skb_clone_writable - is the header of a clone writable * @skb: buffer to check * @len: length up to which to write * * Returns true if modifying the header part of the cloned buffer * does not requires the data to be copied. */ static inline int skb_clone_writable(const struct sk_buff *skb, unsigned int len) { return !skb_header_cloned(skb) && skb_headroom(skb) + len <= skb->hdr_len; } static inline int skb_try_make_writable(struct sk_buff *skb, unsigned int write_len) { return skb_cloned(skb) && !skb_clone_writable(skb, write_len) && pskb_expand_head(skb, 0, 0, GFP_ATOMIC); } static inline int __skb_cow(struct sk_buff *skb, unsigned int headroom, int cloned) { int delta = 0; if (headroom > skb_headroom(skb)) delta = headroom - skb_headroom(skb); if (delta || cloned) return pskb_expand_head(skb, ALIGN(delta, NET_SKB_PAD), 0, GFP_ATOMIC); return 0; } /** * skb_cow - copy header of skb when it is required * @skb: buffer to cow * @headroom: needed headroom * * If the skb passed lacks sufficient headroom or its data part * is shared, data is reallocated. If reallocation fails, an error * is returned and original skb is not changed. * * The result is skb with writable area skb->head...skb->tail * and at least @headroom of space at head. */ static inline int skb_cow(struct sk_buff *skb, unsigned int headroom) { return __skb_cow(skb, headroom, skb_cloned(skb)); } /** * skb_cow_head - skb_cow but only making the head writable * @skb: buffer to cow * @headroom: needed headroom * * This function is identical to skb_cow except that we replace the * skb_cloned check by skb_header_cloned. It should be used when * you only need to push on some header and do not need to modify * the data. */ static inline int skb_cow_head(struct sk_buff *skb, unsigned int headroom) { return __skb_cow(skb, headroom, skb_header_cloned(skb)); } /** * skb_padto - pad an skbuff up to a minimal size * @skb: buffer to pad * @len: minimal length * * Pads up a buffer to ensure the trailing bytes exist and are * blanked. If the buffer already contains sufficient data it * is untouched. Otherwise it is extended. Returns zero on * success. The skb is freed on error. */ static inline int skb_padto(struct sk_buff *skb, unsigned int len) { unsigned int size = skb->len; if (likely(size >= len)) return 0; return skb_pad(skb, len - size); } /** * __skb_put_padto - increase size and pad an skbuff up to a minimal size * @skb: buffer to pad * @len: minimal length * @free_on_error: free buffer on error * * Pads up a buffer to ensure the trailing bytes exist and are * blanked. If the buffer already contains sufficient data it * is untouched. Otherwise it is extended. Returns zero on * success. The skb is freed on error if @free_on_error is true. */ static inline int __must_check __skb_put_padto(struct sk_buff *skb, unsigned int len, bool free_on_error) { unsigned int size = skb->len; if (unlikely(size < len)) { len -= size; if (__skb_pad(skb, len, free_on_error)) return -ENOMEM; __skb_put(skb, len); } return 0; } /** * skb_put_padto - increase size and pad an skbuff up to a minimal size * @skb: buffer to pad * @len: minimal length * * Pads up a buffer to ensure the trailing bytes exist and are * blanked. If the buffer already contains sufficient data it * is untouched. Otherwise it is extended. Returns zero on * success. The skb is freed on error. */ static inline int __must_check skb_put_padto(struct sk_buff *skb, unsigned int len) { return __skb_put_padto(skb, len, true); } bool csum_and_copy_from_iter_full(void *addr, size_t bytes, __wsum *csum, struct iov_iter *i) __must_check; static inline int skb_add_data(struct sk_buff *skb, struct iov_iter *from, int copy) { const int off = skb->len; if (skb->ip_summed == CHECKSUM_NONE) { __wsum csum = 0; if (csum_and_copy_from_iter_full(skb_put(skb, copy), copy, &csum, from)) { skb->csum = csum_block_add(skb->csum, csum, off); return 0; } } else if (copy_from_iter_full(skb_put(skb, copy), copy, from)) return 0; __skb_trim(skb, off); return -EFAULT; } static inline bool skb_can_coalesce(struct sk_buff *skb, int i, const struct page *page, int off) { if (skb_zcopy(skb)) return false; if (i) { const skb_frag_t *frag = &skb_shinfo(skb)->frags[i - 1]; return page == skb_frag_page(frag) && off == skb_frag_off(frag) + skb_frag_size(frag); } return false; } static inline int __skb_linearize(struct sk_buff *skb) { return __pskb_pull_tail(skb, skb->data_len) ? 0 : -ENOMEM; } /** * skb_linearize - convert paged skb to linear one * @skb: buffer to linarize * * If there is no free memory -ENOMEM is returned, otherwise zero * is returned and the old skb data released. */ static inline int skb_linearize(struct sk_buff *skb) { return skb_is_nonlinear(skb) ? __skb_linearize(skb) : 0; } /** * skb_has_shared_frag - can any frag be overwritten * @skb: buffer to test * * Return true if the skb has at least one frag that might be modified * by an external entity (as in vmsplice()/sendfile()) */ static inline bool skb_has_shared_frag(const struct sk_buff *skb) { return skb_is_nonlinear(skb) && skb_shinfo(skb)->flags & SKBFL_SHARED_FRAG; } /** * skb_linearize_cow - make sure skb is linear and writable * @skb: buffer to process * * If there is no free memory -ENOMEM is returned, otherwise zero * is returned and the old skb data released. */ static inline int skb_linearize_cow(struct sk_buff *skb) { return skb_is_nonlinear(skb) || skb_cloned(skb) ? __skb_linearize(skb) : 0; } static __always_inline void __skb_postpull_rcsum(struct sk_buff *skb, const void *start, unsigned int len, unsigned int off) { if (skb->ip_summed == CHECKSUM_COMPLETE) skb->csum = csum_block_sub(skb->csum, csum_partial(start, len, 0), off); else if (skb->ip_summed == CHECKSUM_PARTIAL && skb_checksum_start_offset(skb) < 0) skb->ip_summed = CHECKSUM_NONE; } /** * skb_postpull_rcsum - update checksum for received skb after pull * @skb: buffer to update * @start: start of data before pull * @len: length of data pulled * * After doing a pull on a received packet, you need to call this to * update the CHECKSUM_COMPLETE checksum, or set ip_summed to * CHECKSUM_NONE so that it can be recomputed from scratch. */ static inline void skb_postpull_rcsum(struct sk_buff *skb, const void *start, unsigned int len) { if (skb->ip_summed == CHECKSUM_COMPLETE) skb->csum = wsum_negate(csum_partial(start, len, wsum_negate(skb->csum))); else if (skb->ip_summed == CHECKSUM_PARTIAL && skb_checksum_start_offset(skb) < 0) skb->ip_summed = CHECKSUM_NONE; } static __always_inline void __skb_postpush_rcsum(struct sk_buff *skb, const void *start, unsigned int len, unsigned int off) { if (skb->ip_summed == CHECKSUM_COMPLETE) skb->csum = csum_block_add(skb->csum, csum_partial(start, len, 0), off); } /** * skb_postpush_rcsum - update checksum for received skb after push * @skb: buffer to update * @start: start of data after push * @len: length of data pushed * * After doing a push on a received packet, you need to call this to * update the CHECKSUM_COMPLETE checksum. */ static inline void skb_postpush_rcsum(struct sk_buff *skb, const void *start, unsigned int len) { __skb_postpush_rcsum(skb, start, len, 0); } void *skb_pull_rcsum(struct sk_buff *skb, unsigned int len); /** * skb_push_rcsum - push skb and update receive checksum * @skb: buffer to update * @len: length of data pulled * * This function performs an skb_push on the packet and updates * the CHECKSUM_COMPLETE checksum. It should be used on * receive path processing instead of skb_push unless you know * that the checksum difference is zero (e.g., a valid IP header) * or you are setting ip_summed to CHECKSUM_NONE. */ static inline void *skb_push_rcsum(struct sk_buff *skb, unsigned int len) { skb_push(skb, len); skb_postpush_rcsum(skb, skb->data, len); return skb->data; } int pskb_trim_rcsum_slow(struct sk_buff *skb, unsigned int len); /** * pskb_trim_rcsum - trim received skb and update checksum * @skb: buffer to trim * @len: new length * * This is exactly the same as pskb_trim except that it ensures the * checksum of received packets are still valid after the operation. * It can change skb pointers. */ static inline int pskb_trim_rcsum(struct sk_buff *skb, unsigned int len) { if (likely(len >= skb->len)) return 0; return pskb_trim_rcsum_slow(skb, len); } static inline int __skb_trim_rcsum(struct sk_buff *skb, unsigned int len) { if (skb->ip_summed == CHECKSUM_COMPLETE) skb->ip_summed = CHECKSUM_NONE; __skb_trim(skb, len); return 0; } static inline int __skb_grow_rcsum(struct sk_buff *skb, unsigned int len) { if (skb->ip_summed == CHECKSUM_COMPLETE) skb->ip_summed = CHECKSUM_NONE; return __skb_grow(skb, len); } #define rb_to_skb(rb) rb_entry_safe(rb, struct sk_buff, rbnode) #define skb_rb_first(root) rb_to_skb(rb_first(root)) #define skb_rb_last(root) rb_to_skb(rb_last(root)) #define skb_rb_next(skb) rb_to_skb(rb_next(&(skb)->rbnode)) #define skb_rb_prev(skb) rb_to_skb(rb_prev(&(skb)->rbnode)) #define skb_queue_walk(queue, skb) \ for (skb = (queue)->next; \ skb != (struct sk_buff *)(queue); \ skb = skb->next) #define skb_queue_walk_safe(queue, skb, tmp) \ for (skb = (queue)->next, tmp = skb->next; \ skb != (struct sk_buff *)(queue); \ skb = tmp, tmp = skb->next) #define skb_queue_walk_from(queue, skb) \ for (; skb != (struct sk_buff *)(queue); \ skb = skb->next) #define skb_rbtree_walk(skb, root) \ for (skb = skb_rb_first(root); skb != NULL; \ skb = skb_rb_next(skb)) #define skb_rbtree_walk_from(skb) \ for (; skb != NULL; \ skb = skb_rb_next(skb)) #define skb_rbtree_walk_from_safe(skb, tmp) \ for (; tmp = skb ? skb_rb_next(skb) : NULL, (skb != NULL); \ skb = tmp) #define skb_queue_walk_from_safe(queue, skb, tmp) \ for (tmp = skb->next; \ skb != (struct sk_buff *)(queue); \ skb = tmp, tmp = skb->next) #define skb_queue_reverse_walk(queue, skb) \ for (skb = (queue)->prev; \ skb != (struct sk_buff *)(queue); \ skb = skb->prev) #define skb_queue_reverse_walk_safe(queue, skb, tmp) \ for (skb = (queue)->prev, tmp = skb->prev; \ skb != (struct sk_buff *)(queue); \ skb = tmp, tmp = skb->prev) #define skb_queue_reverse_walk_from_safe(queue, skb, tmp) \ for (tmp = skb->prev; \ skb != (struct sk_buff *)(queue); \ skb = tmp, tmp = skb->prev) static inline bool skb_has_frag_list(const struct sk_buff *skb) { return skb_shinfo(skb)->frag_list != NULL; } static inline void skb_frag_list_init(struct sk_buff *skb) { skb_shinfo(skb)->frag_list = NULL; } #define skb_walk_frags(skb, iter) \ for (iter = skb_shinfo(skb)->frag_list; iter; iter = iter->next) int __skb_wait_for_more_packets(struct sock *sk, struct sk_buff_head *queue, int *err, long *timeo_p, const struct sk_buff *skb); struct sk_buff *__skb_try_recv_from_queue(struct sock *sk, struct sk_buff_head *queue, unsigned int flags, int *off, int *err, struct sk_buff **last); struct sk_buff *__skb_try_recv_datagram(struct sock *sk, struct sk_buff_head *queue, unsigned int flags, int *off, int *err, struct sk_buff **last); struct sk_buff *__skb_recv_datagram(struct sock *sk, struct sk_buff_head *sk_queue, unsigned int flags, int *off, int *err); struct sk_buff *skb_recv_datagram(struct sock *sk, unsigned int flags, int *err); __poll_t datagram_poll(struct file *file, struct socket *sock, struct poll_table_struct *wait); int skb_copy_datagram_iter(const struct sk_buff *from, int offset, struct iov_iter *to, int size); static inline int skb_copy_datagram_msg(const struct sk_buff *from, int offset, struct msghdr *msg, int size) { return skb_copy_datagram_iter(from, offset, &msg->msg_iter, size); } int skb_copy_and_csum_datagram_msg(struct sk_buff *skb, int hlen, struct msghdr *msg); int skb_copy_and_hash_datagram_iter(const struct sk_buff *skb, int offset, struct iov_iter *to, int len, struct ahash_request *hash); int skb_copy_datagram_from_iter(struct sk_buff *skb, int offset, struct iov_iter *from, int len); int zerocopy_sg_from_iter(struct sk_buff *skb, struct iov_iter *frm); void skb_free_datagram(struct sock *sk, struct sk_buff *skb); void __skb_free_datagram_locked(struct sock *sk, struct sk_buff *skb, int len); static inline void skb_free_datagram_locked(struct sock *sk, struct sk_buff *skb) { __skb_free_datagram_locked(sk, skb, 0); } int skb_kill_datagram(struct sock *sk, struct sk_buff *skb, unsigned int flags); int skb_copy_bits(const struct sk_buff *skb, int offset, void *to, int len); int skb_store_bits(struct sk_buff *skb, int offset, const void *from, int len); __wsum skb_copy_and_csum_bits(const struct sk_buff *skb, int offset, u8 *to, int len); int skb_splice_bits(struct sk_buff *skb, struct sock *sk, unsigned int offset, struct pipe_inode_info *pipe, unsigned int len, unsigned int flags); int skb_send_sock_locked(struct sock *sk, struct sk_buff *skb, int offset, int len); int skb_send_sock(struct sock *sk, struct sk_buff *skb, int offset, int len); void skb_copy_and_csum_dev(const struct sk_buff *skb, u8 *to); unsigned int skb_zerocopy_headlen(const struct sk_buff *from); int skb_zerocopy(struct sk_buff *to, struct sk_buff *from, int len, int hlen); void skb_split(struct sk_buff *skb, struct sk_buff *skb1, const u32 len); int skb_shift(struct sk_buff *tgt, struct sk_buff *skb, int shiftlen); void skb_scrub_packet(struct sk_buff *skb, bool xnet); struct sk_buff *skb_segment(struct sk_buff *skb, netdev_features_t features); struct sk_buff *skb_segment_list(struct sk_buff *skb, netdev_features_t features, unsigned int offset); struct sk_buff *skb_vlan_untag(struct sk_buff *skb); int skb_ensure_writable(struct sk_buff *skb, unsigned int write_len); int skb_ensure_writable_head_tail(struct sk_buff *skb, struct net_device *dev); int __skb_vlan_pop(struct sk_buff *skb, u16 *vlan_tci); int skb_vlan_pop(struct sk_buff *skb); int skb_vlan_push(struct sk_buff *skb, __be16 vlan_proto, u16 vlan_tci); int skb_eth_pop(struct sk_buff *skb); int skb_eth_push(struct sk_buff *skb, const unsigned char *dst, const unsigned char *src); int skb_mpls_push(struct sk_buff *skb, __be32 mpls_lse, __be16 mpls_proto, int mac_len, bool ethernet); int skb_mpls_pop(struct sk_buff *skb, __be16 next_proto, int mac_len, bool ethernet); int skb_mpls_update_lse(struct sk_buff *skb, __be32 mpls_lse); int skb_mpls_dec_ttl(struct sk_buff *skb); struct sk_buff *pskb_extract(struct sk_buff *skb, int off, int to_copy, gfp_t gfp); static inline int memcpy_from_msg(void *data, struct msghdr *msg, int len) { return copy_from_iter_full(data, len, &msg->msg_iter) ? 0 : -EFAULT; } static inline int memcpy_to_msg(struct msghdr *msg, void *data, int len) { return copy_to_iter(data, len, &msg->msg_iter) == len ? 0 : -EFAULT; } struct skb_checksum_ops { __wsum (*update)(const void *mem, int len, __wsum wsum); __wsum (*combine)(__wsum csum, __wsum csum2, int offset, int len); }; extern const struct skb_checksum_ops *crc32c_csum_stub __read_mostly; __wsum __skb_checksum(const struct sk_buff *skb, int offset, int len, __wsum csum, const struct skb_checksum_ops *ops); __wsum skb_checksum(const struct sk_buff *skb, int offset, int len, __wsum csum); static inline void * __must_check __skb_header_pointer(const struct sk_buff *skb, int offset, int len, const void *data, int hlen, void *buffer) { if (likely(hlen - offset >= len)) return (void *)data + offset; if (!skb || unlikely(skb_copy_bits(skb, offset, buffer, len) < 0)) return NULL; return buffer; } static inline void * __must_check skb_header_pointer(const struct sk_buff *skb, int offset, int len, void *buffer) { return __skb_header_pointer(skb, offset, len, skb->data, skb_headlen(skb), buffer); } static inline void * __must_check skb_pointer_if_linear(const struct sk_buff *skb, int offset, int len) { if (likely(skb_headlen(skb) - offset >= len)) return skb->data + offset; return NULL; } /** * skb_needs_linearize - check if we need to linearize a given skb * depending on the given device features. * @skb: socket buffer to check * @features: net device features * * Returns true if either: * 1. skb has frag_list and the device doesn't support FRAGLIST, or * 2. skb is fragmented and the device does not support SG. */ static inline bool skb_needs_linearize(struct sk_buff *skb, netdev_features_t features) { return skb_is_nonlinear(skb) && ((skb_has_frag_list(skb) && !(features & NETIF_F_FRAGLIST)) || (skb_shinfo(skb)->nr_frags && !(features & NETIF_F_SG))); } static inline void skb_copy_from_linear_data(const struct sk_buff *skb, void *to, const unsigned int len) { memcpy(to, skb->data, len); } static inline void skb_copy_from_linear_data_offset(const struct sk_buff *skb, const int offset, void *to, const unsigned int len) { memcpy(to, skb->data + offset, len); } static inline void skb_copy_to_linear_data(struct sk_buff *skb, const void *from, const unsigned int len) { memcpy(skb->data, from, len); } static inline void skb_copy_to_linear_data_offset(struct sk_buff *skb, const int offset, const void *from, const unsigned int len) { memcpy(skb->data + offset, from, len); } void skb_init(void); static inline ktime_t skb_get_ktime(const struct sk_buff *skb) { return skb->tstamp; } /** * skb_get_timestamp - get timestamp from a skb * @skb: skb to get stamp from * @stamp: pointer to struct __kernel_old_timeval to store stamp in * * Timestamps are stored in the skb as offsets to a base timestamp. * This function converts the offset back to a struct timeval and stores * it in stamp. */ static inline void skb_get_timestamp(const struct sk_buff *skb, struct __kernel_old_timeval *stamp) { *stamp = ns_to_kernel_old_timeval(skb->tstamp); } static inline void skb_get_new_timestamp(const struct sk_buff *skb, struct __kernel_sock_timeval *stamp) { struct timespec64 ts = ktime_to_timespec64(skb->tstamp); stamp->tv_sec = ts.tv_sec; stamp->tv_usec = ts.tv_nsec / 1000; } static inline void skb_get_timestampns(const struct sk_buff *skb, struct __kernel_old_timespec *stamp) { struct timespec64 ts = ktime_to_timespec64(skb->tstamp); stamp->tv_sec = ts.tv_sec; stamp->tv_nsec = ts.tv_nsec; } static inline void skb_get_new_timestampns(const struct sk_buff *skb, struct __kernel_timespec *stamp) { struct timespec64 ts = ktime_to_timespec64(skb->tstamp); stamp->tv_sec = ts.tv_sec; stamp->tv_nsec = ts.tv_nsec; } static inline void __net_timestamp(struct sk_buff *skb) { skb->tstamp = ktime_get_real(); skb->mono_delivery_time = 0; } static inline ktime_t net_timedelta(ktime_t t) { return ktime_sub(ktime_get_real(), t); } static inline void skb_set_delivery_time(struct sk_buff *skb, ktime_t kt, bool mono) { skb->tstamp = kt; skb->mono_delivery_time = kt && mono; } DECLARE_STATIC_KEY_FALSE(netstamp_needed_key); /* It is used in the ingress path to clear the delivery_time. * If needed, set the skb->tstamp to the (rcv) timestamp. */ static inline void skb_clear_delivery_time(struct sk_buff *skb) { if (skb->mono_delivery_time) { skb->mono_delivery_time = 0; if (static_branch_unlikely(&netstamp_needed_key)) skb->tstamp = ktime_get_real(); else skb->tstamp = 0; } } static inline void skb_clear_tstamp(struct sk_buff *skb) { if (skb->mono_delivery_time) return; skb->tstamp = 0; } static inline ktime_t skb_tstamp(const struct sk_buff *skb) { if (skb->mono_delivery_time) return 0; return skb->tstamp; } static inline ktime_t skb_tstamp_cond(const struct sk_buff *skb, bool cond) { if (!skb->mono_delivery_time && skb->tstamp) return skb->tstamp; if (static_branch_unlikely(&netstamp_needed_key) || cond) return ktime_get_real(); return 0; } static inline u8 skb_metadata_len(const struct sk_buff *skb) { return skb_shinfo(skb)->meta_len; } static inline void *skb_metadata_end(const struct sk_buff *skb) { return skb_mac_header(skb); } static inline bool __skb_metadata_differs(const struct sk_buff *skb_a, const struct sk_buff *skb_b, u8 meta_len) { const void *a = skb_metadata_end(skb_a); const void *b = skb_metadata_end(skb_b); u64 diffs = 0; if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) || BITS_PER_LONG != 64) goto slow; /* Using more efficient variant than plain call to memcmp(). */ switch (meta_len) { #define __it(x, op) (x -= sizeof(u##op)) #define __it_diff(a, b, op) (*(u##op *)__it(a, op)) ^ (*(u##op *)__it(b, op)) case 32: diffs |= __it_diff(a, b, 64); fallthrough; case 24: diffs |= __it_diff(a, b, 64); fallthrough; case 16: diffs |= __it_diff(a, b, 64); fallthrough; case 8: diffs |= __it_diff(a, b, 64); break; case 28: diffs |= __it_diff(a, b, 64); fallthrough; case 20: diffs |= __it_diff(a, b, 64); fallthrough; case 12: diffs |= __it_diff(a, b, 64); fallthrough; case 4: diffs |= __it_diff(a, b, 32); break; default: slow: return memcmp(a - meta_len, b - meta_len, meta_len); } return diffs; } static inline bool skb_metadata_differs(const struct sk_buff *skb_a, const struct sk_buff *skb_b) { u8 len_a = skb_metadata_len(skb_a); u8 len_b = skb_metadata_len(skb_b); if (!(len_a | len_b)) return false; return len_a != len_b ? true : __skb_metadata_differs(skb_a, skb_b, len_a); } static inline void skb_metadata_set(struct sk_buff *skb, u8 meta_len) { skb_shinfo(skb)->meta_len = meta_len; } static inline void skb_metadata_clear(struct sk_buff *skb) { skb_metadata_set(skb, 0); } struct sk_buff *skb_clone_sk(struct sk_buff *skb); #ifdef CONFIG_NETWORK_PHY_TIMESTAMPING void skb_clone_tx_timestamp(struct sk_buff *skb); bool skb_defer_rx_timestamp(struct sk_buff *skb); #else /* CONFIG_NETWORK_PHY_TIMESTAMPING */ static inline void skb_clone_tx_timestamp(struct sk_buff *skb) { } static inline bool skb_defer_rx_timestamp(struct sk_buff *skb) { return false; } #endif /* !CONFIG_NETWORK_PHY_TIMESTAMPING */ /** * skb_complete_tx_timestamp() - deliver cloned skb with tx timestamps * * PHY drivers may accept clones of transmitted packets for * timestamping via their phy_driver.txtstamp method. These drivers * must call this function to return the skb back to the stack with a * timestamp. * * @skb: clone of the original outgoing packet * @hwtstamps: hardware time stamps * */ void skb_complete_tx_timestamp(struct sk_buff *skb, struct skb_shared_hwtstamps *hwtstamps); void __skb_tstamp_tx(struct sk_buff *orig_skb, const struct sk_buff *ack_skb, struct skb_shared_hwtstamps *hwtstamps, struct sock *sk, int tstype); /** * skb_tstamp_tx - queue clone of skb with send time stamps * @orig_skb: the original outgoing packet * @hwtstamps: hardware time stamps, may be NULL if not available * * If the skb has a socket associated, then this function clones the * skb (thus sharing the actual data and optional structures), stores * the optional hardware time stamping information (if non NULL) or * generates a software time stamp (otherwise), then queues the clone * to the error queue of the socket. Errors are silently ignored. */ void skb_tstamp_tx(struct sk_buff *orig_skb, struct skb_shared_hwtstamps *hwtstamps); /** * skb_tx_timestamp() - Driver hook for transmit timestamping * * Ethernet MAC Drivers should call this function in their hard_xmit() * function immediately before giving the sk_buff to the MAC hardware. * * Specifically, one should make absolutely sure that this function is * called before TX completion of this packet can trigger. Otherwise * the packet could potentially already be freed. * * @skb: A socket buffer. */ static inline void skb_tx_timestamp(struct sk_buff *skb) { skb_clone_tx_timestamp(skb); if (skb_shinfo(skb)->tx_flags & SKBTX_SW_TSTAMP) skb_tstamp_tx(skb, NULL); } /** * skb_complete_wifi_ack - deliver skb with wifi status * * @skb: the original outgoing packet * @acked: ack status * */ void skb_complete_wifi_ack(struct sk_buff *skb, bool acked); __sum16 __skb_checksum_complete_head(struct sk_buff *skb, int len); __sum16 __skb_checksum_complete(struct sk_buff *skb); static inline int skb_csum_unnecessary(const struct sk_buff *skb) { return ((skb->ip_summed == CHECKSUM_UNNECESSARY) || skb->csum_valid || (skb->ip_summed == CHECKSUM_PARTIAL && skb_checksum_start_offset(skb) >= 0)); } /** * skb_checksum_complete - Calculate checksum of an entire packet * @skb: packet to process * * This function calculates the checksum over the entire packet plus * the value of skb->csum. The latter can be used to supply the * checksum of a pseudo header as used by TCP/UDP. It returns the * checksum. * * For protocols that contain complete checksums such as ICMP/TCP/UDP, * this function can be used to verify that checksum on received * packets. In that case the function should return zero if the * checksum is correct. In particular, this function will return zero * if skb->ip_summed is CHECKSUM_UNNECESSARY which indicates that the * hardware has already verified the correctness of the checksum. */ static inline __sum16 skb_checksum_complete(struct sk_buff *skb) { return skb_csum_unnecessary(skb) ? 0 : __skb_checksum_complete(skb); } static inline void __skb_decr_checksum_unnecessary(struct sk_buff *skb) { if (skb->ip_summed == CHECKSUM_UNNECESSARY) { if (skb->csum_level == 0) skb->ip_summed = CHECKSUM_NONE; else skb->csum_level--; } } static inline void __skb_incr_checksum_unnecessary(struct sk_buff *skb) { if (skb->ip_summed == CHECKSUM_UNNECESSARY) { if (skb->csum_level < SKB_MAX_CSUM_LEVEL) skb->csum_level++; } else if (skb->ip_summed == CHECKSUM_NONE) { skb->ip_summed = CHECKSUM_UNNECESSARY; skb->csum_level = 0; } } static inline void __skb_reset_checksum_unnecessary(struct sk_buff *skb) { if (skb->ip_summed == CHECKSUM_UNNECESSARY) { skb->ip_summed = CHECKSUM_NONE; skb->csum_level = 0; } } /* Check if we need to perform checksum complete validation. * * Returns true if checksum complete is needed, false otherwise * (either checksum is unnecessary or zero checksum is allowed). */ static inline bool __skb_checksum_validate_needed(struct sk_buff *skb, bool zero_okay, __sum16 check) { if (skb_csum_unnecessary(skb) || (zero_okay && !check)) { skb->csum_valid = 1; __skb_decr_checksum_unnecessary(skb); return false; } return true; } /* For small packets <= CHECKSUM_BREAK perform checksum complete directly * in checksum_init. */ #define CHECKSUM_BREAK 76 /* Unset checksum-complete * * Unset checksum complete can be done when packet is being modified * (uncompressed for instance) and checksum-complete value is * invalidated. */ static inline void skb_checksum_complete_unset(struct sk_buff *skb) { if (skb->ip_summed == CHECKSUM_COMPLETE) skb->ip_summed = CHECKSUM_NONE; } /* Validate (init) checksum based on checksum complete. * * Return values: * 0: checksum is validated or try to in skb_checksum_complete. In the latter * case the ip_summed will not be CHECKSUM_UNNECESSARY and the pseudo * checksum is stored in skb->csum for use in __skb_checksum_complete * non-zero: value of invalid checksum * */ static inline __sum16 __skb_checksum_validate_complete(struct sk_buff *skb, bool complete, __wsum psum) { if (skb->ip_summed == CHECKSUM_COMPLETE) { if (!csum_fold(csum_add(psum, skb->csum))) { skb->csum_valid = 1; return 0; } } skb->csum = psum; if (complete || skb->len <= CHECKSUM_BREAK) { __sum16 csum; csum = __skb_checksum_complete(skb); skb->csum_valid = !csum; return csum; } return 0; } static inline __wsum null_compute_pseudo(struct sk_buff *skb, int proto) { return 0; } /* Perform checksum validate (init). Note that this is a macro since we only * want to calculate the pseudo header which is an input function if necessary. * First we try to validate without any computation (checksum unnecessary) and * then calculate based on checksum complete calling the function to compute * pseudo header. * * Return values: * 0: checksum is validated or try to in skb_checksum_complete * non-zero: value of invalid checksum */ #define __skb_checksum_validate(skb, proto, complete, \ zero_okay, check, compute_pseudo) \ ({ \ __sum16 __ret = 0; \ skb->csum_valid = 0; \ if (__skb_checksum_validate_needed(skb, zero_okay, check)) \ __ret = __skb_checksum_validate_complete(skb, \ complete, compute_pseudo(skb, proto)); \ __ret; \ }) #define skb_checksum_init(skb, proto, compute_pseudo) \ __skb_checksum_validate(skb, proto, false, false, 0, compute_pseudo) #define skb_checksum_init_zero_check(skb, proto, check, compute_pseudo) \ __skb_checksum_validate(skb, proto, false, true, check, compute_pseudo) #define skb_checksum_validate(skb, proto, compute_pseudo) \ __skb_checksum_validate(skb, proto, true, false, 0, compute_pseudo) #define skb_checksum_validate_zero_check(skb, proto, check, \ compute_pseudo) \ __skb_checksum_validate(skb, proto, true, true, check, compute_pseudo) #define skb_checksum_simple_validate(skb) \ __skb_checksum_validate(skb, 0, true, false, 0, null_compute_pseudo) static inline bool __skb_checksum_convert_check(struct sk_buff *skb) { return (skb->ip_summed == CHECKSUM_NONE && skb->csum_valid); } static inline void __skb_checksum_convert(struct sk_buff *skb, __wsum pseudo) { skb->csum = ~pseudo; skb->ip_summed = CHECKSUM_COMPLETE; } #define skb_checksum_try_convert(skb, proto, compute_pseudo) \ do { \ if (__skb_checksum_convert_check(skb)) \ __skb_checksum_convert(skb, compute_pseudo(skb, proto)); \ } while (0) static inline void skb_remcsum_adjust_partial(struct sk_buff *skb, void *ptr, u16 start, u16 offset) { skb->ip_summed = CHECKSUM_PARTIAL; skb->csum_start = ((unsigned char *)ptr + start) - skb->head; skb->csum_offset = offset - start; } /* Update skbuf and packet to reflect the remote checksum offload operation. * When called, ptr indicates the starting point for skb->csum when * ip_summed is CHECKSUM_COMPLETE. If we need create checksum complete * here, skb_postpull_rcsum is done so skb->csum start is ptr. */ static inline void skb_remcsum_process(struct sk_buff *skb, void *ptr, int start, int offset, bool nopartial) { __wsum delta; if (!nopartial) { skb_remcsum_adjust_partial(skb, ptr, start, offset); return; } if (unlikely(skb->ip_summed != CHECKSUM_COMPLETE)) { __skb_checksum_complete(skb); skb_postpull_rcsum(skb, skb->data, ptr - (void *)skb->data); } delta = remcsum_adjust(ptr, skb->csum, start, offset); /* Adjust skb->csum since we changed the packet */ skb->csum = csum_add(skb->csum, delta); } static inline struct nf_conntrack *skb_nfct(const struct sk_buff *skb) { #if IS_ENABLED(CONFIG_NF_CONNTRACK) return (void *)(skb->_nfct & NFCT_PTRMASK); #else return NULL; #endif } static inline unsigned long skb_get_nfct(const struct sk_buff *skb) { #if IS_ENABLED(CONFIG_NF_CONNTRACK) return skb->_nfct; #else return 0UL; #endif } static inline void skb_set_nfct(struct sk_buff *skb, unsigned long nfct) { #if IS_ENABLED(CONFIG_NF_CONNTRACK) skb->slow_gro |= !!nfct; skb->_nfct = nfct; #endif } #ifdef CONFIG_SKB_EXTENSIONS enum skb_ext_id { #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) SKB_EXT_BRIDGE_NF, #endif #ifdef CONFIG_XFRM SKB_EXT_SEC_PATH, #endif #if IS_ENABLED(CONFIG_NET_TC_SKB_EXT) TC_SKB_EXT, #endif #if IS_ENABLED(CONFIG_MPTCP) SKB_EXT_MPTCP, #endif #if IS_ENABLED(CONFIG_MCTP_FLOWS) SKB_EXT_MCTP, #endif SKB_EXT_NUM, /* must be last */ }; /** * struct skb_ext - sk_buff extensions * @refcnt: 1 on allocation, deallocated on 0 * @offset: offset to add to @data to obtain extension address * @chunks: size currently allocated, stored in SKB_EXT_ALIGN_SHIFT units * @data: start of extension data, variable sized * * Note: offsets/lengths are stored in chunks of 8 bytes, this allows * to use 'u8' types while allowing up to 2kb worth of extension data. */ struct skb_ext { refcount_t refcnt; u8 offset[SKB_EXT_NUM]; /* in chunks of 8 bytes */ u8 chunks; /* same */ char data[] __aligned(8); }; struct skb_ext *__skb_ext_alloc(gfp_t flags); void *__skb_ext_set(struct sk_buff *skb, enum skb_ext_id id, struct skb_ext *ext); void *skb_ext_add(struct sk_buff *skb, enum skb_ext_id id); void __skb_ext_del(struct sk_buff *skb, enum skb_ext_id id); void __skb_ext_put(struct skb_ext *ext); static inline void skb_ext_put(struct sk_buff *skb) { if (skb->active_extensions) __skb_ext_put(skb->extensions); } static inline void __skb_ext_copy(struct sk_buff *dst, const struct sk_buff *src) { dst->active_extensions = src->active_extensions; if (src->active_extensions) { struct skb_ext *ext = src->extensions; refcount_inc(&ext->refcnt); dst->extensions = ext; } } static inline void skb_ext_copy(struct sk_buff *dst, const struct sk_buff *src) { skb_ext_put(dst); __skb_ext_copy(dst, src); } static inline bool __skb_ext_exist(const struct skb_ext *ext, enum skb_ext_id i) { return !!ext->offset[i]; } static inline bool skb_ext_exist(const struct sk_buff *skb, enum skb_ext_id id) { return skb->active_extensions & (1 << id); } static inline void skb_ext_del(struct sk_buff *skb, enum skb_ext_id id) { if (skb_ext_exist(skb, id)) __skb_ext_del(skb, id); } static inline void *skb_ext_find(const struct sk_buff *skb, enum skb_ext_id id) { if (skb_ext_exist(skb, id)) { struct skb_ext *ext = skb->extensions; return (void *)ext + (ext->offset[id] << 3); } return NULL; } static inline void skb_ext_reset(struct sk_buff *skb) { if (unlikely(skb->active_extensions)) { __skb_ext_put(skb->extensions); skb->active_extensions = 0; } } static inline bool skb_has_extensions(struct sk_buff *skb) { return unlikely(skb->active_extensions); } #else static inline void skb_ext_put(struct sk_buff *skb) {} static inline void skb_ext_reset(struct sk_buff *skb) {} static inline void skb_ext_del(struct sk_buff *skb, int unused) {} static inline void __skb_ext_copy(struct sk_buff *d, const struct sk_buff *s) {} static inline void skb_ext_copy(struct sk_buff *dst, const struct sk_buff *s) {} static inline bool skb_has_extensions(struct sk_buff *skb) { return false; } #endif /* CONFIG_SKB_EXTENSIONS */ static inline void nf_reset_ct(struct sk_buff *skb) { #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) nf_conntrack_put(skb_nfct(skb)); skb->_nfct = 0; #endif } static inline void nf_reset_trace(struct sk_buff *skb) { #if IS_ENABLED(CONFIG_NETFILTER_XT_TARGET_TRACE) || IS_ENABLED(CONFIG_NF_TABLES) skb->nf_trace = 0; #endif } static inline void ipvs_reset(struct sk_buff *skb) { #if IS_ENABLED(CONFIG_IP_VS) skb->ipvs_property = 0; #endif } /* Note: This doesn't put any conntrack info in dst. */ static inline void __nf_copy(struct sk_buff *dst, const struct sk_buff *src, bool copy) { #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) dst->_nfct = src->_nfct; nf_conntrack_get(skb_nfct(src)); #endif #if IS_ENABLED(CONFIG_NETFILTER_XT_TARGET_TRACE) || IS_ENABLED(CONFIG_NF_TABLES) if (copy) dst->nf_trace = src->nf_trace; #endif } static inline void nf_copy(struct sk_buff *dst, const struct sk_buff *src) { #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) nf_conntrack_put(skb_nfct(dst)); #endif dst->slow_gro = src->slow_gro; __nf_copy(dst, src, true); } #ifdef CONFIG_NETWORK_SECMARK static inline void skb_copy_secmark(struct sk_buff *to, const struct sk_buff *from) { to->secmark = from->secmark; } static inline void skb_init_secmark(struct sk_buff *skb) { skb->secmark = 0; } #else static inline void skb_copy_secmark(struct sk_buff *to, const struct sk_buff *from) { } static inline void skb_init_secmark(struct sk_buff *skb) { } #endif static inline int secpath_exists(const struct sk_buff *skb) { #ifdef CONFIG_XFRM return skb_ext_exist(skb, SKB_EXT_SEC_PATH); #else return 0; #endif } static inline bool skb_irq_freeable(const struct sk_buff *skb) { return !skb->destructor && !secpath_exists(skb) && !skb_nfct(skb) && !skb->_skb_refdst && !skb_has_frag_list(skb); } static inline void skb_set_queue_mapping(struct sk_buff *skb, u16 queue_mapping) { skb->queue_mapping = queue_mapping; } static inline u16 skb_get_queue_mapping(const struct sk_buff *skb) { return skb->queue_mapping; } static inline void skb_copy_queue_mapping(struct sk_buff *to, const struct sk_buff *from) { to->queue_mapping = from->queue_mapping; } static inline void skb_record_rx_queue(struct sk_buff *skb, u16 rx_queue) { skb->queue_mapping = rx_queue + 1; } static inline u16 skb_get_rx_queue(const struct sk_buff *skb) { return skb->queue_mapping - 1; } static inline bool skb_rx_queue_recorded(const struct sk_buff *skb) { return skb->queue_mapping != 0; } static inline void skb_set_dst_pending_confirm(struct sk_buff *skb, u32 val) { skb->dst_pending_confirm = val; } static inline bool skb_get_dst_pending_confirm(const struct sk_buff *skb) { return skb->dst_pending_confirm != 0; } static inline struct sec_path *skb_sec_path(const struct sk_buff *skb) { #ifdef CONFIG_XFRM return skb_ext_find(skb, SKB_EXT_SEC_PATH); #else return NULL; #endif } static inline bool skb_is_gso(const struct sk_buff *skb) { return skb_shinfo(skb)->gso_size; } /* Note: Should be called only if skb_is_gso(skb) is true */ static inline bool skb_is_gso_v6(const struct sk_buff *skb) { return skb_shinfo(skb)->gso_type & SKB_GSO_TCPV6; } /* Note: Should be called only if skb_is_gso(skb) is true */ static inline bool skb_is_gso_sctp(const struct sk_buff *skb) { return skb_shinfo(skb)->gso_type & SKB_GSO_SCTP; } /* Note: Should be called only if skb_is_gso(skb) is true */ static inline bool skb_is_gso_tcp(const struct sk_buff *skb) { return skb_shinfo(skb)->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6); } static inline void skb_gso_reset(struct sk_buff *skb) { skb_shinfo(skb)->gso_size = 0; skb_shinfo(skb)->gso_segs = 0; skb_shinfo(skb)->gso_type = 0; } static inline void skb_increase_gso_size(struct skb_shared_info *shinfo, u16 increment) { if (WARN_ON_ONCE(shinfo->gso_size == GSO_BY_FRAGS)) return; shinfo->gso_size += increment; } static inline void skb_decrease_gso_size(struct skb_shared_info *shinfo, u16 decrement) { if (WARN_ON_ONCE(shinfo->gso_size == GSO_BY_FRAGS)) return; shinfo->gso_size -= decrement; } void __skb_warn_lro_forwarding(const struct sk_buff *skb); static inline bool skb_warn_if_lro(const struct sk_buff *skb) { /* LRO sets gso_size but not gso_type, whereas if GSO is really * wanted then gso_type will be set. */ const struct skb_shared_info *shinfo = skb_shinfo(skb); if (skb_is_nonlinear(skb) && shinfo->gso_size != 0 && unlikely(shinfo->gso_type == 0)) { __skb_warn_lro_forwarding(skb); return true; } return false; } static inline void skb_forward_csum(struct sk_buff *skb) { /* Unfortunately we don't support this one. Any brave souls? */ if (skb->ip_summed == CHECKSUM_COMPLETE) skb->ip_summed = CHECKSUM_NONE; } /** * skb_checksum_none_assert - make sure skb ip_summed is CHECKSUM_NONE * @skb: skb to check * * fresh skbs have their ip_summed set to CHECKSUM_NONE. * Instead of forcing ip_summed to CHECKSUM_NONE, we can * use this helper, to document places where we make this assertion. */ static inline void skb_checksum_none_assert(const struct sk_buff *skb) { DEBUG_NET_WARN_ON_ONCE(skb->ip_summed != CHECKSUM_NONE); } bool skb_partial_csum_set(struct sk_buff *skb, u16 start, u16 off); int skb_checksum_setup(struct sk_buff *skb, bool recalculate); struct sk_buff *skb_checksum_trimmed(struct sk_buff *skb, unsigned int transport_len, __sum16(*skb_chkf)(struct sk_buff *skb)); /** * skb_head_is_locked - Determine if the skb->head is locked down * @skb: skb to check * * The head on skbs build around a head frag can be removed if they are * not cloned. This function returns true if the skb head is locked down * due to either being allocated via kmalloc, or by being a clone with * multiple references to the head. */ static inline bool skb_head_is_locked(const struct sk_buff *skb) { return !skb->head_frag || skb_cloned(skb); } /* Local Checksum Offload. * Compute outer checksum based on the assumption that the * inner checksum will be offloaded later. * See Documentation/networking/checksum-offloads.rst for * explanation of how this works. * Fill in outer checksum adjustment (e.g. with sum of outer * pseudo-header) before calling. * Also ensure that inner checksum is in linear data area. */ static inline __wsum lco_csum(struct sk_buff *skb) { unsigned char *csum_start = skb_checksum_start(skb); unsigned char *l4_hdr = skb_transport_header(skb); __wsum partial; /* Start with complement of inner checksum adjustment */ partial = ~csum_unfold(*(__force __sum16 *)(csum_start + skb->csum_offset)); /* Add in checksum of our headers (incl. outer checksum * adjustment filled in by caller) and return result. */ return csum_partial(l4_hdr, csum_start - l4_hdr, partial); } static inline bool skb_is_redirected(const struct sk_buff *skb) { return skb->redirected; } static inline void skb_set_redirected(struct sk_buff *skb, bool from_ingress) { skb->redirected = 1; #ifdef CONFIG_NET_REDIRECT skb->from_ingress = from_ingress; if (skb->from_ingress) skb_clear_tstamp(skb); #endif } static inline void skb_reset_redirect(struct sk_buff *skb) { skb->redirected = 0; } static inline void skb_set_redirected_noclear(struct sk_buff *skb, bool from_ingress) { skb->redirected = 1; #ifdef CONFIG_NET_REDIRECT skb->from_ingress = from_ingress; #endif } static inline bool skb_csum_is_sctp(struct sk_buff *skb) { #if IS_ENABLED(CONFIG_IP_SCTP) return skb->csum_not_inet; #else return 0; #endif } static inline void skb_reset_csum_not_inet(struct sk_buff *skb) { skb->ip_summed = CHECKSUM_NONE; #if IS_ENABLED(CONFIG_IP_SCTP) skb->csum_not_inet = 0; #endif } static inline void skb_set_kcov_handle(struct sk_buff *skb, const u64 kcov_handle) { #ifdef CONFIG_KCOV skb->kcov_handle = kcov_handle; #endif } static inline u64 skb_get_kcov_handle(struct sk_buff *skb) { #ifdef CONFIG_KCOV return skb->kcov_handle; #else return 0; #endif } static inline void skb_mark_for_recycle(struct sk_buff *skb) { #ifdef CONFIG_PAGE_POOL skb->pp_recycle = 1; #endif } ssize_t skb_splice_from_iter(struct sk_buff *skb, struct iov_iter *iter, ssize_t maxsize, gfp_t gfp); #endif /* __KERNEL__ */ #endif /* _LINUX_SKBUFF_H */ |
5 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 | /* SPDX-License-Identifier: GPL-2.0-or-later */ #ifndef _TCP_AO_H #define _TCP_AO_H #define TCP_AO_KEY_ALIGN 1 #define __tcp_ao_key_align __aligned(TCP_AO_KEY_ALIGN) union tcp_ao_addr { struct in_addr a4; #if IS_ENABLED(CONFIG_IPV6) struct in6_addr a6; #endif }; struct tcp_ao_hdr { u8 kind; u8 length; u8 keyid; u8 rnext_keyid; }; struct tcp_ao_counters { atomic64_t pkt_good; atomic64_t pkt_bad; atomic64_t key_not_found; atomic64_t ao_required; atomic64_t dropped_icmp; }; struct tcp_ao_key { struct hlist_node node; union tcp_ao_addr addr; u8 key[TCP_AO_MAXKEYLEN] __tcp_ao_key_align; unsigned int tcp_sigpool_id; unsigned int digest_size; int l3index; u8 prefixlen; u8 family; u8 keylen; u8 keyflags; u8 sndid; u8 rcvid; u8 maclen; struct rcu_head rcu; atomic64_t pkt_good; atomic64_t pkt_bad; u8 traffic_keys[]; }; static inline u8 *rcv_other_key(struct tcp_ao_key *key) { return key->traffic_keys; } static inline u8 *snd_other_key(struct tcp_ao_key *key) { return key->traffic_keys + key->digest_size; } static inline int tcp_ao_maclen(const struct tcp_ao_key *key) { return key->maclen; } /* Use tcp_ao_len_aligned() for TCP header calculations */ static inline int tcp_ao_len(const struct tcp_ao_key *key) { return tcp_ao_maclen(key) + sizeof(struct tcp_ao_hdr); } static inline int tcp_ao_len_aligned(const struct tcp_ao_key *key) { return round_up(tcp_ao_len(key), 4); } static inline unsigned int tcp_ao_digest_size(struct tcp_ao_key *key) { return key->digest_size; } static inline int tcp_ao_sizeof_key(const struct tcp_ao_key *key) { return sizeof(struct tcp_ao_key) + (key->digest_size << 1); } struct tcp_ao_info { /* List of tcp_ao_key's */ struct hlist_head head; /* current_key and rnext_key aren't maintained on listen sockets. * Their purpose is to cache keys on established connections, * saving needless lookups. Never dereference any of them from * listen sockets. * ::current_key may change in RX to the key that was requested by * the peer, please use READ_ONCE()/WRITE_ONCE() in order to avoid * load/store tearing. * Do the same for ::rnext_key, if you don't hold socket lock * (it's changed only by userspace request in setsockopt()). */ struct tcp_ao_key *current_key; struct tcp_ao_key *rnext_key; struct tcp_ao_counters counters; u32 ao_required :1, accept_icmps :1, __unused :30; __be32 lisn; __be32 risn; /* Sequence Number Extension (SNE) are upper 4 bytes for SEQ, * that protect TCP-AO connection from replayed old TCP segments. * See RFC5925 (6.2). * In order to get correct SNE, there's a helper tcp_ao_compute_sne(). * It needs SEQ basis to understand whereabouts are lower SEQ numbers. * According to that basis vector, it can provide incremented SNE * when SEQ rolls over or provide decremented SNE when there's * a retransmitted segment from before-rolling over. * - for request sockets such basis is rcv_isn/snt_isn, which seems * good enough as it's unexpected to receive 4 Gbytes on reqsk. * - for full sockets the basis is rcv_nxt/snd_una. snd_una is * taken instead of snd_nxt as currently it's easier to track * in tcp_snd_una_update(), rather than updating SNE in all * WRITE_ONCE(tp->snd_nxt, ...) * - for time-wait sockets the basis is tw_rcv_nxt/tw_snd_nxt. * tw_snd_nxt is not expected to change, while tw_rcv_nxt may. */ u32 snd_sne; u32 rcv_sne; refcount_t refcnt; /* Protects twsk destruction */ struct rcu_head rcu; }; #ifdef CONFIG_TCP_MD5SIG #include <linux/jump_label.h> extern struct static_key_false_deferred tcp_md5_needed; #define static_branch_tcp_md5() static_branch_unlikely(&tcp_md5_needed.key) #else #define static_branch_tcp_md5() false #endif #ifdef CONFIG_TCP_AO /* TCP-AO structures and functions */ #include <linux/jump_label.h> extern struct static_key_false_deferred tcp_ao_needed; #define static_branch_tcp_ao() static_branch_unlikely(&tcp_ao_needed.key) #else #define static_branch_tcp_ao() false #endif static inline bool tcp_hash_should_produce_warnings(void) { return static_branch_tcp_md5() || static_branch_tcp_ao(); } #define tcp_hash_fail(msg, family, skb, fmt, ...) \ do { \ const struct tcphdr *th = tcp_hdr(skb); \ char hdr_flags[6]; \ char *f = hdr_flags; \ \ if (!tcp_hash_should_produce_warnings()) \ break; \ if (th->fin) \ *f++ = 'F'; \ if (th->syn) \ *f++ = 'S'; \ if (th->rst) \ *f++ = 'R'; \ if (th->psh) \ *f++ = 'P'; \ if (th->ack) \ *f++ = '.'; \ *f = 0; \ if ((family) == AF_INET) { \ net_info_ratelimited("%s for %pI4.%d->%pI4.%d [%s] " fmt "\n", \ msg, &ip_hdr(skb)->saddr, ntohs(th->source), \ &ip_hdr(skb)->daddr, ntohs(th->dest), \ hdr_flags, ##__VA_ARGS__); \ } else { \ net_info_ratelimited("%s for [%pI6c].%d->[%pI6c].%d [%s]" fmt "\n", \ msg, &ipv6_hdr(skb)->saddr, ntohs(th->source), \ &ipv6_hdr(skb)->daddr, ntohs(th->dest), \ hdr_flags, ##__VA_ARGS__); \ } \ } while (0) #ifdef CONFIG_TCP_AO /* TCP-AO structures and functions */ struct tcp4_ao_context { __be32 saddr; __be32 daddr; __be16 sport; __be16 dport; __be32 sisn; __be32 disn; }; struct tcp6_ao_context { struct in6_addr saddr; struct in6_addr daddr; __be16 sport; __be16 dport; __be32 sisn; __be32 disn; }; struct tcp_sigpool; #define TCP_AO_ESTABLISHED (TCPF_ESTABLISHED | TCPF_FIN_WAIT1 | TCPF_FIN_WAIT2 | \ TCPF_CLOSE | TCPF_CLOSE_WAIT | \ TCPF_LAST_ACK | TCPF_CLOSING) int tcp_ao_transmit_skb(struct sock *sk, struct sk_buff *skb, struct tcp_ao_key *key, struct tcphdr *th, __u8 *hash_location); int tcp_ao_hash_skb(unsigned short int family, char *ao_hash, struct tcp_ao_key *key, const struct sock *sk, const struct sk_buff *skb, const u8 *tkey, int hash_offset, u32 sne); int tcp_parse_ao(struct sock *sk, int cmd, unsigned short int family, sockptr_t optval, int optlen); struct tcp_ao_key *tcp_ao_established_key(struct tcp_ao_info *ao, int sndid, int rcvid); int tcp_ao_copy_all_matching(const struct sock *sk, struct sock *newsk, struct request_sock *req, struct sk_buff *skb, int family); int tcp_ao_calc_traffic_key(struct tcp_ao_key *mkt, u8 *key, void *ctx, unsigned int len, struct tcp_sigpool *hp); void tcp_ao_destroy_sock(struct sock *sk, bool twsk); void tcp_ao_time_wait(struct tcp_timewait_sock *tcptw, struct tcp_sock *tp); bool tcp_ao_ignore_icmp(const struct sock *sk, int family, int type, int code); int tcp_ao_get_mkts(struct sock *sk, sockptr_t optval, sockptr_t optlen); int tcp_ao_get_sock_info(struct sock *sk, sockptr_t optval, sockptr_t optlen); int tcp_ao_get_repair(struct sock *sk, sockptr_t optval, sockptr_t optlen); int tcp_ao_set_repair(struct sock *sk, sockptr_t optval, unsigned int optlen); enum skb_drop_reason tcp_inbound_ao_hash(struct sock *sk, const struct sk_buff *skb, unsigned short int family, const struct request_sock *req, int l3index, const struct tcp_ao_hdr *aoh); u32 tcp_ao_compute_sne(u32 next_sne, u32 next_seq, u32 seq); struct tcp_ao_key *tcp_ao_do_lookup(const struct sock *sk, int l3index, const union tcp_ao_addr *addr, int family, int sndid, int rcvid); int tcp_ao_hash_hdr(unsigned short family, char *ao_hash, struct tcp_ao_key *key, const u8 *tkey, const union tcp_ao_addr *daddr, const union tcp_ao_addr *saddr, const struct tcphdr *th, u32 sne); int tcp_ao_prepare_reset(const struct sock *sk, struct sk_buff *skb, const struct tcp_ao_hdr *aoh, int l3index, u32 seq, struct tcp_ao_key **key, char **traffic_key, bool *allocated_traffic_key, u8 *keyid, u32 *sne); /* ipv4 specific functions */ int tcp_v4_parse_ao(struct sock *sk, int cmd, sockptr_t optval, int optlen); struct tcp_ao_key *tcp_v4_ao_lookup(const struct sock *sk, struct sock *addr_sk, int sndid, int rcvid); int tcp_v4_ao_synack_hash(char *ao_hash, struct tcp_ao_key *mkt, struct request_sock *req, const struct sk_buff *skb, int hash_offset, u32 sne); int tcp_v4_ao_calc_key_sk(struct tcp_ao_key *mkt, u8 *key, const struct sock *sk, __be32 sisn, __be32 disn, bool send); int tcp_v4_ao_calc_key_rsk(struct tcp_ao_key *mkt, u8 *key, struct request_sock *req); struct tcp_ao_key *tcp_v4_ao_lookup_rsk(const struct sock *sk, struct request_sock *req, int sndid, int rcvid); int tcp_v4_ao_hash_skb(char *ao_hash, struct tcp_ao_key *key, const struct sock *sk, const struct sk_buff *skb, const u8 *tkey, int hash_offset, u32 sne); /* ipv6 specific functions */ int tcp_v6_ao_hash_pseudoheader(struct tcp_sigpool *hp, const struct in6_addr *daddr, const struct in6_addr *saddr, int nbytes); int tcp_v6_ao_calc_key_skb(struct tcp_ao_key *mkt, u8 *key, const struct sk_buff *skb, __be32 sisn, __be32 disn); int tcp_v6_ao_calc_key_sk(struct tcp_ao_key *mkt, u8 *key, const struct sock *sk, __be32 sisn, __be32 disn, bool send); int tcp_v6_ao_calc_key_rsk(struct tcp_ao_key *mkt, u8 *key, struct request_sock *req); struct tcp_ao_key *tcp_v6_ao_lookup(const struct sock *sk, struct sock *addr_sk, int sndid, int rcvid); struct tcp_ao_key *tcp_v6_ao_lookup_rsk(const struct sock *sk, struct request_sock *req, int sndid, int rcvid); int tcp_v6_ao_hash_skb(char *ao_hash, struct tcp_ao_key *key, const struct sock *sk, const struct sk_buff *skb, const u8 *tkey, int hash_offset, u32 sne); int tcp_v6_parse_ao(struct sock *sk, int cmd, sockptr_t optval, int optlen); int tcp_v6_ao_synack_hash(char *ao_hash, struct tcp_ao_key *ao_key, struct request_sock *req, const struct sk_buff *skb, int hash_offset, u32 sne); void tcp_ao_established(struct sock *sk); void tcp_ao_finish_connect(struct sock *sk, struct sk_buff *skb); void tcp_ao_connect_init(struct sock *sk); void tcp_ao_syncookie(struct sock *sk, const struct sk_buff *skb, struct request_sock *req, unsigned short int family); #else /* CONFIG_TCP_AO */ static inline int tcp_ao_transmit_skb(struct sock *sk, struct sk_buff *skb, struct tcp_ao_key *key, struct tcphdr *th, __u8 *hash_location) { return 0; } static inline void tcp_ao_syncookie(struct sock *sk, const struct sk_buff *skb, struct request_sock *req, unsigned short int family) { } static inline bool tcp_ao_ignore_icmp(const struct sock *sk, int family, int type, int code) { return false; } static inline enum skb_drop_reason tcp_inbound_ao_hash(struct sock *sk, const struct sk_buff *skb, unsigned short int family, const struct request_sock *req, int l3index, const struct tcp_ao_hdr *aoh) { return SKB_NOT_DROPPED_YET; } static inline struct tcp_ao_key *tcp_ao_do_lookup(const struct sock *sk, int l3index, const union tcp_ao_addr *addr, int family, int sndid, int rcvid) { return NULL; } static inline void tcp_ao_destroy_sock(struct sock *sk, bool twsk) { } static inline void tcp_ao_established(struct sock *sk) { } static inline void tcp_ao_finish_connect(struct sock *sk, struct sk_buff *skb) { } static inline void tcp_ao_time_wait(struct tcp_timewait_sock *tcptw, struct tcp_sock *tp) { } static inline void tcp_ao_connect_init(struct sock *sk) { } static inline int tcp_ao_get_mkts(struct sock *sk, sockptr_t optval, sockptr_t optlen) { return -ENOPROTOOPT; } static inline int tcp_ao_get_sock_info(struct sock *sk, sockptr_t optval, sockptr_t optlen) { return -ENOPROTOOPT; } static inline int tcp_ao_get_repair(struct sock *sk, sockptr_t optval, sockptr_t optlen) { return -ENOPROTOOPT; } static inline int tcp_ao_set_repair(struct sock *sk, sockptr_t optval, unsigned int optlen) { return -ENOPROTOOPT; } #endif #if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO) int tcp_do_parse_auth_options(const struct tcphdr *th, const u8 **md5_hash, const u8 **ao_hash); #else static inline int tcp_do_parse_auth_options(const struct tcphdr *th, const u8 **md5_hash, const u8 **ao_hash) { *md5_hash = NULL; *ao_hash = NULL; return 0; } #endif #endif /* _TCP_AO_H */ |
8 7 1 7 6 5 3 2 1 2 2 2 4 3 2 10 10 8 8 8 8 8 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 | // SPDX-License-Identifier: GPL-2.0-only #include "netlink.h" #include "common.h" struct pause_req_info { struct ethnl_req_info base; enum ethtool_mac_stats_src src; }; #define PAUSE_REQINFO(__req_base) \ container_of(__req_base, struct pause_req_info, base) struct pause_reply_data { struct ethnl_reply_data base; struct ethtool_pauseparam pauseparam; struct ethtool_pause_stats pausestat; }; #define PAUSE_REPDATA(__reply_base) \ container_of(__reply_base, struct pause_reply_data, base) const struct nla_policy ethnl_pause_get_policy[] = { [ETHTOOL_A_PAUSE_HEADER] = NLA_POLICY_NESTED(ethnl_header_policy_stats), [ETHTOOL_A_PAUSE_STATS_SRC] = NLA_POLICY_MAX(NLA_U32, ETHTOOL_MAC_STATS_SRC_PMAC), }; static int pause_parse_request(struct ethnl_req_info *req_base, struct nlattr **tb, struct netlink_ext_ack *extack) { enum ethtool_mac_stats_src src = ETHTOOL_MAC_STATS_SRC_AGGREGATE; struct pause_req_info *req_info = PAUSE_REQINFO(req_base); if (tb[ETHTOOL_A_PAUSE_STATS_SRC]) { if (!(req_base->flags & ETHTOOL_FLAG_STATS)) { NL_SET_ERR_MSG_MOD(extack, "ETHTOOL_FLAG_STATS must be set when requesting a source of stats"); return -EINVAL; } src = nla_get_u32(tb[ETHTOOL_A_PAUSE_STATS_SRC]); } req_info->src = src; return 0; } static int pause_prepare_data(const struct ethnl_req_info *req_base, struct ethnl_reply_data *reply_base, const struct genl_info *info) { const struct pause_req_info *req_info = PAUSE_REQINFO(req_base); struct pause_reply_data *data = PAUSE_REPDATA(reply_base); enum ethtool_mac_stats_src src = req_info->src; struct net_device *dev = reply_base->dev; int ret; if (!dev->ethtool_ops->get_pauseparam) return -EOPNOTSUPP; ethtool_stats_init((u64 *)&data->pausestat, sizeof(data->pausestat) / 8); data->pausestat.src = src; ret = ethnl_ops_begin(dev); if (ret < 0) return ret; if ((src == ETHTOOL_MAC_STATS_SRC_EMAC || src == ETHTOOL_MAC_STATS_SRC_PMAC) && !__ethtool_dev_mm_supported(dev)) { NL_SET_ERR_MSG_MOD(info->extack, "Device does not support MAC merge layer"); ethnl_ops_complete(dev); return -EOPNOTSUPP; } dev->ethtool_ops->get_pauseparam(dev, &data->pauseparam); if (req_base->flags & ETHTOOL_FLAG_STATS && dev->ethtool_ops->get_pause_stats) dev->ethtool_ops->get_pause_stats(dev, &data->pausestat); ethnl_ops_complete(dev); return 0; } static int pause_reply_size(const struct ethnl_req_info *req_base, const struct ethnl_reply_data *reply_base) { int n = nla_total_size(sizeof(u8)) + /* _PAUSE_AUTONEG */ nla_total_size(sizeof(u8)) + /* _PAUSE_RX */ nla_total_size(sizeof(u8)); /* _PAUSE_TX */ if (req_base->flags & ETHTOOL_FLAG_STATS) n += nla_total_size(0) + /* _PAUSE_STATS */ nla_total_size(sizeof(u32)) + /* _PAUSE_STATS_SRC */ nla_total_size_64bit(sizeof(u64)) * ETHTOOL_PAUSE_STAT_CNT; return n; } static int ethtool_put_stat(struct sk_buff *skb, u64 val, u16 attrtype, u16 padtype) { if (val == ETHTOOL_STAT_NOT_SET) return 0; if (nla_put_u64_64bit(skb, attrtype, val, padtype)) return -EMSGSIZE; return 0; } static int pause_put_stats(struct sk_buff *skb, const struct ethtool_pause_stats *pause_stats) { const u16 pad = ETHTOOL_A_PAUSE_STAT_PAD; struct nlattr *nest; if (nla_put_u32(skb, ETHTOOL_A_PAUSE_STATS_SRC, pause_stats->src)) return -EMSGSIZE; nest = nla_nest_start(skb, ETHTOOL_A_PAUSE_STATS); if (!nest) return -EMSGSIZE; if (ethtool_put_stat(skb, pause_stats->tx_pause_frames, ETHTOOL_A_PAUSE_STAT_TX_FRAMES, pad) || ethtool_put_stat(skb, pause_stats->rx_pause_frames, ETHTOOL_A_PAUSE_STAT_RX_FRAMES, pad)) goto err_cancel; nla_nest_end(skb, nest); return 0; err_cancel: nla_nest_cancel(skb, nest); return -EMSGSIZE; } static int pause_fill_reply(struct sk_buff *skb, const struct ethnl_req_info *req_base, const struct ethnl_reply_data *reply_base) { const struct pause_reply_data *data = PAUSE_REPDATA(reply_base); const struct ethtool_pauseparam *pauseparam = &data->pauseparam; if (nla_put_u8(skb, ETHTOOL_A_PAUSE_AUTONEG, !!pauseparam->autoneg) || nla_put_u8(skb, ETHTOOL_A_PAUSE_RX, !!pauseparam->rx_pause) || nla_put_u8(skb, ETHTOOL_A_PAUSE_TX, !!pauseparam->tx_pause)) return -EMSGSIZE; if (req_base->flags & ETHTOOL_FLAG_STATS && pause_put_stats(skb, &data->pausestat)) return -EMSGSIZE; return 0; } /* PAUSE_SET */ const struct nla_policy ethnl_pause_set_policy[] = { [ETHTOOL_A_PAUSE_HEADER] = NLA_POLICY_NESTED(ethnl_header_policy), [ETHTOOL_A_PAUSE_AUTONEG] = { .type = NLA_U8 }, [ETHTOOL_A_PAUSE_RX] = { .type = NLA_U8 }, [ETHTOOL_A_PAUSE_TX] = { .type = NLA_U8 }, }; static int ethnl_set_pause_validate(struct ethnl_req_info *req_info, struct genl_info *info) { const struct ethtool_ops *ops = req_info->dev->ethtool_ops; return ops->get_pauseparam && ops->set_pauseparam ? 1 : -EOPNOTSUPP; } static int ethnl_set_pause(struct ethnl_req_info *req_info, struct genl_info *info) { struct net_device *dev = req_info->dev; struct ethtool_pauseparam params = {}; struct nlattr **tb = info->attrs; bool mod = false; int ret; dev->ethtool_ops->get_pauseparam(dev, ¶ms); ethnl_update_bool32(¶ms.autoneg, tb[ETHTOOL_A_PAUSE_AUTONEG], &mod); ethnl_update_bool32(¶ms.rx_pause, tb[ETHTOOL_A_PAUSE_RX], &mod); ethnl_update_bool32(¶ms.tx_pause, tb[ETHTOOL_A_PAUSE_TX], &mod); if (!mod) return 0; ret = dev->ethtool_ops->set_pauseparam(dev, ¶ms); return ret < 0 ? ret : 1; } const struct ethnl_request_ops ethnl_pause_request_ops = { .request_cmd = ETHTOOL_MSG_PAUSE_GET, .reply_cmd = ETHTOOL_MSG_PAUSE_GET_REPLY, .hdr_attr = ETHTOOL_A_PAUSE_HEADER, .req_info_size = sizeof(struct pause_req_info), .reply_data_size = sizeof(struct pause_reply_data), .parse_request = pause_parse_request, .prepare_data = pause_prepare_data, .reply_size = pause_reply_size, .fill_reply = pause_fill_reply, .set_validate = ethnl_set_pause_validate, .set = ethnl_set_pause, .set_ntf_cmd = ETHTOOL_MSG_PAUSE_NTF, }; |
1 1 1 1 1 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 | // SPDX-License-Identifier: GPL-2.0-only /* * * Copyright (C) 2005 Mike Isely <isely@pobox.com> */ #include "pvrusb2-std.h" #include "pvrusb2-debug.h" #include <asm/string.h> #include <linux/slab.h> struct std_name { const char *name; v4l2_std_id id; }; #define CSTD_PAL \ (V4L2_STD_PAL_B| \ V4L2_STD_PAL_B1| \ V4L2_STD_PAL_G| \ V4L2_STD_PAL_H| \ V4L2_STD_PAL_I| \ V4L2_STD_PAL_D| \ V4L2_STD_PAL_D1| \ V4L2_STD_PAL_K| \ V4L2_STD_PAL_M| \ V4L2_STD_PAL_N| \ V4L2_STD_PAL_Nc| \ V4L2_STD_PAL_60) #define CSTD_NTSC \ (V4L2_STD_NTSC_M| \ V4L2_STD_NTSC_M_JP| \ V4L2_STD_NTSC_M_KR| \ V4L2_STD_NTSC_443) #define CSTD_ATSC \ (V4L2_STD_ATSC_8_VSB| \ V4L2_STD_ATSC_16_VSB) #define CSTD_SECAM \ (V4L2_STD_SECAM_B| \ V4L2_STD_SECAM_D| \ V4L2_STD_SECAM_G| \ V4L2_STD_SECAM_H| \ V4L2_STD_SECAM_K| \ V4L2_STD_SECAM_K1| \ V4L2_STD_SECAM_L| \ V4L2_STD_SECAM_LC) #define TSTD_B (V4L2_STD_PAL_B|V4L2_STD_SECAM_B) #define TSTD_B1 (V4L2_STD_PAL_B1) #define TSTD_D (V4L2_STD_PAL_D|V4L2_STD_SECAM_D) #define TSTD_D1 (V4L2_STD_PAL_D1) #define TSTD_G (V4L2_STD_PAL_G|V4L2_STD_SECAM_G) #define TSTD_H (V4L2_STD_PAL_H|V4L2_STD_SECAM_H) #define TSTD_I (V4L2_STD_PAL_I) #define TSTD_K (V4L2_STD_PAL_K|V4L2_STD_SECAM_K) #define TSTD_K1 (V4L2_STD_SECAM_K1) #define TSTD_L (V4L2_STD_SECAM_L) #define TSTD_M (V4L2_STD_PAL_M|V4L2_STD_NTSC_M) #define TSTD_N (V4L2_STD_PAL_N) #define TSTD_Nc (V4L2_STD_PAL_Nc) #define TSTD_60 (V4L2_STD_PAL_60) #define CSTD_ALL (CSTD_PAL|CSTD_NTSC|CSTD_ATSC|CSTD_SECAM) /* Mapping of standard bits to color system */ static const struct std_name std_groups[] = { {"PAL",CSTD_PAL}, {"NTSC",CSTD_NTSC}, {"SECAM",CSTD_SECAM}, {"ATSC",CSTD_ATSC}, }; /* Mapping of standard bits to modulation system */ static const struct std_name std_items[] = { {"B",TSTD_B}, {"B1",TSTD_B1}, {"D",TSTD_D}, {"D1",TSTD_D1}, {"G",TSTD_G}, {"H",TSTD_H}, {"I",TSTD_I}, {"K",TSTD_K}, {"K1",TSTD_K1}, {"L",TSTD_L}, {"LC",V4L2_STD_SECAM_LC}, {"M",TSTD_M}, {"Mj",V4L2_STD_NTSC_M_JP}, {"443",V4L2_STD_NTSC_443}, {"Mk",V4L2_STD_NTSC_M_KR}, {"N",TSTD_N}, {"Nc",TSTD_Nc}, {"60",TSTD_60}, {"8VSB",V4L2_STD_ATSC_8_VSB}, {"16VSB",V4L2_STD_ATSC_16_VSB}, }; // Search an array of std_name structures and return a pointer to the // element with the matching name. static const struct std_name *find_std_name(const struct std_name *arrPtr, unsigned int arrSize, const char *bufPtr, unsigned int bufSize) { unsigned int idx; const struct std_name *p; for (idx = 0; idx < arrSize; idx++) { p = arrPtr + idx; if (strlen(p->name) != bufSize) continue; if (!memcmp(bufPtr,p->name,bufSize)) return p; } return NULL; } int pvr2_std_str_to_id(v4l2_std_id *idPtr,const char *bufPtr, unsigned int bufSize) { v4l2_std_id id = 0; v4l2_std_id cmsk = 0; v4l2_std_id t; int mMode = 0; unsigned int cnt; char ch; const struct std_name *sp; while (bufSize) { if (!mMode) { cnt = 0; while ((cnt < bufSize) && (bufPtr[cnt] != '-')) cnt++; if (cnt >= bufSize) return 0; // No more characters sp = find_std_name(std_groups, ARRAY_SIZE(std_groups), bufPtr,cnt); if (!sp) return 0; // Illegal color system name cnt++; bufPtr += cnt; bufSize -= cnt; mMode = !0; cmsk = sp->id; continue; } cnt = 0; while (cnt < bufSize) { ch = bufPtr[cnt]; if (ch == ';') { mMode = 0; break; } if (ch == '/') break; cnt++; } sp = find_std_name(std_items, ARRAY_SIZE(std_items), bufPtr,cnt); if (!sp) return 0; // Illegal modulation system ID t = sp->id & cmsk; if (!t) return 0; // Specific color + modulation system illegal id |= t; if (cnt < bufSize) cnt++; bufPtr += cnt; bufSize -= cnt; } if (idPtr) *idPtr = id; return !0; } unsigned int pvr2_std_id_to_str(char *bufPtr, unsigned int bufSize, v4l2_std_id id) { unsigned int idx1,idx2; const struct std_name *ip,*gp; int gfl,cfl; unsigned int c1,c2; cfl = 0; c1 = 0; for (idx1 = 0; idx1 < ARRAY_SIZE(std_groups); idx1++) { gp = std_groups + idx1; gfl = 0; for (idx2 = 0; idx2 < ARRAY_SIZE(std_items); idx2++) { ip = std_items + idx2; if (!(gp->id & ip->id & id)) continue; if (!gfl) { if (cfl) { c2 = scnprintf(bufPtr,bufSize,";"); c1 += c2; bufSize -= c2; bufPtr += c2; } cfl = !0; c2 = scnprintf(bufPtr,bufSize, "%s-",gp->name); gfl = !0; } else { c2 = scnprintf(bufPtr,bufSize,"/"); } c1 += c2; bufSize -= c2; bufPtr += c2; c2 = scnprintf(bufPtr,bufSize, ip->name); c1 += c2; bufSize -= c2; bufPtr += c2; } } return c1; } // Template data for possible enumerated video standards. Here we group // standards which share common frame rates and resolution. static struct v4l2_standard generic_standards[] = { { .id = (TSTD_B|TSTD_B1| TSTD_D|TSTD_D1| TSTD_G| TSTD_H| TSTD_I| TSTD_K|TSTD_K1| TSTD_L| V4L2_STD_SECAM_LC | TSTD_N|TSTD_Nc), .frameperiod = { .numerator = 1, .denominator= 25 }, .framelines = 625, .reserved = {0,0,0,0} }, { .id = (TSTD_M| V4L2_STD_NTSC_M_JP| V4L2_STD_NTSC_M_KR), .frameperiod = { .numerator = 1001, .denominator= 30000 }, .framelines = 525, .reserved = {0,0,0,0} }, { // This is a total wild guess .id = (TSTD_60), .frameperiod = { .numerator = 1001, .denominator= 30000 }, .framelines = 525, .reserved = {0,0,0,0} }, { // This is total wild guess .id = V4L2_STD_NTSC_443, .frameperiod = { .numerator = 1001, .denominator= 30000 }, .framelines = 525, .reserved = {0,0,0,0} } }; static struct v4l2_standard *match_std(v4l2_std_id id) { unsigned int idx; for (idx = 0; idx < ARRAY_SIZE(generic_standards); idx++) { if (generic_standards[idx].id & id) { return generic_standards + idx; } } return NULL; } static int pvr2_std_fill(struct v4l2_standard *std,v4l2_std_id id) { struct v4l2_standard *template; int idx; unsigned int bcnt; template = match_std(id); if (!template) return 0; idx = std->index; memcpy(std,template,sizeof(*template)); std->index = idx; std->id = id; bcnt = pvr2_std_id_to_str(std->name,sizeof(std->name)-1,id); std->name[bcnt] = 0; pvr2_trace(PVR2_TRACE_STD,"Set up standard idx=%u name=%s", std->index,std->name); return !0; } /* These are special cases of combined standards that we should enumerate separately if the component pieces are present. */ static v4l2_std_id std_mixes[] = { V4L2_STD_PAL_B | V4L2_STD_PAL_G, V4L2_STD_PAL_D | V4L2_STD_PAL_K, V4L2_STD_SECAM_B | V4L2_STD_SECAM_G, V4L2_STD_SECAM_D | V4L2_STD_SECAM_K, }; struct v4l2_standard *pvr2_std_create_enum(unsigned int *countptr, v4l2_std_id id) { unsigned int std_cnt = 0; unsigned int idx,bcnt,idx2; v4l2_std_id idmsk,cmsk,fmsk; struct v4l2_standard *stddefs; if (pvrusb2_debug & PVR2_TRACE_STD) { char buf[100]; bcnt = pvr2_std_id_to_str(buf,sizeof(buf),id); pvr2_trace( PVR2_TRACE_STD,"Mapping standards mask=0x%x (%.*s)", (int)id,bcnt,buf); } *countptr = 0; std_cnt = 0; fmsk = 0; for (idmsk = 1, cmsk = id; cmsk; idmsk <<= 1) { if (!(idmsk & cmsk)) continue; cmsk &= ~idmsk; if (match_std(idmsk)) { std_cnt++; continue; } fmsk |= idmsk; } for (idx2 = 0; idx2 < ARRAY_SIZE(std_mixes); idx2++) { if ((id & std_mixes[idx2]) == std_mixes[idx2]) std_cnt++; } /* Don't complain about ATSC standard values */ fmsk &= ~CSTD_ATSC; if (fmsk) { char buf[100]; bcnt = pvr2_std_id_to_str(buf,sizeof(buf),fmsk); pvr2_trace( PVR2_TRACE_ERROR_LEGS, "***WARNING*** Failed to classify the following standard(s): %.*s", bcnt,buf); } pvr2_trace(PVR2_TRACE_STD,"Setting up %u unique standard(s)", std_cnt); if (!std_cnt) return NULL; // paranoia stddefs = kcalloc(std_cnt, sizeof(struct v4l2_standard), GFP_KERNEL); if (!stddefs) return NULL; for (idx = 0; idx < std_cnt; idx++) stddefs[idx].index = idx; idx = 0; /* Enumerate potential special cases */ for (idx2 = 0; (idx2 < ARRAY_SIZE(std_mixes)) && (idx < std_cnt); idx2++) { if (!(id & std_mixes[idx2])) continue; if (pvr2_std_fill(stddefs+idx,std_mixes[idx2])) idx++; } /* Now enumerate individual pieces */ for (idmsk = 1, cmsk = id; cmsk && (idx < std_cnt); idmsk <<= 1) { if (!(idmsk & cmsk)) continue; cmsk &= ~idmsk; if (!pvr2_std_fill(stddefs+idx,idmsk)) continue; idx++; } *countptr = std_cnt; return stddefs; } v4l2_std_id pvr2_std_get_usable(void) { return CSTD_ALL; } |
47 47 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 | /* SPDX-License-Identifier: GPL-2.0 */ /* Rewritten and vastly simplified by Rusty Russell for in-kernel * module loader: * Copyright 2002 Rusty Russell <rusty@rustcorp.com.au> IBM Corporation */ #ifndef _LINUX_KALLSYMS_H #define _LINUX_KALLSYMS_H #include <linux/errno.h> #include <linux/buildid.h> #include <linux/kernel.h> #include <linux/stddef.h> #include <linux/mm.h> #include <linux/module.h> #include <asm/sections.h> #define KSYM_NAME_LEN 512 #define KSYM_SYMBOL_LEN (sizeof("%s+%#lx/%#lx [%s %s]") + \ (KSYM_NAME_LEN - 1) + \ 2*(BITS_PER_LONG*3/10) + (MODULE_NAME_LEN - 1) + \ (BUILD_ID_SIZE_MAX * 2) + 1) struct cred; struct module; static inline int is_kernel_text(unsigned long addr) { if (__is_kernel_text(addr)) return 1; return in_gate_area_no_mm(addr); } static inline int is_kernel(unsigned long addr) { if (__is_kernel(addr)) return 1; return in_gate_area_no_mm(addr); } static inline int is_ksym_addr(unsigned long addr) { if (IS_ENABLED(CONFIG_KALLSYMS_ALL)) return is_kernel(addr); return is_kernel_text(addr) || is_kernel_inittext(addr); } static inline void *dereference_symbol_descriptor(void *ptr) { #ifdef CONFIG_HAVE_FUNCTION_DESCRIPTORS struct module *mod; ptr = dereference_kernel_function_descriptor(ptr); if (is_ksym_addr((unsigned long)ptr)) return ptr; preempt_disable(); mod = __module_address((unsigned long)ptr); preempt_enable(); if (mod) ptr = dereference_module_function_descriptor(mod, ptr); #endif return ptr; } /* How and when do we show kallsyms values? */ extern bool kallsyms_show_value(const struct cred *cred); #ifdef CONFIG_KALLSYMS unsigned long kallsyms_sym_address(int idx); int kallsyms_on_each_symbol(int (*fn)(void *, const char *, unsigned long), void *data); int kallsyms_on_each_match_symbol(int (*fn)(void *, unsigned long), const char *name, void *data); /* Lookup the address for a symbol. Returns 0 if not found. */ unsigned long kallsyms_lookup_name(const char *name); extern int kallsyms_lookup_size_offset(unsigned long addr, unsigned long *symbolsize, unsigned long *offset); /* Lookup an address. modname is set to NULL if it's in the kernel. */ const char *kallsyms_lookup(unsigned long addr, unsigned long *symbolsize, unsigned long *offset, char **modname, char *namebuf); /* Look up a kernel symbol and return it in a text buffer. */ extern int sprint_symbol(char *buffer, unsigned long address); extern int sprint_symbol_build_id(char *buffer, unsigned long address); extern int sprint_symbol_no_offset(char *buffer, unsigned long address); extern int sprint_backtrace(char *buffer, unsigned long address); extern int sprint_backtrace_build_id(char *buffer, unsigned long address); int lookup_symbol_name(unsigned long addr, char *symname); #else /* !CONFIG_KALLSYMS */ static inline unsigned long kallsyms_lookup_name(const char *name) { return 0; } static inline int kallsyms_lookup_size_offset(unsigned long addr, unsigned long *symbolsize, unsigned long *offset) { return 0; } static inline const char *kallsyms_lookup(unsigned long addr, unsigned long *symbolsize, unsigned long *offset, char **modname, char *namebuf) { return NULL; } static inline int sprint_symbol(char *buffer, unsigned long addr) { *buffer = '\0'; return 0; } static inline int sprint_symbol_build_id(char *buffer, unsigned long address) { *buffer = '\0'; return 0; } static inline int sprint_symbol_no_offset(char *buffer, unsigned long addr) { *buffer = '\0'; return 0; } static inline int sprint_backtrace(char *buffer, unsigned long addr) { *buffer = '\0'; return 0; } static inline int sprint_backtrace_build_id(char *buffer, unsigned long addr) { *buffer = '\0'; return 0; } static inline int lookup_symbol_name(unsigned long addr, char *symname) { return -ERANGE; } static inline int kallsyms_on_each_symbol(int (*fn)(void *, const char *, unsigned long), void *data) { return -EOPNOTSUPP; } static inline int kallsyms_on_each_match_symbol(int (*fn)(void *, unsigned long), const char *name, void *data) { return -EOPNOTSUPP; } #endif /*CONFIG_KALLSYMS*/ static inline void print_ip_sym(const char *loglvl, unsigned long ip) { printk("%s[<%px>] %pS\n", loglvl, (void *) ip, (void *) ip); } #endif /*_LINUX_KALLSYMS_H*/ |
268 268 268 3435 3431 55 3391 3505 3265 263 263 263 3031 494 496 493 496 494 323 319 147 148 148 148 148 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 | // SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 1995 Linus Torvalds * Copyright (C) 2001, 2002 Andi Kleen, SuSE Labs. * Copyright (C) 2008-2009, Red Hat Inc., Ingo Molnar */ #include <linux/sched.h> /* test_thread_flag(), ... */ #include <linux/sched/task_stack.h> /* task_stack_*(), ... */ #include <linux/kdebug.h> /* oops_begin/end, ... */ #include <linux/extable.h> /* search_exception_tables */ #include <linux/memblock.h> /* max_low_pfn */ #include <linux/kfence.h> /* kfence_handle_page_fault */ #include <linux/kprobes.h> /* NOKPROBE_SYMBOL, ... */ #include <linux/mmiotrace.h> /* kmmio_handler, ... */ #include <linux/perf_event.h> /* perf_sw_event */ #include <linux/hugetlb.h> /* hstate_index_to_shift */ #include <linux/prefetch.h> /* prefetchw */ #include <linux/context_tracking.h> /* exception_enter(), ... */ #include <linux/uaccess.h> /* faulthandler_disabled() */ #include <linux/efi.h> /* efi_crash_gracefully_on_page_fault()*/ #include <linux/mm_types.h> #include <linux/mm.h> /* find_and_lock_vma() */ #include <asm/cpufeature.h> /* boot_cpu_has, ... */ #include <asm/traps.h> /* dotraplinkage, ... */ #include <asm/fixmap.h> /* VSYSCALL_ADDR */ #include <asm/vsyscall.h> /* emulate_vsyscall */ #include <asm/vm86.h> /* struct vm86 */ #include <asm/mmu_context.h> /* vma_pkey() */ #include <asm/efi.h> /* efi_crash_gracefully_on_page_fault()*/ #include <asm/desc.h> /* store_idt(), ... */ #include <asm/cpu_entry_area.h> /* exception stack */ #include <asm/pgtable_areas.h> /* VMALLOC_START, ... */ #include <asm/kvm_para.h> /* kvm_handle_async_pf */ #include <asm/vdso.h> /* fixup_vdso_exception() */ #include <asm/irq_stack.h> #define CREATE_TRACE_POINTS #include <asm/trace/exceptions.h> /* * Returns 0 if mmiotrace is disabled, or if the fault is not * handled by mmiotrace: */ static nokprobe_inline int kmmio_fault(struct pt_regs *regs, unsigned long addr) { if (unlikely(is_kmmio_active())) if (kmmio_handler(regs, addr) == 1) return -1; return 0; } /* * Prefetch quirks: * * 32-bit mode: * * Sometimes AMD Athlon/Opteron CPUs report invalid exceptions on prefetch. * Check that here and ignore it. This is AMD erratum #91. * * 64-bit mode: * * Sometimes the CPU reports invalid exceptions on prefetch. * Check that here and ignore it. * * Opcode checker based on code by Richard Brunner. */ static inline int check_prefetch_opcode(struct pt_regs *regs, unsigned char *instr, unsigned char opcode, int *prefetch) { unsigned char instr_hi = opcode & 0xf0; unsigned char instr_lo = opcode & 0x0f; switch (instr_hi) { case 0x20: case 0x30: /* * Values 0x26,0x2E,0x36,0x3E are valid x86 prefixes. * In X86_64 long mode, the CPU will signal invalid * opcode if some of these prefixes are present so * X86_64 will never get here anyway */ return ((instr_lo & 7) == 0x6); #ifdef CONFIG_X86_64 case 0x40: /* * In 64-bit mode 0x40..0x4F are valid REX prefixes */ return (!user_mode(regs) || user_64bit_mode(regs)); #endif case 0x60: /* 0x64 thru 0x67 are valid prefixes in all modes. */ return (instr_lo & 0xC) == 0x4; case 0xF0: /* 0xF0, 0xF2, 0xF3 are valid prefixes in all modes. */ return !instr_lo || (instr_lo>>1) == 1; case 0x00: /* Prefetch instruction is 0x0F0D or 0x0F18 */ if (get_kernel_nofault(opcode, instr)) return 0; *prefetch = (instr_lo == 0xF) && (opcode == 0x0D || opcode == 0x18); return 0; default: return 0; } } static bool is_amd_k8_pre_npt(void) { struct cpuinfo_x86 *c = &boot_cpu_data; return unlikely(IS_ENABLED(CONFIG_CPU_SUP_AMD) && c->x86_vendor == X86_VENDOR_AMD && c->x86 == 0xf && c->x86_model < 0x40); } static int is_prefetch(struct pt_regs *regs, unsigned long error_code, unsigned long addr) { unsigned char *max_instr; unsigned char *instr; int prefetch = 0; /* Erratum #91 affects AMD K8, pre-NPT CPUs */ if (!is_amd_k8_pre_npt()) return 0; /* * If it was a exec (instruction fetch) fault on NX page, then * do not ignore the fault: */ if (error_code & X86_PF_INSTR) return 0; instr = (void *)convert_ip_to_linear(current, regs); max_instr = instr + 15; /* * This code has historically always bailed out if IP points to a * not-present page (e.g. due to a race). No one has ever * complained about this. */ pagefault_disable(); while (instr < max_instr) { unsigned char opcode; if (user_mode(regs)) { if (get_user(opcode, (unsigned char __user *) instr)) break; } else { if (get_kernel_nofault(opcode, instr)) break; } instr++; if (!check_prefetch_opcode(regs, instr, opcode, &prefetch)) break; } pagefault_enable(); return prefetch; } DEFINE_SPINLOCK(pgd_lock); LIST_HEAD(pgd_list); #ifdef CONFIG_X86_32 static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address) { unsigned index = pgd_index(address); pgd_t *pgd_k; p4d_t *p4d, *p4d_k; pud_t *pud, *pud_k; pmd_t *pmd, *pmd_k; pgd += index; pgd_k = init_mm.pgd + index; if (!pgd_present(*pgd_k)) return NULL; /* * set_pgd(pgd, *pgd_k); here would be useless on PAE * and redundant with the set_pmd() on non-PAE. As would * set_p4d/set_pud. */ p4d = p4d_offset(pgd, address); p4d_k = p4d_offset(pgd_k, address); if (!p4d_present(*p4d_k)) return NULL; pud = pud_offset(p4d, address); pud_k = pud_offset(p4d_k, address); if (!pud_present(*pud_k)) return NULL; pmd = pmd_offset(pud, address); pmd_k = pmd_offset(pud_k, address); if (pmd_present(*pmd) != pmd_present(*pmd_k)) set_pmd(pmd, *pmd_k); if (!pmd_present(*pmd_k)) return NULL; else BUG_ON(pmd_pfn(*pmd) != pmd_pfn(*pmd_k)); return pmd_k; } /* * Handle a fault on the vmalloc or module mapping area * * This is needed because there is a race condition between the time * when the vmalloc mapping code updates the PMD to the point in time * where it synchronizes this update with the other page-tables in the * system. * * In this race window another thread/CPU can map an area on the same * PMD, finds it already present and does not synchronize it with the * rest of the system yet. As a result v[mz]alloc might return areas * which are not mapped in every page-table in the system, causing an * unhandled page-fault when they are accessed. */ static noinline int vmalloc_fault(unsigned long address) { unsigned long pgd_paddr; pmd_t *pmd_k; pte_t *pte_k; /* Make sure we are in vmalloc area: */ if (!(address >= VMALLOC_START && address < VMALLOC_END)) return -1; /* * Synchronize this task's top level page-table * with the 'reference' page table. * * Do _not_ use "current" here. We might be inside * an interrupt in the middle of a task switch.. */ pgd_paddr = read_cr3_pa(); pmd_k = vmalloc_sync_one(__va(pgd_paddr), address); if (!pmd_k) return -1; if (pmd_large(*pmd_k)) return 0; pte_k = pte_offset_kernel(pmd_k, address); if (!pte_present(*pte_k)) return -1; return 0; } NOKPROBE_SYMBOL(vmalloc_fault); void arch_sync_kernel_mappings(unsigned long start, unsigned long end) { unsigned long addr; for (addr = start & PMD_MASK; addr >= TASK_SIZE_MAX && addr < VMALLOC_END; addr += PMD_SIZE) { struct page *page; spin_lock(&pgd_lock); list_for_each_entry(page, &pgd_list, lru) { spinlock_t *pgt_lock; /* the pgt_lock only for Xen */ pgt_lock = &pgd_page_get_mm(page)->page_table_lock; spin_lock(pgt_lock); vmalloc_sync_one(page_address(page), addr); spin_unlock(pgt_lock); } spin_unlock(&pgd_lock); } } static bool low_pfn(unsigned long pfn) { return pfn < max_low_pfn; } static void dump_pagetable(unsigned long address) { pgd_t *base = __va(read_cr3_pa()); pgd_t *pgd = &base[pgd_index(address)]; p4d_t *p4d; pud_t *pud; pmd_t *pmd; pte_t *pte; #ifdef CONFIG_X86_PAE pr_info("*pdpt = %016Lx ", pgd_val(*pgd)); if (!low_pfn(pgd_val(*pgd) >> PAGE_SHIFT) || !pgd_present(*pgd)) goto out; #define pr_pde pr_cont #else #define pr_pde pr_info #endif p4d = p4d_offset(pgd, address); pud = pud_offset(p4d, address); pmd = pmd_offset(pud, address); pr_pde("*pde = %0*Lx ", sizeof(*pmd) * 2, (u64)pmd_val(*pmd)); #undef pr_pde /* * We must not directly access the pte in the highpte * case if the page table is located in highmem. * And let's rather not kmap-atomic the pte, just in case * it's allocated already: */ if (!low_pfn(pmd_pfn(*pmd)) || !pmd_present(*pmd) || pmd_large(*pmd)) goto out; pte = pte_offset_kernel(pmd, address); pr_cont("*pte = %0*Lx ", sizeof(*pte) * 2, (u64)pte_val(*pte)); out: pr_cont("\n"); } #else /* CONFIG_X86_64: */ #ifdef CONFIG_CPU_SUP_AMD static const char errata93_warning[] = KERN_ERR "******* Your BIOS seems to not contain a fix for K8 errata #93\n" "******* Working around it, but it may cause SEGVs or burn power.\n" "******* Please consider a BIOS update.\n" "******* Disabling USB legacy in the BIOS may also help.\n"; #endif static int bad_address(void *p) { unsigned long dummy; return get_kernel_nofault(dummy, (unsigned long *)p); } static void dump_pagetable(unsigned long address) { pgd_t *base = __va(read_cr3_pa()); pgd_t *pgd = base + pgd_index(address); p4d_t *p4d; pud_t *pud; pmd_t *pmd; pte_t *pte; if (bad_address(pgd)) goto bad; pr_info("PGD %lx ", pgd_val(*pgd)); if (!pgd_present(*pgd)) goto out; p4d = p4d_offset(pgd, address); if (bad_address(p4d)) goto bad; pr_cont("P4D %lx ", p4d_val(*p4d)); if (!p4d_present(*p4d) || p4d_large(*p4d)) goto out; pud = pud_offset(p4d, address); if (bad_address(pud)) goto bad; pr_cont("PUD %lx ", pud_val(*pud)); if (!pud_present(*pud) || pud_large(*pud)) goto out; pmd = pmd_offset(pud, address); if (bad_address(pmd)) goto bad; pr_cont("PMD %lx ", pmd_val(*pmd)); if (!pmd_present(*pmd) || pmd_large(*pmd)) goto out; pte = pte_offset_kernel(pmd, address); if (bad_address(pte)) goto bad; pr_cont("PTE %lx", pte_val(*pte)); out: pr_cont("\n"); return; bad: pr_info("BAD\n"); } #endif /* CONFIG_X86_64 */ /* * Workaround for K8 erratum #93 & buggy BIOS. * * BIOS SMM functions are required to use a specific workaround * to avoid corruption of the 64bit RIP register on C stepping K8. * * A lot of BIOS that didn't get tested properly miss this. * * The OS sees this as a page fault with the upper 32bits of RIP cleared. * Try to work around it here. * * Note we only handle faults in kernel here. * Does nothing on 32-bit. */ static int is_errata93(struct pt_regs *regs, unsigned long address) { #if defined(CONFIG_X86_64) && defined(CONFIG_CPU_SUP_AMD) if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD || boot_cpu_data.x86 != 0xf) return 0; if (user_mode(regs)) return 0; if (address != regs->ip) return 0; if ((address >> 32) != 0) return 0; address |= 0xffffffffUL << 32; if ((address >= (u64)_stext && address <= (u64)_etext) || (address >= MODULES_VADDR && address <= MODULES_END)) { printk_once(errata93_warning); regs->ip = address; return 1; } #endif return 0; } /* * Work around K8 erratum #100 K8 in compat mode occasionally jumps * to illegal addresses >4GB. * * We catch this in the page fault handler because these addresses * are not reachable. Just detect this case and return. Any code * segment in LDT is compatibility mode. */ static int is_errata100(struct pt_regs *regs, unsigned long address) { #ifdef CONFIG_X86_64 if ((regs->cs == __USER32_CS || (regs->cs & (1<<2))) && (address >> 32)) return 1; #endif return 0; } /* Pentium F0 0F C7 C8 bug workaround: */ static int is_f00f_bug(struct pt_regs *regs, unsigned long error_code, unsigned long address) { #ifdef CONFIG_X86_F00F_BUG if (boot_cpu_has_bug(X86_BUG_F00F) && !(error_code & X86_PF_USER) && idt_is_f00f_address(address)) { handle_invalid_op(regs); return 1; } #endif return 0; } static void show_ldttss(const struct desc_ptr *gdt, const char *name, u16 index) { u32 offset = (index >> 3) * sizeof(struct desc_struct); unsigned long addr; struct ldttss_desc desc; if (index == 0) { pr_alert("%s: NULL\n", name); return; } if (offset + sizeof(struct ldttss_desc) >= gdt->size) { pr_alert("%s: 0x%hx -- out of bounds\n", name, index); return; } if (copy_from_kernel_nofault(&desc, (void *)(gdt->address + offset), sizeof(struct ldttss_desc))) { pr_alert("%s: 0x%hx -- GDT entry is not readable\n", name, index); return; } addr = desc.base0 | (desc.base1 << 16) | ((unsigned long)desc.base2 << 24); #ifdef CONFIG_X86_64 addr |= ((u64)desc.base3 << 32); #endif pr_alert("%s: 0x%hx -- base=0x%lx limit=0x%x\n", name, index, addr, (desc.limit0 | (desc.limit1 << 16))); } static void show_fault_oops(struct pt_regs *regs, unsigned long error_code, unsigned long address) { if (!oops_may_print()) return; if (error_code & X86_PF_INSTR) { unsigned int level; pgd_t *pgd; pte_t *pte; pgd = __va(read_cr3_pa()); pgd += pgd_index(address); pte = lookup_address_in_pgd(pgd, address, &level); if (pte && pte_present(*pte) && !pte_exec(*pte)) pr_crit("kernel tried to execute NX-protected page - exploit attempt? (uid: %d)\n", from_kuid(&init_user_ns, current_uid())); if (pte && pte_present(*pte) && pte_exec(*pte) && (pgd_flags(*pgd) & _PAGE_USER) && (__read_cr4() & X86_CR4_SMEP)) pr_crit("unable to execute userspace code (SMEP?) (uid: %d)\n", from_kuid(&init_user_ns, current_uid())); } if (address < PAGE_SIZE && !user_mode(regs)) pr_alert("BUG: kernel NULL pointer dereference, address: %px\n", (void *)address); else pr_alert("BUG: unable to handle page fault for address: %px\n", (void *)address); pr_alert("#PF: %s %s in %s mode\n", (error_code & X86_PF_USER) ? "user" : "supervisor", (error_code & X86_PF_INSTR) ? "instruction fetch" : (error_code & X86_PF_WRITE) ? "write access" : "read access", user_mode(regs) ? "user" : "kernel"); pr_alert("#PF: error_code(0x%04lx) - %s\n", error_code, !(error_code & X86_PF_PROT) ? "not-present page" : (error_code & X86_PF_RSVD) ? "reserved bit violation" : (error_code & X86_PF_PK) ? "protection keys violation" : "permissions violation"); if (!(error_code & X86_PF_USER) && user_mode(regs)) { struct desc_ptr idt, gdt; u16 ldtr, tr; /* * This can happen for quite a few reasons. The more obvious * ones are faults accessing the GDT, or LDT. Perhaps * surprisingly, if the CPU tries to deliver a benign or * contributory exception from user code and gets a page fault * during delivery, the page fault can be delivered as though * it originated directly from user code. This could happen * due to wrong permissions on the IDT, GDT, LDT, TSS, or * kernel or IST stack. */ store_idt(&idt); /* Usable even on Xen PV -- it's just slow. */ native_store_gdt(&gdt); pr_alert("IDT: 0x%lx (limit=0x%hx) GDT: 0x%lx (limit=0x%hx)\n", idt.address, idt.size, gdt.address, gdt.size); store_ldt(ldtr); show_ldttss(&gdt, "LDTR", ldtr); store_tr(tr); show_ldttss(&gdt, "TR", tr); } dump_pagetable(address); } static noinline void pgtable_bad(struct pt_regs *regs, unsigned long error_code, unsigned long address) { struct task_struct *tsk; unsigned long flags; int sig; flags = oops_begin(); tsk = current; sig = SIGKILL; printk(KERN_ALERT "%s: Corrupted page table at address %lx\n", tsk->comm, address); dump_pagetable(address); if (__die("Bad pagetable", regs, error_code)) sig = 0; oops_end(flags, regs, sig); } static void sanitize_error_code(unsigned long address, unsigned long *error_code) { /* * To avoid leaking information about the kernel page * table layout, pretend that user-mode accesses to * kernel addresses are always protection faults. * * NB: This means that failed vsyscalls with vsyscall=none * will have the PROT bit. This doesn't leak any * information and does not appear to cause any problems. */ if (address >= TASK_SIZE_MAX) *error_code |= X86_PF_PROT; } static void set_signal_archinfo(unsigned long address, unsigned long error_code) { struct task_struct *tsk = current; tsk->thread.trap_nr = X86_TRAP_PF; tsk->thread.error_code = error_code | X86_PF_USER; tsk->thread.cr2 = address; } static noinline void page_fault_oops(struct pt_regs *regs, unsigned long error_code, unsigned long address) { #ifdef CONFIG_VMAP_STACK struct stack_info info; #endif unsigned long flags; int sig; if (user_mode(regs)) { /* * Implicit kernel access from user mode? Skip the stack * overflow and EFI special cases. */ goto oops; } #ifdef CONFIG_VMAP_STACK /* * Stack overflow? During boot, we can fault near the initial * stack in the direct map, but that's not an overflow -- check * that we're in vmalloc space to avoid this. */ if (is_vmalloc_addr((void *)address) && get_stack_guard_info((void *)address, &info)) { /* * We're likely to be running with very little stack space * left. It's plausible that we'd hit this condition but * double-fault even before we get this far, in which case * we're fine: the double-fault handler will deal with it. * * We don't want to make it all the way into the oops code * and then double-fault, though, because we're likely to * break the console driver and lose most of the stack dump. */ call_on_stack(__this_cpu_ist_top_va(DF) - sizeof(void*), handle_stack_overflow, ASM_CALL_ARG3, , [arg1] "r" (regs), [arg2] "r" (address), [arg3] "r" (&info)); unreachable(); } #endif /* * Buggy firmware could access regions which might page fault. If * this happens, EFI has a special OOPS path that will try to * avoid hanging the system. */ if (IS_ENABLED(CONFIG_EFI)) efi_crash_gracefully_on_page_fault(address); /* Only not-present faults should be handled by KFENCE. */ if (!(error_code & X86_PF_PROT) && kfence_handle_page_fault(address, error_code & X86_PF_WRITE, regs)) return; oops: /* * Oops. The kernel tried to access some bad page. We'll have to * terminate things with extreme prejudice: */ flags = oops_begin(); show_fault_oops(regs, error_code, address); if (task_stack_end_corrupted(current)) printk(KERN_EMERG "Thread overran stack, or stack corrupted\n"); sig = SIGKILL; if (__die("Oops", regs, error_code)) sig = 0; /* Executive summary in case the body of the oops scrolled away */ printk(KERN_DEFAULT "CR2: %016lx\n", address); oops_end(flags, regs, sig); } static noinline void kernelmode_fixup_or_oops(struct pt_regs *regs, unsigned long error_code, unsigned long address, int signal, int si_code, u32 pkey) { WARN_ON_ONCE(user_mode(regs)); /* Are we prepared to handle this kernel fault? */ if (fixup_exception(regs, X86_TRAP_PF, error_code, address)) { /* * Any interrupt that takes a fault gets the fixup. This makes * the below recursive fault logic only apply to a faults from * task context. */ if (in_interrupt()) return; /* * Per the above we're !in_interrupt(), aka. task context. * * In this case we need to make sure we're not recursively * faulting through the emulate_vsyscall() logic. */ if (current->thread.sig_on_uaccess_err && signal) { sanitize_error_code(address, &error_code); set_signal_archinfo(address, error_code); if (si_code == SEGV_PKUERR) { force_sig_pkuerr((void __user *)address, pkey); } else { /* XXX: hwpoison faults will set the wrong code. */ force_sig_fault(signal, si_code, (void __user *)address); } } /* * Barring that, we can do the fixup and be happy. */ return; } /* * AMD erratum #91 manifests as a spurious page fault on a PREFETCH * instruction. */ if (is_prefetch(regs, error_code, address)) return; page_fault_oops(regs, error_code, address); } /* * Print out info about fatal segfaults, if the show_unhandled_signals * sysctl is set: */ static inline void show_signal_msg(struct pt_regs *regs, unsigned long error_code, unsigned long address, struct task_struct *tsk) { const char *loglvl = task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG; /* This is a racy snapshot, but it's better than nothing. */ int cpu = raw_smp_processor_id(); if (!unhandled_signal(tsk, SIGSEGV)) return; if (!printk_ratelimit()) return; printk("%s%s[%d]: segfault at %lx ip %px sp %px error %lx", loglvl, tsk->comm, task_pid_nr(tsk), address, (void *)regs->ip, (void *)regs->sp, error_code); print_vma_addr(KERN_CONT " in ", regs->ip); /* * Dump the likely CPU where the fatal segfault happened. * This can help identify faulty hardware. */ printk(KERN_CONT " likely on CPU %d (core %d, socket %d)", cpu, topology_core_id(cpu), topology_physical_package_id(cpu)); printk(KERN_CONT "\n"); show_opcodes(regs, loglvl); } static void __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code, unsigned long address, u32 pkey, int si_code) { struct task_struct *tsk = current; if (!user_mode(regs)) { kernelmode_fixup_or_oops(regs, error_code, address, SIGSEGV, si_code, pkey); return; } if (!(error_code & X86_PF_USER)) { /* Implicit user access to kernel memory -- just oops */ page_fault_oops(regs, error_code, address); return; } /* * User mode accesses just cause a SIGSEGV. * It's possible to have interrupts off here: */ local_irq_enable(); /* * Valid to do another page fault here because this one came * from user space: */ if (is_prefetch(regs, error_code, address)) return; if (is_errata100(regs, address)) return; sanitize_error_code(address, &error_code); if (fixup_vdso_exception(regs, X86_TRAP_PF, error_code, address)) return; if (likely(show_unhandled_signals)) show_signal_msg(regs, error_code, address, tsk); set_signal_archinfo(address, error_code); if (si_code == SEGV_PKUERR) force_sig_pkuerr((void __user *)address, pkey); else force_sig_fault(SIGSEGV, si_code, (void __user *)address); local_irq_disable(); } static noinline void bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code, unsigned long address) { __bad_area_nosemaphore(regs, error_code, address, 0, SEGV_MAPERR); } static void __bad_area(struct pt_regs *regs, unsigned long error_code, unsigned long address, u32 pkey, int si_code) { struct mm_struct *mm = current->mm; /* * Something tried to access memory that isn't in our memory map.. * Fix it, but check if it's kernel or user first.. */ mmap_read_unlock(mm); __bad_area_nosemaphore(regs, error_code, address, pkey, si_code); } static inline bool bad_area_access_from_pkeys(unsigned long error_code, struct vm_area_struct *vma) { /* This code is always called on the current mm */ bool foreign = false; if (!cpu_feature_enabled(X86_FEATURE_OSPKE)) return false; if (error_code & X86_PF_PK) return true; /* this checks permission keys on the VMA: */ if (!arch_vma_access_permitted(vma, (error_code & X86_PF_WRITE), (error_code & X86_PF_INSTR), foreign)) return true; return false; } static noinline void bad_area_access_error(struct pt_regs *regs, unsigned long error_code, unsigned long address, struct vm_area_struct *vma) { /* * This OSPKE check is not strictly necessary at runtime. * But, doing it this way allows compiler optimizations * if pkeys are compiled out. */ if (bad_area_access_from_pkeys(error_code, vma)) { /* * A protection key fault means that the PKRU value did not allow * access to some PTE. Userspace can figure out what PKRU was * from the XSAVE state. This function captures the pkey from * the vma and passes it to userspace so userspace can discover * which protection key was set on the PTE. * * If we get here, we know that the hardware signaled a X86_PF_PK * fault and that there was a VMA once we got in the fault * handler. It does *not* guarantee that the VMA we find here * was the one that we faulted on. * * 1. T1 : mprotect_key(foo, PAGE_SIZE, pkey=4); * 2. T1 : set PKRU to deny access to pkey=4, touches page * 3. T1 : faults... * 4. T2: mprotect_key(foo, PAGE_SIZE, pkey=5); * 5. T1 : enters fault handler, takes mmap_lock, etc... * 6. T1 : reaches here, sees vma_pkey(vma)=5, when we really * faulted on a pte with its pkey=4. */ u32 pkey = vma_pkey(vma); __bad_area(regs, error_code, address, pkey, SEGV_PKUERR); } else { __bad_area(regs, error_code, address, 0, SEGV_ACCERR); } } static void do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address, vm_fault_t fault) { /* Kernel mode? Handle exceptions or die: */ if (!user_mode(regs)) { kernelmode_fixup_or_oops(regs, error_code, address, SIGBUS, BUS_ADRERR, ARCH_DEFAULT_PKEY); return; } /* User-space => ok to do another page fault: */ if (is_prefetch(regs, error_code, address)) return; sanitize_error_code(address, &error_code); if (fixup_vdso_exception(regs, X86_TRAP_PF, error_code, address)) return; set_signal_archinfo(address, error_code); #ifdef CONFIG_MEMORY_FAILURE if (fault & (VM_FAULT_HWPOISON|VM_FAULT_HWPOISON_LARGE)) { struct task_struct *tsk = current; unsigned lsb = 0; pr_err( "MCE: Killing %s:%d due to hardware memory corruption fault at %lx\n", tsk->comm, tsk->pid, address); if (fault & VM_FAULT_HWPOISON_LARGE) lsb = hstate_index_to_shift(VM_FAULT_GET_HINDEX(fault)); if (fault & VM_FAULT_HWPOISON) lsb = PAGE_SHIFT; force_sig_mceerr(BUS_MCEERR_AR, (void __user *)address, lsb); return; } #endif force_sig_fault(SIGBUS, BUS_ADRERR, (void __user *)address); } static int spurious_kernel_fault_check(unsigned long error_code, pte_t *pte) { if ((error_code & X86_PF_WRITE) && !pte_write(*pte)) return 0; if ((error_code & X86_PF_INSTR) && !pte_exec(*pte)) return 0; return 1; } /* * Handle a spurious fault caused by a stale TLB entry. * * This allows us to lazily refresh the TLB when increasing the * permissions of a kernel page (RO -> RW or NX -> X). Doing it * eagerly is very expensive since that implies doing a full * cross-processor TLB flush, even if no stale TLB entries exist * on other processors. * * Spurious faults may only occur if the TLB contains an entry with * fewer permission than the page table entry. Non-present (P = 0) * and reserved bit (R = 1) faults are never spurious. * * There are no security implications to leaving a stale TLB when * increasing the permissions on a page. * * Returns non-zero if a spurious fault was handled, zero otherwise. * * See Intel Developer's Manual Vol 3 Section 4.10.4.3, bullet 3 * (Optional Invalidation). */ static noinline int spurious_kernel_fault(unsigned long error_code, unsigned long address) { pgd_t *pgd; p4d_t *p4d; pud_t *pud; pmd_t *pmd; pte_t *pte; int ret; /* * Only writes to RO or instruction fetches from NX may cause * spurious faults. * * These could be from user or supervisor accesses but the TLB * is only lazily flushed after a kernel mapping protection * change, so user accesses are not expected to cause spurious * faults. */ if (error_code != (X86_PF_WRITE | X86_PF_PROT) && error_code != (X86_PF_INSTR | X86_PF_PROT)) return 0; pgd = init_mm.pgd + pgd_index(address); if (!pgd_present(*pgd)) return 0; p4d = p4d_offset(pgd, address); if (!p4d_present(*p4d)) return 0; if (p4d_large(*p4d)) return spurious_kernel_fault_check(error_code, (pte_t *) p4d); pud = pud_offset(p4d, address); if (!pud_present(*pud)) return 0; if (pud_large(*pud)) return spurious_kernel_fault_check(error_code, (pte_t *) pud); pmd = pmd_offset(pud, address); if (!pmd_present(*pmd)) return 0; if (pmd_large(*pmd)) return spurious_kernel_fault_check(error_code, (pte_t *) pmd); pte = pte_offset_kernel(pmd, address); if (!pte_present(*pte)) return 0; ret = spurious_kernel_fault_check(error_code, pte); if (!ret) return 0; /* * Make sure we have permissions in PMD. * If not, then there's a bug in the page tables: */ ret = spurious_kernel_fault_check(error_code, (pte_t *) pmd); WARN_ONCE(!ret, "PMD has incorrect permission bits\n"); return ret; } NOKPROBE_SYMBOL(spurious_kernel_fault); int show_unhandled_signals = 1; static inline int access_error(unsigned long error_code, struct vm_area_struct *vma) { /* This is only called for the current mm, so: */ bool foreign = false; /* * Read or write was blocked by protection keys. This is * always an unconditional error and can never result in * a follow-up action to resolve the fault, like a COW. */ if (error_code & X86_PF_PK) return 1; /* * SGX hardware blocked the access. This usually happens * when the enclave memory contents have been destroyed, like * after a suspend/resume cycle. In any case, the kernel can't * fix the cause of the fault. Handle the fault as an access * error even in cases where no actual access violation * occurred. This allows userspace to rebuild the enclave in * response to the signal. */ if (unlikely(error_code & X86_PF_SGX)) return 1; /* * Make sure to check the VMA so that we do not perform * faults just to hit a X86_PF_PK as soon as we fill in a * page. */ if (!arch_vma_access_permitted(vma, (error_code & X86_PF_WRITE), (error_code & X86_PF_INSTR), foreign)) return 1; /* * Shadow stack accesses (PF_SHSTK=1) are only permitted to * shadow stack VMAs. All other accesses result in an error. */ if (error_code & X86_PF_SHSTK) { if (unlikely(!(vma->vm_flags & VM_SHADOW_STACK))) return 1; if (unlikely(!(vma->vm_flags & VM_WRITE))) return 1; return 0; } if (error_code & X86_PF_WRITE) { /* write, present and write, not present: */ if (unlikely(vma->vm_flags & VM_SHADOW_STACK)) return 1; if (unlikely(!(vma->vm_flags & VM_WRITE))) return 1; return 0; } /* read, present: */ if (unlikely(error_code & X86_PF_PROT)) return 1; /* read, not present: */ if (unlikely(!vma_is_accessible(vma))) return 1; return 0; } bool fault_in_kernel_space(unsigned long address) { /* * On 64-bit systems, the vsyscall page is at an address above * TASK_SIZE_MAX, but is not considered part of the kernel * address space. */ if (IS_ENABLED(CONFIG_X86_64) && is_vsyscall_vaddr(address)) return false; return address >= TASK_SIZE_MAX; } /* * Called for all faults where 'address' is part of the kernel address * space. Might get called for faults that originate from *code* that * ran in userspace or the kernel. */ static void do_kern_addr_fault(struct pt_regs *regs, unsigned long hw_error_code, unsigned long address) { /* * Protection keys exceptions only happen on user pages. We * have no user pages in the kernel portion of the address * space, so do not expect them here. */ WARN_ON_ONCE(hw_error_code & X86_PF_PK); #ifdef CONFIG_X86_32 /* * We can fault-in kernel-space virtual memory on-demand. The * 'reference' page table is init_mm.pgd. * * NOTE! We MUST NOT take any locks for this case. We may * be in an interrupt or a critical region, and should * only copy the information from the master page table, * nothing more. * * Before doing this on-demand faulting, ensure that the * fault is not any of the following: * 1. A fault on a PTE with a reserved bit set. * 2. A fault caused by a user-mode access. (Do not demand- * fault kernel memory due to user-mode accesses). * 3. A fault caused by a page-level protection violation. * (A demand fault would be on a non-present page which * would have X86_PF_PROT==0). * * This is only needed to close a race condition on x86-32 in * the vmalloc mapping/unmapping code. See the comment above * vmalloc_fault() for details. On x86-64 the race does not * exist as the vmalloc mappings don't need to be synchronized * there. */ if (!(hw_error_code & (X86_PF_RSVD | X86_PF_USER | X86_PF_PROT))) { if (vmalloc_fault(address) >= 0) return; } #endif if (is_f00f_bug(regs, hw_error_code, address)) return; /* Was the fault spurious, caused by lazy TLB invalidation? */ if (spurious_kernel_fault(hw_error_code, address)) return; /* kprobes don't want to hook the spurious faults: */ if (WARN_ON_ONCE(kprobe_page_fault(regs, X86_TRAP_PF))) return; /* * Note, despite being a "bad area", there are quite a few * acceptable reasons to get here, such as erratum fixups * and handling kernel code that can fault, like get_user(). * * Don't take the mm semaphore here. If we fixup a prefetch * fault we could otherwise deadlock: */ bad_area_nosemaphore(regs, hw_error_code, address); } NOKPROBE_SYMBOL(do_kern_addr_fault); /* * Handle faults in the user portion of the address space. Nothing in here * should check X86_PF_USER without a specific justification: for almost * all purposes, we should treat a normal kernel access to user memory * (e.g. get_user(), put_user(), etc.) the same as the WRUSS instruction. * The one exception is AC flag handling, which is, per the x86 * architecture, special for WRUSS. */ static inline void do_user_addr_fault(struct pt_regs *regs, unsigned long error_code, unsigned long address) { struct vm_area_struct *vma; struct task_struct *tsk; struct mm_struct *mm; vm_fault_t fault; unsigned int flags = FAULT_FLAG_DEFAULT; tsk = current; mm = tsk->mm; if (unlikely((error_code & (X86_PF_USER | X86_PF_INSTR)) == X86_PF_INSTR)) { /* * Whoops, this is kernel mode code trying to execute from * user memory. Unless this is AMD erratum #93, which * corrupts RIP such that it looks like a user address, * this is unrecoverable. Don't even try to look up the * VMA or look for extable entries. */ if (is_errata93(regs, address)) return; page_fault_oops(regs, error_code, address); return; } /* kprobes don't want to hook the spurious faults: */ if (WARN_ON_ONCE(kprobe_page_fault(regs, X86_TRAP_PF))) return; /* * Reserved bits are never expected to be set on * entries in the user portion of the page tables. */ if (unlikely(error_code & X86_PF_RSVD)) pgtable_bad(regs, error_code, address); /* * If SMAP is on, check for invalid kernel (supervisor) access to user * pages in the user address space. The odd case here is WRUSS, * which, according to the preliminary documentation, does not respect * SMAP and will have the USER bit set so, in all cases, SMAP * enforcement appears to be consistent with the USER bit. */ if (unlikely(cpu_feature_enabled(X86_FEATURE_SMAP) && !(error_code & X86_PF_USER) && !(regs->flags & X86_EFLAGS_AC))) { /* * No extable entry here. This was a kernel access to an * invalid pointer. get_kernel_nofault() will not get here. */ page_fault_oops(regs, error_code, address); return; } /* * If we're in an interrupt, have no user context or are running * in a region with pagefaults disabled then we must not take the fault */ if (unlikely(faulthandler_disabled() || !mm)) { bad_area_nosemaphore(regs, error_code, address); return; } /* * It's safe to allow irq's after cr2 has been saved and the * vmalloc fault has been handled. * * User-mode registers count as a user access even for any * potential system fault or CPU buglet: */ if (user_mode(regs)) { local_irq_enable(); flags |= FAULT_FLAG_USER; } else { if (regs->flags & X86_EFLAGS_IF) local_irq_enable(); } perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address); /* * Read-only permissions can not be expressed in shadow stack PTEs. * Treat all shadow stack accesses as WRITE faults. This ensures * that the MM will prepare everything (e.g., break COW) such that * maybe_mkwrite() can create a proper shadow stack PTE. */ if (error_code & X86_PF_SHSTK) flags |= FAULT_FLAG_WRITE; if (error_code & X86_PF_WRITE) flags |= FAULT_FLAG_WRITE; if (error_code & X86_PF_INSTR) flags |= FAULT_FLAG_INSTRUCTION; #ifdef CONFIG_X86_64 /* * Faults in the vsyscall page might need emulation. The * vsyscall page is at a high address (>PAGE_OFFSET), but is * considered to be part of the user address space. * * The vsyscall page does not have a "real" VMA, so do this * emulation before we go searching for VMAs. * * PKRU never rejects instruction fetches, so we don't need * to consider the PF_PK bit. */ if (is_vsyscall_vaddr(address)) { if (emulate_vsyscall(error_code, regs, address)) return; } #endif if (!(flags & FAULT_FLAG_USER)) goto lock_mmap; vma = lock_vma_under_rcu(mm, address); if (!vma) goto lock_mmap; if (unlikely(access_error(error_code, vma))) { vma_end_read(vma); goto lock_mmap; } fault = handle_mm_fault(vma, address, flags | FAULT_FLAG_VMA_LOCK, regs); if (!(fault & (VM_FAULT_RETRY | VM_FAULT_COMPLETED))) vma_end_read(vma); if (!(fault & VM_FAULT_RETRY)) { count_vm_vma_lock_event(VMA_LOCK_SUCCESS); goto done; } count_vm_vma_lock_event(VMA_LOCK_RETRY); if (fault & VM_FAULT_MAJOR) flags |= FAULT_FLAG_TRIED; /* Quick path to respond to signals */ if (fault_signal_pending(fault, regs)) { if (!user_mode(regs)) kernelmode_fixup_or_oops(regs, error_code, address, SIGBUS, BUS_ADRERR, ARCH_DEFAULT_PKEY); return; } lock_mmap: retry: vma = lock_mm_and_find_vma(mm, address, regs); if (unlikely(!vma)) { bad_area_nosemaphore(regs, error_code, address); return; } /* * Ok, we have a good vm_area for this memory access, so * we can handle it.. */ if (unlikely(access_error(error_code, vma))) { bad_area_access_error(regs, error_code, address, vma); return; } /* * If for any reason at all we couldn't handle the fault, * make sure we exit gracefully rather than endlessly redo * the fault. Since we never set FAULT_FLAG_RETRY_NOWAIT, if * we get VM_FAULT_RETRY back, the mmap_lock has been unlocked. * * Note that handle_userfault() may also release and reacquire mmap_lock * (and not return with VM_FAULT_RETRY), when returning to userland to * repeat the page fault later with a VM_FAULT_NOPAGE retval * (potentially after handling any pending signal during the return to * userland). The return to userland is identified whenever * FAULT_FLAG_USER|FAULT_FLAG_KILLABLE are both set in flags. */ fault = handle_mm_fault(vma, address, flags, regs); if (fault_signal_pending(fault, regs)) { /* * Quick path to respond to signals. The core mm code * has unlocked the mm for us if we get here. */ if (!user_mode(regs)) kernelmode_fixup_or_oops(regs, error_code, address, SIGBUS, BUS_ADRERR, ARCH_DEFAULT_PKEY); return; } /* The fault is fully completed (including releasing mmap lock) */ if (fault & VM_FAULT_COMPLETED) return; /* * If we need to retry the mmap_lock has already been released, * and if there is a fatal signal pending there is no guarantee * that we made any progress. Handle this case first. */ if (unlikely(fault & VM_FAULT_RETRY)) { flags |= FAULT_FLAG_TRIED; goto retry; } mmap_read_unlock(mm); done: if (likely(!(fault & VM_FAULT_ERROR))) return; if (fatal_signal_pending(current) && !user_mode(regs)) { kernelmode_fixup_or_oops(regs, error_code, address, 0, 0, ARCH_DEFAULT_PKEY); return; } if (fault & VM_FAULT_OOM) { /* Kernel mode? Handle exceptions or die: */ if (!user_mode(regs)) { kernelmode_fixup_or_oops(regs, error_code, address, SIGSEGV, SEGV_MAPERR, ARCH_DEFAULT_PKEY); return; } /* * We ran out of memory, call the OOM killer, and return the * userspace (which will retry the fault, or kill us if we got * oom-killed): */ pagefault_out_of_memory(); } else { if (fault & (VM_FAULT_SIGBUS|VM_FAULT_HWPOISON| VM_FAULT_HWPOISON_LARGE)) do_sigbus(regs, error_code, address, fault); else if (fault & VM_FAULT_SIGSEGV) bad_area_nosemaphore(regs, error_code, address); else BUG(); } } NOKPROBE_SYMBOL(do_user_addr_fault); static __always_inline void trace_page_fault_entries(struct pt_regs *regs, unsigned long error_code, unsigned long address) { if (!trace_pagefault_enabled()) return; if (user_mode(regs)) trace_page_fault_user(address, regs, error_code); else trace_page_fault_kernel(address, regs, error_code); } static __always_inline void handle_page_fault(struct pt_regs *regs, unsigned long error_code, unsigned long address) { trace_page_fault_entries(regs, error_code, address); if (unlikely(kmmio_fault(regs, address))) return; /* Was the fault on kernel-controlled part of the address space? */ if (unlikely(fault_in_kernel_space(address))) { do_kern_addr_fault(regs, error_code, address); } else { do_user_addr_fault(regs, error_code, address); /* * User address page fault handling might have reenabled * interrupts. Fixing up all potential exit points of * do_user_addr_fault() and its leaf functions is just not * doable w/o creating an unholy mess or turning the code * upside down. */ local_irq_disable(); } } DEFINE_IDTENTRY_RAW_ERRORCODE(exc_page_fault) { unsigned long address = read_cr2(); irqentry_state_t state; prefetchw(¤t->mm->mmap_lock); /* * KVM uses #PF vector to deliver 'page not present' events to guests * (asynchronous page fault mechanism). The event happens when a * userspace task is trying to access some valid (from guest's point of * view) memory which is not currently mapped by the host (e.g. the * memory is swapped out). Note, the corresponding "page ready" event * which is injected when the memory becomes available, is delivered via * an interrupt mechanism and not a #PF exception * (see arch/x86/kernel/kvm.c: sysvec_kvm_asyncpf_interrupt()). * * We are relying on the interrupted context being sane (valid RSP, * relevant locks not held, etc.), which is fine as long as the * interrupted context had IF=1. We are also relying on the KVM * async pf type field and CR2 being read consistently instead of * getting values from real and async page faults mixed up. * * Fingers crossed. * * The async #PF handling code takes care of idtentry handling * itself. */ if (kvm_handle_async_pf(regs, (u32)address)) return; /* * Entry handling for valid #PF from kernel mode is slightly * different: RCU is already watching and ct_irq_enter() must not * be invoked because a kernel fault on a user space address might * sleep. * * In case the fault hit a RCU idle region the conditional entry * code reenabled RCU to avoid subsequent wreckage which helps * debuggability. */ state = irqentry_enter(regs); instrumentation_begin(); handle_page_fault(regs, error_code, address); instrumentation_end(); irqentry_exit(regs, state); } |
814 735 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_PARAVIRT_H #define _ASM_X86_PARAVIRT_H /* Various instructions on x86 need to be replaced for * para-virtualization: those hooks are defined here. */ #include <asm/paravirt_types.h> #ifndef __ASSEMBLY__ struct mm_struct; #endif #ifdef CONFIG_PARAVIRT #include <asm/pgtable_types.h> #include <asm/asm.h> #include <asm/nospec-branch.h> #ifndef __ASSEMBLY__ #include <linux/bug.h> #include <linux/types.h> #include <linux/cpumask.h> #include <linux/static_call_types.h> #include <asm/frame.h> u64 dummy_steal_clock(int cpu); u64 dummy_sched_clock(void); DECLARE_STATIC_CALL(pv_steal_clock, dummy_steal_clock); DECLARE_STATIC_CALL(pv_sched_clock, dummy_sched_clock); void paravirt_set_sched_clock(u64 (*func)(void)); static __always_inline u64 paravirt_sched_clock(void) { return static_call(pv_sched_clock)(); } struct static_key; extern struct static_key paravirt_steal_enabled; extern struct static_key paravirt_steal_rq_enabled; __visible void __native_queued_spin_unlock(struct qspinlock *lock); bool pv_is_native_spin_unlock(void); __visible bool __native_vcpu_is_preempted(long cpu); bool pv_is_native_vcpu_is_preempted(void); static inline u64 paravirt_steal_clock(int cpu) { return static_call(pv_steal_clock)(cpu); } #ifdef CONFIG_PARAVIRT_SPINLOCKS void __init paravirt_set_cap(void); #endif /* The paravirtualized I/O functions */ static inline void slow_down_io(void) { PVOP_VCALL0(cpu.io_delay); #ifdef REALLY_SLOW_IO PVOP_VCALL0(cpu.io_delay); PVOP_VCALL0(cpu.io_delay); PVOP_VCALL0(cpu.io_delay); #endif } void native_flush_tlb_local(void); void native_flush_tlb_global(void); void native_flush_tlb_one_user(unsigned long addr); void native_flush_tlb_multi(const struct cpumask *cpumask, const struct flush_tlb_info *info); static inline void __flush_tlb_local(void) { PVOP_VCALL0(mmu.flush_tlb_user); } static inline void __flush_tlb_global(void) { PVOP_VCALL0(mmu.flush_tlb_kernel); } static inline void __flush_tlb_one_user(unsigned long addr) { PVOP_VCALL1(mmu.flush_tlb_one_user, addr); } static inline void __flush_tlb_multi(const struct cpumask *cpumask, const struct flush_tlb_info *info) { PVOP_VCALL2(mmu.flush_tlb_multi, cpumask, info); } static inline void paravirt_tlb_remove_table(struct mmu_gather *tlb, void *table) { PVOP_VCALL2(mmu.tlb_remove_table, tlb, table); } static inline void paravirt_arch_exit_mmap(struct mm_struct *mm) { PVOP_VCALL1(mmu.exit_mmap, mm); } static inline void notify_page_enc_status_changed(unsigned long pfn, int npages, bool enc) { PVOP_VCALL3(mmu.notify_page_enc_status_changed, pfn, npages, enc); } #ifdef CONFIG_PARAVIRT_XXL static inline void load_sp0(unsigned long sp0) { PVOP_VCALL1(cpu.load_sp0, sp0); } /* The paravirtualized CPUID instruction. */ static inline void __cpuid(unsigned int *eax, unsigned int *ebx, unsigned int *ecx, unsigned int *edx) { PVOP_VCALL4(cpu.cpuid, eax, ebx, ecx, edx); } /* * These special macros can be used to get or set a debugging register */ static __always_inline unsigned long paravirt_get_debugreg(int reg) { return PVOP_CALL1(unsigned long, cpu.get_debugreg, reg); } #define get_debugreg(var, reg) var = paravirt_get_debugreg(reg) static __always_inline void set_debugreg(unsigned long val, int reg) { PVOP_VCALL2(cpu.set_debugreg, reg, val); } static inline unsigned long read_cr0(void) { return PVOP_CALL0(unsigned long, cpu.read_cr0); } static inline void write_cr0(unsigned long x) { PVOP_VCALL1(cpu.write_cr0, x); } static __always_inline unsigned long read_cr2(void) { return PVOP_ALT_CALLEE0(unsigned long, mmu.read_cr2, "mov %%cr2, %%rax;", ALT_NOT_XEN); } static __always_inline void write_cr2(unsigned long x) { PVOP_VCALL1(mmu.write_cr2, x); } static inline unsigned long __read_cr3(void) { return PVOP_ALT_CALL0(unsigned long, mmu.read_cr3, "mov %%cr3, %%rax;", ALT_NOT_XEN); } static inline void write_cr3(unsigned long x) { PVOP_ALT_VCALL1(mmu.write_cr3, x, "mov %%rdi, %%cr3", ALT_NOT_XEN); } static inline void __write_cr4(unsigned long x) { PVOP_VCALL1(cpu.write_cr4, x); } static __always_inline void arch_safe_halt(void) { PVOP_VCALL0(irq.safe_halt); } static inline void halt(void) { PVOP_VCALL0(irq.halt); } extern noinstr void pv_native_wbinvd(void); static __always_inline void wbinvd(void) { PVOP_ALT_VCALL0(cpu.wbinvd, "wbinvd", ALT_NOT_XEN); } static inline u64 paravirt_read_msr(unsigned msr) { return PVOP_CALL1(u64, cpu.read_msr, msr); } static inline void paravirt_write_msr(unsigned msr, unsigned low, unsigned high) { PVOP_VCALL3(cpu.write_msr, msr, low, high); } static inline u64 paravirt_read_msr_safe(unsigned msr, int *err) { return PVOP_CALL2(u64, cpu.read_msr_safe, msr, err); } static inline int paravirt_write_msr_safe(unsigned msr, unsigned low, unsigned high) { return PVOP_CALL3(int, cpu.write_msr_safe, msr, low, high); } #define rdmsr(msr, val1, val2) \ do { \ u64 _l = paravirt_read_msr(msr); \ val1 = (u32)_l; \ val2 = _l >> 32; \ } while (0) #define wrmsr(msr, val1, val2) \ do { \ paravirt_write_msr(msr, val1, val2); \ } while (0) #define rdmsrl(msr, val) \ do { \ val = paravirt_read_msr(msr); \ } while (0) static inline void wrmsrl(unsigned msr, u64 val) { wrmsr(msr, (u32)val, (u32)(val>>32)); } #define wrmsr_safe(msr, a, b) paravirt_write_msr_safe(msr, a, b) /* rdmsr with exception handling */ #define rdmsr_safe(msr, a, b) \ ({ \ int _err; \ u64 _l = paravirt_read_msr_safe(msr, &_err); \ (*a) = (u32)_l; \ (*b) = _l >> 32; \ _err; \ }) static inline int rdmsrl_safe(unsigned msr, unsigned long long *p) { int err; *p = paravirt_read_msr_safe(msr, &err); return err; } static inline unsigned long long paravirt_read_pmc(int counter) { return PVOP_CALL1(u64, cpu.read_pmc, counter); } #define rdpmc(counter, low, high) \ do { \ u64 _l = paravirt_read_pmc(counter); \ low = (u32)_l; \ high = _l >> 32; \ } while (0) #define rdpmcl(counter, val) ((val) = paravirt_read_pmc(counter)) static inline void paravirt_alloc_ldt(struct desc_struct *ldt, unsigned entries) { PVOP_VCALL2(cpu.alloc_ldt, ldt, entries); } static inline void paravirt_free_ldt(struct desc_struct *ldt, unsigned entries) { PVOP_VCALL2(cpu.free_ldt, ldt, entries); } static inline void load_TR_desc(void) { PVOP_VCALL0(cpu.load_tr_desc); } static inline void load_gdt(const struct desc_ptr *dtr) { PVOP_VCALL1(cpu.load_gdt, dtr); } static inline void load_idt(const struct desc_ptr *dtr) { PVOP_VCALL1(cpu.load_idt, dtr); } static inline void set_ldt(const void *addr, unsigned entries) { PVOP_VCALL2(cpu.set_ldt, addr, entries); } static inline unsigned long paravirt_store_tr(void) { return PVOP_CALL0(unsigned long, cpu.store_tr); } #define store_tr(tr) ((tr) = paravirt_store_tr()) static inline void load_TLS(struct thread_struct *t, unsigned cpu) { PVOP_VCALL2(cpu.load_tls, t, cpu); } static inline void load_gs_index(unsigned int gs) { PVOP_VCALL1(cpu.load_gs_index, gs); } static inline void write_ldt_entry(struct desc_struct *dt, int entry, const void *desc) { PVOP_VCALL3(cpu.write_ldt_entry, dt, entry, desc); } static inline void write_gdt_entry(struct desc_struct *dt, int entry, void *desc, int type) { PVOP_VCALL4(cpu.write_gdt_entry, dt, entry, desc, type); } static inline void write_idt_entry(gate_desc *dt, int entry, const gate_desc *g) { PVOP_VCALL3(cpu.write_idt_entry, dt, entry, g); } #ifdef CONFIG_X86_IOPL_IOPERM static inline void tss_invalidate_io_bitmap(void) { PVOP_VCALL0(cpu.invalidate_io_bitmap); } static inline void tss_update_io_bitmap(void) { PVOP_VCALL0(cpu.update_io_bitmap); } #endif static inline void paravirt_enter_mmap(struct mm_struct *next) { PVOP_VCALL1(mmu.enter_mmap, next); } static inline int paravirt_pgd_alloc(struct mm_struct *mm) { return PVOP_CALL1(int, mmu.pgd_alloc, mm); } static inline void paravirt_pgd_free(struct mm_struct *mm, pgd_t *pgd) { PVOP_VCALL2(mmu.pgd_free, mm, pgd); } static inline void paravirt_alloc_pte(struct mm_struct *mm, unsigned long pfn) { PVOP_VCALL2(mmu.alloc_pte, mm, pfn); } static inline void paravirt_release_pte(unsigned long pfn) { PVOP_VCALL1(mmu.release_pte, pfn); } static inline void paravirt_alloc_pmd(struct mm_struct *mm, unsigned long pfn) { PVOP_VCALL2(mmu.alloc_pmd, mm, pfn); } static inline void paravirt_release_pmd(unsigned long pfn) { PVOP_VCALL1(mmu.release_pmd, pfn); } static inline void paravirt_alloc_pud(struct mm_struct *mm, unsigned long pfn) { PVOP_VCALL2(mmu.alloc_pud, mm, pfn); } static inline void paravirt_release_pud(unsigned long pfn) { PVOP_VCALL1(mmu.release_pud, pfn); } static inline void paravirt_alloc_p4d(struct mm_struct *mm, unsigned long pfn) { PVOP_VCALL2(mmu.alloc_p4d, mm, pfn); } static inline void paravirt_release_p4d(unsigned long pfn) { PVOP_VCALL1(mmu.release_p4d, pfn); } static inline pte_t __pte(pteval_t val) { return (pte_t) { PVOP_ALT_CALLEE1(pteval_t, mmu.make_pte, val, "mov %%rdi, %%rax", ALT_NOT_XEN) }; } static inline pteval_t pte_val(pte_t pte) { return PVOP_ALT_CALLEE1(pteval_t, mmu.pte_val, pte.pte, "mov %%rdi, %%rax", ALT_NOT_XEN); } static inline pgd_t __pgd(pgdval_t val) { return (pgd_t) { PVOP_ALT_CALLEE1(pgdval_t, mmu.make_pgd, val, "mov %%rdi, %%rax", ALT_NOT_XEN) }; } static inline pgdval_t pgd_val(pgd_t pgd) { return PVOP_ALT_CALLEE1(pgdval_t, mmu.pgd_val, pgd.pgd, "mov %%rdi, %%rax", ALT_NOT_XEN); } #define __HAVE_ARCH_PTEP_MODIFY_PROT_TRANSACTION static inline pte_t ptep_modify_prot_start(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) { pteval_t ret; ret = PVOP_CALL3(pteval_t, mmu.ptep_modify_prot_start, vma, addr, ptep); return (pte_t) { .pte = ret }; } static inline void ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, pte_t old_pte, pte_t pte) { PVOP_VCALL4(mmu.ptep_modify_prot_commit, vma, addr, ptep, pte.pte); } static inline void set_pte(pte_t *ptep, pte_t pte) { PVOP_VCALL2(mmu.set_pte, ptep, pte.pte); } static inline void set_pmd(pmd_t *pmdp, pmd_t pmd) { PVOP_VCALL2(mmu.set_pmd, pmdp, native_pmd_val(pmd)); } static inline pmd_t __pmd(pmdval_t val) { return (pmd_t) { PVOP_ALT_CALLEE1(pmdval_t, mmu.make_pmd, val, "mov %%rdi, %%rax", ALT_NOT_XEN) }; } static inline pmdval_t pmd_val(pmd_t pmd) { return PVOP_ALT_CALLEE1(pmdval_t, mmu.pmd_val, pmd.pmd, "mov %%rdi, %%rax", ALT_NOT_XEN); } static inline void set_pud(pud_t *pudp, pud_t pud) { PVOP_VCALL2(mmu.set_pud, pudp, native_pud_val(pud)); } static inline pud_t __pud(pudval_t val) { pudval_t ret; ret = PVOP_ALT_CALLEE1(pudval_t, mmu.make_pud, val, "mov %%rdi, %%rax", ALT_NOT_XEN); return (pud_t) { ret }; } static inline pudval_t pud_val(pud_t pud) { return PVOP_ALT_CALLEE1(pudval_t, mmu.pud_val, pud.pud, "mov %%rdi, %%rax", ALT_NOT_XEN); } static inline void pud_clear(pud_t *pudp) { set_pud(pudp, native_make_pud(0)); } static inline void set_p4d(p4d_t *p4dp, p4d_t p4d) { p4dval_t val = native_p4d_val(p4d); PVOP_VCALL2(mmu.set_p4d, p4dp, val); } #if CONFIG_PGTABLE_LEVELS >= 5 static inline p4d_t __p4d(p4dval_t val) { p4dval_t ret = PVOP_ALT_CALLEE1(p4dval_t, mmu.make_p4d, val, "mov %%rdi, %%rax", ALT_NOT_XEN); return (p4d_t) { ret }; } static inline p4dval_t p4d_val(p4d_t p4d) { return PVOP_ALT_CALLEE1(p4dval_t, mmu.p4d_val, p4d.p4d, "mov %%rdi, %%rax", ALT_NOT_XEN); } static inline void __set_pgd(pgd_t *pgdp, pgd_t pgd) { PVOP_VCALL2(mmu.set_pgd, pgdp, native_pgd_val(pgd)); } #define set_pgd(pgdp, pgdval) do { \ if (pgtable_l5_enabled()) \ __set_pgd(pgdp, pgdval); \ else \ set_p4d((p4d_t *)(pgdp), (p4d_t) { (pgdval).pgd }); \ } while (0) #define pgd_clear(pgdp) do { \ if (pgtable_l5_enabled()) \ set_pgd(pgdp, native_make_pgd(0)); \ } while (0) #endif /* CONFIG_PGTABLE_LEVELS == 5 */ static inline void p4d_clear(p4d_t *p4dp) { set_p4d(p4dp, native_make_p4d(0)); } static inline void set_pte_atomic(pte_t *ptep, pte_t pte) { set_pte(ptep, pte); } static inline void pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { set_pte(ptep, native_make_pte(0)); } static inline void pmd_clear(pmd_t *pmdp) { set_pmd(pmdp, native_make_pmd(0)); } #define __HAVE_ARCH_START_CONTEXT_SWITCH static inline void arch_start_context_switch(struct task_struct *prev) { PVOP_VCALL1(cpu.start_context_switch, prev); } static inline void arch_end_context_switch(struct task_struct *next) { PVOP_VCALL1(cpu.end_context_switch, next); } #define __HAVE_ARCH_ENTER_LAZY_MMU_MODE static inline void arch_enter_lazy_mmu_mode(void) { PVOP_VCALL0(mmu.lazy_mode.enter); } static inline void arch_leave_lazy_mmu_mode(void) { PVOP_VCALL0(mmu.lazy_mode.leave); } static inline void arch_flush_lazy_mmu_mode(void) { PVOP_VCALL0(mmu.lazy_mode.flush); } static inline void __set_fixmap(unsigned /* enum fixed_addresses */ idx, phys_addr_t phys, pgprot_t flags) { pv_ops.mmu.set_fixmap(idx, phys, flags); } #endif #if defined(CONFIG_SMP) && defined(CONFIG_PARAVIRT_SPINLOCKS) static __always_inline void pv_queued_spin_lock_slowpath(struct qspinlock *lock, u32 val) { PVOP_VCALL2(lock.queued_spin_lock_slowpath, lock, val); } static __always_inline void pv_queued_spin_unlock(struct qspinlock *lock) { PVOP_ALT_VCALLEE1(lock.queued_spin_unlock, lock, "movb $0, (%%" _ASM_ARG1 ");", ALT_NOT(X86_FEATURE_PVUNLOCK)); } static __always_inline void pv_wait(u8 *ptr, u8 val) { PVOP_VCALL2(lock.wait, ptr, val); } static __always_inline void pv_kick(int cpu) { PVOP_VCALL1(lock.kick, cpu); } static __always_inline bool pv_vcpu_is_preempted(long cpu) { return PVOP_ALT_CALLEE1(bool, lock.vcpu_is_preempted, cpu, "xor %%" _ASM_AX ", %%" _ASM_AX ";", ALT_NOT(X86_FEATURE_VCPUPREEMPT)); } void __raw_callee_save___native_queued_spin_unlock(struct qspinlock *lock); bool __raw_callee_save___native_vcpu_is_preempted(long cpu); #endif /* SMP && PARAVIRT_SPINLOCKS */ #ifdef CONFIG_X86_32 /* save and restore all caller-save registers, except return value */ #define PV_SAVE_ALL_CALLER_REGS "pushl %ecx;" #define PV_RESTORE_ALL_CALLER_REGS "popl %ecx;" #else /* save and restore all caller-save registers, except return value */ #define PV_SAVE_ALL_CALLER_REGS \ "push %rcx;" \ "push %rdx;" \ "push %rsi;" \ "push %rdi;" \ "push %r8;" \ "push %r9;" \ "push %r10;" \ "push %r11;" #define PV_RESTORE_ALL_CALLER_REGS \ "pop %r11;" \ "pop %r10;" \ "pop %r9;" \ "pop %r8;" \ "pop %rdi;" \ "pop %rsi;" \ "pop %rdx;" \ "pop %rcx;" #endif /* * Generate a thunk around a function which saves all caller-save * registers except for the return value. This allows C functions to * be called from assembler code where fewer than normal registers are * available. It may also help code generation around calls from C * code if the common case doesn't use many registers. * * When a callee is wrapped in a thunk, the caller can assume that all * arg regs and all scratch registers are preserved across the * call. The return value in rax/eax will not be saved, even for void * functions. */ #define PV_THUNK_NAME(func) "__raw_callee_save_" #func #define __PV_CALLEE_SAVE_REGS_THUNK(func, section) \ extern typeof(func) __raw_callee_save_##func; \ \ asm(".pushsection " section ", \"ax\";" \ ".globl " PV_THUNK_NAME(func) ";" \ ".type " PV_THUNK_NAME(func) ", @function;" \ ASM_FUNC_ALIGN \ PV_THUNK_NAME(func) ":" \ ASM_ENDBR \ FRAME_BEGIN \ PV_SAVE_ALL_CALLER_REGS \ "call " #func ";" \ PV_RESTORE_ALL_CALLER_REGS \ FRAME_END \ ASM_RET \ ".size " PV_THUNK_NAME(func) ", .-" PV_THUNK_NAME(func) ";" \ ".popsection") #define PV_CALLEE_SAVE_REGS_THUNK(func) \ __PV_CALLEE_SAVE_REGS_THUNK(func, ".text") /* Get a reference to a callee-save function */ #define PV_CALLEE_SAVE(func) \ ((struct paravirt_callee_save) { __raw_callee_save_##func }) /* Promise that "func" already uses the right calling convention */ #define __PV_IS_CALLEE_SAVE(func) \ ((struct paravirt_callee_save) { func }) #ifdef CONFIG_PARAVIRT_XXL static __always_inline unsigned long arch_local_save_flags(void) { return PVOP_ALT_CALLEE0(unsigned long, irq.save_fl, "pushf; pop %%rax;", ALT_NOT_XEN); } static __always_inline void arch_local_irq_disable(void) { PVOP_ALT_VCALLEE0(irq.irq_disable, "cli;", ALT_NOT_XEN); } static __always_inline void arch_local_irq_enable(void) { PVOP_ALT_VCALLEE0(irq.irq_enable, "sti;", ALT_NOT_XEN); } static __always_inline unsigned long arch_local_irq_save(void) { unsigned long f; f = arch_local_save_flags(); arch_local_irq_disable(); return f; } #endif /* Make sure as little as possible of this mess escapes. */ #undef PARAVIRT_CALL #undef __PVOP_CALL #undef __PVOP_VCALL #undef PVOP_VCALL0 #undef PVOP_CALL0 #undef PVOP_VCALL1 #undef PVOP_CALL1 #undef PVOP_VCALL2 #undef PVOP_CALL2 #undef PVOP_VCALL3 #undef PVOP_CALL3 #undef PVOP_VCALL4 #undef PVOP_CALL4 extern void default_banner(void); void native_pv_lock_init(void) __init; #else /* __ASSEMBLY__ */ #ifdef CONFIG_X86_64 #ifdef CONFIG_PARAVIRT_XXL #ifdef CONFIG_DEBUG_ENTRY #define PARA_INDIRECT(addr) *addr(%rip) .macro PARA_IRQ_save_fl ANNOTATE_RETPOLINE_SAFE; call PARA_INDIRECT(pv_ops+PV_IRQ_save_fl); .endm #define SAVE_FLAGS ALTERNATIVE_2 "PARA_IRQ_save_fl;", \ "ALT_CALL_INSTR;", ALT_CALL_ALWAYS, \ "pushf; pop %rax;", ALT_NOT_XEN #endif #endif /* CONFIG_PARAVIRT_XXL */ #endif /* CONFIG_X86_64 */ #endif /* __ASSEMBLY__ */ #else /* CONFIG_PARAVIRT */ # define default_banner x86_init_noop #ifndef __ASSEMBLY__ static inline void native_pv_lock_init(void) { } #endif #endif /* !CONFIG_PARAVIRT */ #ifndef __ASSEMBLY__ #ifndef CONFIG_PARAVIRT_XXL static inline void paravirt_enter_mmap(struct mm_struct *mm) { } #endif #ifndef CONFIG_PARAVIRT static inline void paravirt_arch_exit_mmap(struct mm_struct *mm) { } #endif #ifndef CONFIG_PARAVIRT_SPINLOCKS static inline void paravirt_set_cap(void) { } #endif #endif /* __ASSEMBLY__ */ #endif /* _ASM_X86_PARAVIRT_H */ |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef BTRFS_INODE_ITEM_H #define BTRFS_INODE_ITEM_H #include <linux/types.h> #include <linux/crc32c.h> struct btrfs_trans_handle; struct btrfs_root; struct btrfs_path; struct btrfs_key; struct btrfs_inode_extref; struct btrfs_inode; struct extent_buffer; struct fscrypt_str; /* * Return this if we need to call truncate_block for the last bit of the * truncate. */ #define BTRFS_NEED_TRUNCATE_BLOCK 1 struct btrfs_truncate_control { /* * IN: the inode we're operating on, this can be NULL if * ->clear_extent_range is false. */ struct btrfs_inode *inode; /* IN: the size we're truncating to. */ u64 new_size; /* OUT: the number of extents truncated. */ u64 extents_found; /* OUT: the last size we truncated this inode to. */ u64 last_size; /* OUT: the number of bytes to sub from this inode. */ u64 sub_bytes; /* IN: the ino we are truncating. */ u64 ino; /* * IN: minimum key type to remove. All key types with this type are * removed only if their offset >= new_size. */ u32 min_type; /* * IN: true if we don't want to do extent reference updates for any file * extents we drop. */ bool skip_ref_updates; /* * IN: true if we need to clear the file extent range for the inode as * we drop the file extent items. */ bool clear_extent_range; }; /* * btrfs_inode_item stores flags in a u64, btrfs_inode stores them in two * separate u32s. These two functions convert between the two representations. */ static inline u64 btrfs_inode_combine_flags(u32 flags, u32 ro_flags) { return (flags | ((u64)ro_flags << 32)); } static inline void btrfs_inode_split_flags(u64 inode_item_flags, u32 *flags, u32 *ro_flags) { *flags = (u32)inode_item_flags; *ro_flags = (u32)(inode_item_flags >> 32); } /* Figure the key offset of an extended inode ref. */ static inline u64 btrfs_extref_hash(u64 parent_objectid, const char *name, int len) { return (u64)crc32c(parent_objectid, name, len); } int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_truncate_control *control); int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, const struct fscrypt_str *name, u64 inode_objectid, u64 ref_objectid, u64 index); int btrfs_del_inode_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, const struct fscrypt_str *name, u64 inode_objectid, u64 ref_objectid, u64 *index); int btrfs_insert_empty_inode(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, u64 objectid); int btrfs_lookup_inode(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, struct btrfs_key *location, int mod); struct btrfs_inode_extref *btrfs_lookup_inode_extref( struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, const struct fscrypt_str *name, u64 inode_objectid, u64 ref_objectid, int ins_len, int cow); struct btrfs_inode_ref *btrfs_find_name_in_backref(struct extent_buffer *leaf, int slot, const struct fscrypt_str *name); struct btrfs_inode_extref *btrfs_find_name_in_ext_backref( struct extent_buffer *leaf, int slot, u64 ref_objectid, const struct fscrypt_str *name); #endif |
3 1 6 1 2 4 4 4 15 29 6 11 15 26 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 | // SPDX-License-Identifier: GPL-2.0 /* Copyright (c) 2020 Cloudflare Ltd https://cloudflare.com */ #include <linux/skmsg.h> #include <net/sock.h> #include <net/udp.h> #include <net/inet_common.h> #include "udp_impl.h" static struct proto *udpv6_prot_saved __read_mostly; static int sk_udp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags, int *addr_len) { #if IS_ENABLED(CONFIG_IPV6) if (sk->sk_family == AF_INET6) return udpv6_prot_saved->recvmsg(sk, msg, len, flags, addr_len); #endif return udp_prot.recvmsg(sk, msg, len, flags, addr_len); } static bool udp_sk_has_data(struct sock *sk) { return !skb_queue_empty(&udp_sk(sk)->reader_queue) || !skb_queue_empty(&sk->sk_receive_queue); } static bool psock_has_data(struct sk_psock *psock) { return !skb_queue_empty(&psock->ingress_skb) || !sk_psock_queue_empty(psock); } #define udp_msg_has_data(__sk, __psock) \ ({ udp_sk_has_data(__sk) || psock_has_data(__psock); }) static int udp_msg_wait_data(struct sock *sk, struct sk_psock *psock, long timeo) { DEFINE_WAIT_FUNC(wait, woken_wake_function); int ret = 0; if (sk->sk_shutdown & RCV_SHUTDOWN) return 1; if (!timeo) return ret; add_wait_queue(sk_sleep(sk), &wait); sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk); ret = udp_msg_has_data(sk, psock); if (!ret) { wait_woken(&wait, TASK_INTERRUPTIBLE, timeo); ret = udp_msg_has_data(sk, psock); } sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk); remove_wait_queue(sk_sleep(sk), &wait); return ret; } static int udp_bpf_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags, int *addr_len) { struct sk_psock *psock; int copied, ret; if (unlikely(flags & MSG_ERRQUEUE)) return inet_recv_error(sk, msg, len, addr_len); if (!len) return 0; psock = sk_psock_get(sk); if (unlikely(!psock)) return sk_udp_recvmsg(sk, msg, len, flags, addr_len); if (!psock_has_data(psock)) { ret = sk_udp_recvmsg(sk, msg, len, flags, addr_len); goto out; } msg_bytes_ready: copied = sk_msg_recvmsg(sk, psock, msg, len, flags); if (!copied) { long timeo; int data; timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT); data = udp_msg_wait_data(sk, psock, timeo); if (data) { if (psock_has_data(psock)) goto msg_bytes_ready; ret = sk_udp_recvmsg(sk, msg, len, flags, addr_len); goto out; } copied = -EAGAIN; } ret = copied; out: sk_psock_put(sk, psock); return ret; } enum { UDP_BPF_IPV4, UDP_BPF_IPV6, UDP_BPF_NUM_PROTS, }; static DEFINE_SPINLOCK(udpv6_prot_lock); static struct proto udp_bpf_prots[UDP_BPF_NUM_PROTS]; static void udp_bpf_rebuild_protos(struct proto *prot, const struct proto *base) { *prot = *base; prot->close = sock_map_close; prot->recvmsg = udp_bpf_recvmsg; prot->sock_is_readable = sk_msg_is_readable; } static void udp_bpf_check_v6_needs_rebuild(struct proto *ops) { if (unlikely(ops != smp_load_acquire(&udpv6_prot_saved))) { spin_lock_bh(&udpv6_prot_lock); if (likely(ops != udpv6_prot_saved)) { udp_bpf_rebuild_protos(&udp_bpf_prots[UDP_BPF_IPV6], ops); smp_store_release(&udpv6_prot_saved, ops); } spin_unlock_bh(&udpv6_prot_lock); } } static int __init udp_bpf_v4_build_proto(void) { udp_bpf_rebuild_protos(&udp_bpf_prots[UDP_BPF_IPV4], &udp_prot); return 0; } late_initcall(udp_bpf_v4_build_proto); int udp_bpf_update_proto(struct sock *sk, struct sk_psock *psock, bool restore) { int family = sk->sk_family == AF_INET ? UDP_BPF_IPV4 : UDP_BPF_IPV6; if (restore) { sk->sk_write_space = psock->saved_write_space; sock_replace_proto(sk, psock->sk_proto); return 0; } if (sk->sk_family == AF_INET6) udp_bpf_check_v6_needs_rebuild(psock->sk_proto); sock_replace_proto(sk, &udp_bpf_prots[family]); return 0; } EXPORT_SYMBOL_GPL(udp_bpf_update_proto); |
9648 2731 3476 7942 121 8193 363 730 977 16 3844 2 217 5916 46 160 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 4716 4717 4718 4719 4720 4721 4722 4723 4724 4725 4726 4727 4728 4729 4730 4731 4732 4733 4734 4735 4736 4737 4738 4739 4740 4741 4742 4743 4744 4745 4746 4747 4748 4749 4750 4751 4752 4753 4754 4755 4756 4757 4758 4759 4760 4761 4762 4763 4764 4765 4766 4767 4768 4769 4770 4771 4772 4773 4774 4775 4776 4777 4778 4779 4780 4781 4782 4783 4784 4785 4786 4787 4788 4789 4790 4791 4792 4793 4794 4795 4796 4797 4798 4799 4800 4801 4802 4803 4804 4805 4806 4807 4808 4809 4810 4811 4812 4813 4814 4815 4816 4817 4818 4819 4820 4821 4822 4823 4824 4825 4826 4827 4828 4829 4830 4831 4832 4833 4834 4835 4836 4837 4838 4839 4840 4841 4842 4843 4844 4845 4846 4847 4848 4849 4850 4851 4852 4853 4854 4855 4856 4857 4858 4859 4860 4861 4862 4863 4864 4865 4866 4867 4868 4869 4870 4871 4872 4873 4874 4875 4876 4877 4878 4879 4880 4881 4882 4883 4884 4885 4886 4887 4888 4889 4890 4891 4892 4893 4894 4895 4896 4897 4898 4899 4900 4901 4902 4903 4904 4905 4906 4907 4908 4909 4910 4911 4912 4913 4914 4915 4916 4917 4918 4919 4920 4921 4922 4923 4924 4925 4926 4927 4928 4929 4930 4931 4932 4933 4934 4935 4936 4937 4938 4939 4940 4941 4942 4943 4944 4945 4946 4947 4948 4949 4950 4951 4952 4953 4954 4955 4956 4957 4958 4959 4960 4961 4962 4963 4964 4965 4966 4967 4968 4969 4970 4971 4972 4973 4974 4975 4976 4977 4978 4979 4980 4981 4982 4983 4984 4985 4986 4987 4988 4989 4990 4991 4992 4993 4994 4995 4996 4997 4998 4999 5000 5001 5002 5003 5004 5005 5006 5007 5008 5009 5010 5011 | // SPDX-License-Identifier: GPL-2.0 // Generated by scripts/atomic/gen-atomic-instrumented.sh // DO NOT MODIFY THIS FILE DIRECTLY /* * This file provoides atomic operations with explicit instrumentation (e.g. * KASAN, KCSAN), which should be used unless it is necessary to avoid * instrumentation. Where it is necessary to aovid instrumenation, the * raw_atomic*() operations should be used. */ #ifndef _LINUX_ATOMIC_INSTRUMENTED_H #define _LINUX_ATOMIC_INSTRUMENTED_H #include <linux/build_bug.h> #include <linux/compiler.h> #include <linux/instrumented.h> /** * atomic_read() - atomic load with relaxed ordering * @v: pointer to atomic_t * * Atomically loads the value of @v with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic_read() there. * * Return: The value loaded from @v. */ static __always_inline int atomic_read(const atomic_t *v) { instrument_atomic_read(v, sizeof(*v)); return raw_atomic_read(v); } /** * atomic_read_acquire() - atomic load with acquire ordering * @v: pointer to atomic_t * * Atomically loads the value of @v with acquire ordering. * * Unsafe to use in noinstr code; use raw_atomic_read_acquire() there. * * Return: The value loaded from @v. */ static __always_inline int atomic_read_acquire(const atomic_t *v) { instrument_atomic_read(v, sizeof(*v)); return raw_atomic_read_acquire(v); } /** * atomic_set() - atomic set with relaxed ordering * @v: pointer to atomic_t * @i: int value to assign * * Atomically sets @v to @i with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic_set() there. * * Return: Nothing. */ static __always_inline void atomic_set(atomic_t *v, int i) { instrument_atomic_write(v, sizeof(*v)); raw_atomic_set(v, i); } /** * atomic_set_release() - atomic set with release ordering * @v: pointer to atomic_t * @i: int value to assign * * Atomically sets @v to @i with release ordering. * * Unsafe to use in noinstr code; use raw_atomic_set_release() there. * * Return: Nothing. */ static __always_inline void atomic_set_release(atomic_t *v, int i) { kcsan_release(); instrument_atomic_write(v, sizeof(*v)); raw_atomic_set_release(v, i); } /** * atomic_add() - atomic add with relaxed ordering * @i: int value to add * @v: pointer to atomic_t * * Atomically updates @v to (@v + @i) with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic_add() there. * * Return: Nothing. */ static __always_inline void atomic_add(int i, atomic_t *v) { instrument_atomic_read_write(v, sizeof(*v)); raw_atomic_add(i, v); } /** * atomic_add_return() - atomic add with full ordering * @i: int value to add * @v: pointer to atomic_t * * Atomically updates @v to (@v + @i) with full ordering. * * Unsafe to use in noinstr code; use raw_atomic_add_return() there. * * Return: The updated value of @v. */ static __always_inline int atomic_add_return(int i, atomic_t *v) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_add_return(i, v); } /** * atomic_add_return_acquire() - atomic add with acquire ordering * @i: int value to add * @v: pointer to atomic_t * * Atomically updates @v to (@v + @i) with acquire ordering. * * Unsafe to use in noinstr code; use raw_atomic_add_return_acquire() there. * * Return: The updated value of @v. */ static __always_inline int atomic_add_return_acquire(int i, atomic_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_add_return_acquire(i, v); } /** * atomic_add_return_release() - atomic add with release ordering * @i: int value to add * @v: pointer to atomic_t * * Atomically updates @v to (@v + @i) with release ordering. * * Unsafe to use in noinstr code; use raw_atomic_add_return_release() there. * * Return: The updated value of @v. */ static __always_inline int atomic_add_return_release(int i, atomic_t *v) { kcsan_release(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_add_return_release(i, v); } /** * atomic_add_return_relaxed() - atomic add with relaxed ordering * @i: int value to add * @v: pointer to atomic_t * * Atomically updates @v to (@v + @i) with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic_add_return_relaxed() there. * * Return: The updated value of @v. */ static __always_inline int atomic_add_return_relaxed(int i, atomic_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_add_return_relaxed(i, v); } /** * atomic_fetch_add() - atomic add with full ordering * @i: int value to add * @v: pointer to atomic_t * * Atomically updates @v to (@v + @i) with full ordering. * * Unsafe to use in noinstr code; use raw_atomic_fetch_add() there. * * Return: The original value of @v. */ static __always_inline int atomic_fetch_add(int i, atomic_t *v) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_fetch_add(i, v); } /** * atomic_fetch_add_acquire() - atomic add with acquire ordering * @i: int value to add * @v: pointer to atomic_t * * Atomically updates @v to (@v + @i) with acquire ordering. * * Unsafe to use in noinstr code; use raw_atomic_fetch_add_acquire() there. * * Return: The original value of @v. */ static __always_inline int atomic_fetch_add_acquire(int i, atomic_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_fetch_add_acquire(i, v); } /** * atomic_fetch_add_release() - atomic add with release ordering * @i: int value to add * @v: pointer to atomic_t * * Atomically updates @v to (@v + @i) with release ordering. * * Unsafe to use in noinstr code; use raw_atomic_fetch_add_release() there. * * Return: The original value of @v. */ static __always_inline int atomic_fetch_add_release(int i, atomic_t *v) { kcsan_release(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_fetch_add_release(i, v); } /** * atomic_fetch_add_relaxed() - atomic add with relaxed ordering * @i: int value to add * @v: pointer to atomic_t * * Atomically updates @v to (@v + @i) with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic_fetch_add_relaxed() there. * * Return: The original value of @v. */ static __always_inline int atomic_fetch_add_relaxed(int i, atomic_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_fetch_add_relaxed(i, v); } /** * atomic_sub() - atomic subtract with relaxed ordering * @i: int value to subtract * @v: pointer to atomic_t * * Atomically updates @v to (@v - @i) with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic_sub() there. * * Return: Nothing. */ static __always_inline void atomic_sub(int i, atomic_t *v) { instrument_atomic_read_write(v, sizeof(*v)); raw_atomic_sub(i, v); } /** * atomic_sub_return() - atomic subtract with full ordering * @i: int value to subtract * @v: pointer to atomic_t * * Atomically updates @v to (@v - @i) with full ordering. * * Unsafe to use in noinstr code; use raw_atomic_sub_return() there. * * Return: The updated value of @v. */ static __always_inline int atomic_sub_return(int i, atomic_t *v) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_sub_return(i, v); } /** * atomic_sub_return_acquire() - atomic subtract with acquire ordering * @i: int value to subtract * @v: pointer to atomic_t * * Atomically updates @v to (@v - @i) with acquire ordering. * * Unsafe to use in noinstr code; use raw_atomic_sub_return_acquire() there. * * Return: The updated value of @v. */ static __always_inline int atomic_sub_return_acquire(int i, atomic_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_sub_return_acquire(i, v); } /** * atomic_sub_return_release() - atomic subtract with release ordering * @i: int value to subtract * @v: pointer to atomic_t * * Atomically updates @v to (@v - @i) with release ordering. * * Unsafe to use in noinstr code; use raw_atomic_sub_return_release() there. * * Return: The updated value of @v. */ static __always_inline int atomic_sub_return_release(int i, atomic_t *v) { kcsan_release(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_sub_return_release(i, v); } /** * atomic_sub_return_relaxed() - atomic subtract with relaxed ordering * @i: int value to subtract * @v: pointer to atomic_t * * Atomically updates @v to (@v - @i) with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic_sub_return_relaxed() there. * * Return: The updated value of @v. */ static __always_inline int atomic_sub_return_relaxed(int i, atomic_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_sub_return_relaxed(i, v); } /** * atomic_fetch_sub() - atomic subtract with full ordering * @i: int value to subtract * @v: pointer to atomic_t * * Atomically updates @v to (@v - @i) with full ordering. * * Unsafe to use in noinstr code; use raw_atomic_fetch_sub() there. * * Return: The original value of @v. */ static __always_inline int atomic_fetch_sub(int i, atomic_t *v) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_fetch_sub(i, v); } /** * atomic_fetch_sub_acquire() - atomic subtract with acquire ordering * @i: int value to subtract * @v: pointer to atomic_t * * Atomically updates @v to (@v - @i) with acquire ordering. * * Unsafe to use in noinstr code; use raw_atomic_fetch_sub_acquire() there. * * Return: The original value of @v. */ static __always_inline int atomic_fetch_sub_acquire(int i, atomic_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_fetch_sub_acquire(i, v); } /** * atomic_fetch_sub_release() - atomic subtract with release ordering * @i: int value to subtract * @v: pointer to atomic_t * * Atomically updates @v to (@v - @i) with release ordering. * * Unsafe to use in noinstr code; use raw_atomic_fetch_sub_release() there. * * Return: The original value of @v. */ static __always_inline int atomic_fetch_sub_release(int i, atomic_t *v) { kcsan_release(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_fetch_sub_release(i, v); } /** * atomic_fetch_sub_relaxed() - atomic subtract with relaxed ordering * @i: int value to subtract * @v: pointer to atomic_t * * Atomically updates @v to (@v - @i) with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic_fetch_sub_relaxed() there. * * Return: The original value of @v. */ static __always_inline int atomic_fetch_sub_relaxed(int i, atomic_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_fetch_sub_relaxed(i, v); } /** * atomic_inc() - atomic increment with relaxed ordering * @v: pointer to atomic_t * * Atomically updates @v to (@v + 1) with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic_inc() there. * * Return: Nothing. */ static __always_inline void atomic_inc(atomic_t *v) { instrument_atomic_read_write(v, sizeof(*v)); raw_atomic_inc(v); } /** * atomic_inc_return() - atomic increment with full ordering * @v: pointer to atomic_t * * Atomically updates @v to (@v + 1) with full ordering. * * Unsafe to use in noinstr code; use raw_atomic_inc_return() there. * * Return: The updated value of @v. */ static __always_inline int atomic_inc_return(atomic_t *v) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_inc_return(v); } /** * atomic_inc_return_acquire() - atomic increment with acquire ordering * @v: pointer to atomic_t * * Atomically updates @v to (@v + 1) with acquire ordering. * * Unsafe to use in noinstr code; use raw_atomic_inc_return_acquire() there. * * Return: The updated value of @v. */ static __always_inline int atomic_inc_return_acquire(atomic_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_inc_return_acquire(v); } /** * atomic_inc_return_release() - atomic increment with release ordering * @v: pointer to atomic_t * * Atomically updates @v to (@v + 1) with release ordering. * * Unsafe to use in noinstr code; use raw_atomic_inc_return_release() there. * * Return: The updated value of @v. */ static __always_inline int atomic_inc_return_release(atomic_t *v) { kcsan_release(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_inc_return_release(v); } /** * atomic_inc_return_relaxed() - atomic increment with relaxed ordering * @v: pointer to atomic_t * * Atomically updates @v to (@v + 1) with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic_inc_return_relaxed() there. * * Return: The updated value of @v. */ static __always_inline int atomic_inc_return_relaxed(atomic_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_inc_return_relaxed(v); } /** * atomic_fetch_inc() - atomic increment with full ordering * @v: pointer to atomic_t * * Atomically updates @v to (@v + 1) with full ordering. * * Unsafe to use in noinstr code; use raw_atomic_fetch_inc() there. * * Return: The original value of @v. */ static __always_inline int atomic_fetch_inc(atomic_t *v) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_fetch_inc(v); } /** * atomic_fetch_inc_acquire() - atomic increment with acquire ordering * @v: pointer to atomic_t * * Atomically updates @v to (@v + 1) with acquire ordering. * * Unsafe to use in noinstr code; use raw_atomic_fetch_inc_acquire() there. * * Return: The original value of @v. */ static __always_inline int atomic_fetch_inc_acquire(atomic_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_fetch_inc_acquire(v); } /** * atomic_fetch_inc_release() - atomic increment with release ordering * @v: pointer to atomic_t * * Atomically updates @v to (@v + 1) with release ordering. * * Unsafe to use in noinstr code; use raw_atomic_fetch_inc_release() there. * * Return: The original value of @v. */ static __always_inline int atomic_fetch_inc_release(atomic_t *v) { kcsan_release(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_fetch_inc_release(v); } /** * atomic_fetch_inc_relaxed() - atomic increment with relaxed ordering * @v: pointer to atomic_t * * Atomically updates @v to (@v + 1) with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic_fetch_inc_relaxed() there. * * Return: The original value of @v. */ static __always_inline int atomic_fetch_inc_relaxed(atomic_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_fetch_inc_relaxed(v); } /** * atomic_dec() - atomic decrement with relaxed ordering * @v: pointer to atomic_t * * Atomically updates @v to (@v - 1) with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic_dec() there. * * Return: Nothing. */ static __always_inline void atomic_dec(atomic_t *v) { instrument_atomic_read_write(v, sizeof(*v)); raw_atomic_dec(v); } /** * atomic_dec_return() - atomic decrement with full ordering * @v: pointer to atomic_t * * Atomically updates @v to (@v - 1) with full ordering. * * Unsafe to use in noinstr code; use raw_atomic_dec_return() there. * * Return: The updated value of @v. */ static __always_inline int atomic_dec_return(atomic_t *v) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_dec_return(v); } /** * atomic_dec_return_acquire() - atomic decrement with acquire ordering * @v: pointer to atomic_t * * Atomically updates @v to (@v - 1) with acquire ordering. * * Unsafe to use in noinstr code; use raw_atomic_dec_return_acquire() there. * * Return: The updated value of @v. */ static __always_inline int atomic_dec_return_acquire(atomic_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_dec_return_acquire(v); } /** * atomic_dec_return_release() - atomic decrement with release ordering * @v: pointer to atomic_t * * Atomically updates @v to (@v - 1) with release ordering. * * Unsafe to use in noinstr code; use raw_atomic_dec_return_release() there. * * Return: The updated value of @v. */ static __always_inline int atomic_dec_return_release(atomic_t *v) { kcsan_release(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_dec_return_release(v); } /** * atomic_dec_return_relaxed() - atomic decrement with relaxed ordering * @v: pointer to atomic_t * * Atomically updates @v to (@v - 1) with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic_dec_return_relaxed() there. * * Return: The updated value of @v. */ static __always_inline int atomic_dec_return_relaxed(atomic_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_dec_return_relaxed(v); } /** * atomic_fetch_dec() - atomic decrement with full ordering * @v: pointer to atomic_t * * Atomically updates @v to (@v - 1) with full ordering. * * Unsafe to use in noinstr code; use raw_atomic_fetch_dec() there. * * Return: The original value of @v. */ static __always_inline int atomic_fetch_dec(atomic_t *v) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_fetch_dec(v); } /** * atomic_fetch_dec_acquire() - atomic decrement with acquire ordering * @v: pointer to atomic_t * * Atomically updates @v to (@v - 1) with acquire ordering. * * Unsafe to use in noinstr code; use raw_atomic_fetch_dec_acquire() there. * * Return: The original value of @v. */ static __always_inline int atomic_fetch_dec_acquire(atomic_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_fetch_dec_acquire(v); } /** * atomic_fetch_dec_release() - atomic decrement with release ordering * @v: pointer to atomic_t * * Atomically updates @v to (@v - 1) with release ordering. * * Unsafe to use in noinstr code; use raw_atomic_fetch_dec_release() there. * * Return: The original value of @v. */ static __always_inline int atomic_fetch_dec_release(atomic_t *v) { kcsan_release(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_fetch_dec_release(v); } /** * atomic_fetch_dec_relaxed() - atomic decrement with relaxed ordering * @v: pointer to atomic_t * * Atomically updates @v to (@v - 1) with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic_fetch_dec_relaxed() there. * * Return: The original value of @v. */ static __always_inline int atomic_fetch_dec_relaxed(atomic_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_fetch_dec_relaxed(v); } /** * atomic_and() - atomic bitwise AND with relaxed ordering * @i: int value * @v: pointer to atomic_t * * Atomically updates @v to (@v & @i) with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic_and() there. * * Return: Nothing. */ static __always_inline void atomic_and(int i, atomic_t *v) { instrument_atomic_read_write(v, sizeof(*v)); raw_atomic_and(i, v); } /** * atomic_fetch_and() - atomic bitwise AND with full ordering * @i: int value * @v: pointer to atomic_t * * Atomically updates @v to (@v & @i) with full ordering. * * Unsafe to use in noinstr code; use raw_atomic_fetch_and() there. * * Return: The original value of @v. */ static __always_inline int atomic_fetch_and(int i, atomic_t *v) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_fetch_and(i, v); } /** * atomic_fetch_and_acquire() - atomic bitwise AND with acquire ordering * @i: int value * @v: pointer to atomic_t * * Atomically updates @v to (@v & @i) with acquire ordering. * * Unsafe to use in noinstr code; use raw_atomic_fetch_and_acquire() there. * * Return: The original value of @v. */ static __always_inline int atomic_fetch_and_acquire(int i, atomic_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_fetch_and_acquire(i, v); } /** * atomic_fetch_and_release() - atomic bitwise AND with release ordering * @i: int value * @v: pointer to atomic_t * * Atomically updates @v to (@v & @i) with release ordering. * * Unsafe to use in noinstr code; use raw_atomic_fetch_and_release() there. * * Return: The original value of @v. */ static __always_inline int atomic_fetch_and_release(int i, atomic_t *v) { kcsan_release(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_fetch_and_release(i, v); } /** * atomic_fetch_and_relaxed() - atomic bitwise AND with relaxed ordering * @i: int value * @v: pointer to atomic_t * * Atomically updates @v to (@v & @i) with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic_fetch_and_relaxed() there. * * Return: The original value of @v. */ static __always_inline int atomic_fetch_and_relaxed(int i, atomic_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_fetch_and_relaxed(i, v); } /** * atomic_andnot() - atomic bitwise AND NOT with relaxed ordering * @i: int value * @v: pointer to atomic_t * * Atomically updates @v to (@v & ~@i) with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic_andnot() there. * * Return: Nothing. */ static __always_inline void atomic_andnot(int i, atomic_t *v) { instrument_atomic_read_write(v, sizeof(*v)); raw_atomic_andnot(i, v); } /** * atomic_fetch_andnot() - atomic bitwise AND NOT with full ordering * @i: int value * @v: pointer to atomic_t * * Atomically updates @v to (@v & ~@i) with full ordering. * * Unsafe to use in noinstr code; use raw_atomic_fetch_andnot() there. * * Return: The original value of @v. */ static __always_inline int atomic_fetch_andnot(int i, atomic_t *v) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_fetch_andnot(i, v); } /** * atomic_fetch_andnot_acquire() - atomic bitwise AND NOT with acquire ordering * @i: int value * @v: pointer to atomic_t * * Atomically updates @v to (@v & ~@i) with acquire ordering. * * Unsafe to use in noinstr code; use raw_atomic_fetch_andnot_acquire() there. * * Return: The original value of @v. */ static __always_inline int atomic_fetch_andnot_acquire(int i, atomic_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_fetch_andnot_acquire(i, v); } /** * atomic_fetch_andnot_release() - atomic bitwise AND NOT with release ordering * @i: int value * @v: pointer to atomic_t * * Atomically updates @v to (@v & ~@i) with release ordering. * * Unsafe to use in noinstr code; use raw_atomic_fetch_andnot_release() there. * * Return: The original value of @v. */ static __always_inline int atomic_fetch_andnot_release(int i, atomic_t *v) { kcsan_release(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_fetch_andnot_release(i, v); } /** * atomic_fetch_andnot_relaxed() - atomic bitwise AND NOT with relaxed ordering * @i: int value * @v: pointer to atomic_t * * Atomically updates @v to (@v & ~@i) with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic_fetch_andnot_relaxed() there. * * Return: The original value of @v. */ static __always_inline int atomic_fetch_andnot_relaxed(int i, atomic_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_fetch_andnot_relaxed(i, v); } /** * atomic_or() - atomic bitwise OR with relaxed ordering * @i: int value * @v: pointer to atomic_t * * Atomically updates @v to (@v | @i) with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic_or() there. * * Return: Nothing. */ static __always_inline void atomic_or(int i, atomic_t *v) { instrument_atomic_read_write(v, sizeof(*v)); raw_atomic_or(i, v); } /** * atomic_fetch_or() - atomic bitwise OR with full ordering * @i: int value * @v: pointer to atomic_t * * Atomically updates @v to (@v | @i) with full ordering. * * Unsafe to use in noinstr code; use raw_atomic_fetch_or() there. * * Return: The original value of @v. */ static __always_inline int atomic_fetch_or(int i, atomic_t *v) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_fetch_or(i, v); } /** * atomic_fetch_or_acquire() - atomic bitwise OR with acquire ordering * @i: int value * @v: pointer to atomic_t * * Atomically updates @v to (@v | @i) with acquire ordering. * * Unsafe to use in noinstr code; use raw_atomic_fetch_or_acquire() there. * * Return: The original value of @v. */ static __always_inline int atomic_fetch_or_acquire(int i, atomic_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_fetch_or_acquire(i, v); } /** * atomic_fetch_or_release() - atomic bitwise OR with release ordering * @i: int value * @v: pointer to atomic_t * * Atomically updates @v to (@v | @i) with release ordering. * * Unsafe to use in noinstr code; use raw_atomic_fetch_or_release() there. * * Return: The original value of @v. */ static __always_inline int atomic_fetch_or_release(int i, atomic_t *v) { kcsan_release(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_fetch_or_release(i, v); } /** * atomic_fetch_or_relaxed() - atomic bitwise OR with relaxed ordering * @i: int value * @v: pointer to atomic_t * * Atomically updates @v to (@v | @i) with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic_fetch_or_relaxed() there. * * Return: The original value of @v. */ static __always_inline int atomic_fetch_or_relaxed(int i, atomic_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_fetch_or_relaxed(i, v); } /** * atomic_xor() - atomic bitwise XOR with relaxed ordering * @i: int value * @v: pointer to atomic_t * * Atomically updates @v to (@v ^ @i) with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic_xor() there. * * Return: Nothing. */ static __always_inline void atomic_xor(int i, atomic_t *v) { instrument_atomic_read_write(v, sizeof(*v)); raw_atomic_xor(i, v); } /** * atomic_fetch_xor() - atomic bitwise XOR with full ordering * @i: int value * @v: pointer to atomic_t * * Atomically updates @v to (@v ^ @i) with full ordering. * * Unsafe to use in noinstr code; use raw_atomic_fetch_xor() there. * * Return: The original value of @v. */ static __always_inline int atomic_fetch_xor(int i, atomic_t *v) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_fetch_xor(i, v); } /** * atomic_fetch_xor_acquire() - atomic bitwise XOR with acquire ordering * @i: int value * @v: pointer to atomic_t * * Atomically updates @v to (@v ^ @i) with acquire ordering. * * Unsafe to use in noinstr code; use raw_atomic_fetch_xor_acquire() there. * * Return: The original value of @v. */ static __always_inline int atomic_fetch_xor_acquire(int i, atomic_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_fetch_xor_acquire(i, v); } /** * atomic_fetch_xor_release() - atomic bitwise XOR with release ordering * @i: int value * @v: pointer to atomic_t * * Atomically updates @v to (@v ^ @i) with release ordering. * * Unsafe to use in noinstr code; use raw_atomic_fetch_xor_release() there. * * Return: The original value of @v. */ static __always_inline int atomic_fetch_xor_release(int i, atomic_t *v) { kcsan_release(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_fetch_xor_release(i, v); } /** * atomic_fetch_xor_relaxed() - atomic bitwise XOR with relaxed ordering * @i: int value * @v: pointer to atomic_t * * Atomically updates @v to (@v ^ @i) with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic_fetch_xor_relaxed() there. * * Return: The original value of @v. */ static __always_inline int atomic_fetch_xor_relaxed(int i, atomic_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_fetch_xor_relaxed(i, v); } /** * atomic_xchg() - atomic exchange with full ordering * @v: pointer to atomic_t * @new: int value to assign * * Atomically updates @v to @new with full ordering. * * Unsafe to use in noinstr code; use raw_atomic_xchg() there. * * Return: The original value of @v. */ static __always_inline int atomic_xchg(atomic_t *v, int new) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_xchg(v, new); } /** * atomic_xchg_acquire() - atomic exchange with acquire ordering * @v: pointer to atomic_t * @new: int value to assign * * Atomically updates @v to @new with acquire ordering. * * Unsafe to use in noinstr code; use raw_atomic_xchg_acquire() there. * * Return: The original value of @v. */ static __always_inline int atomic_xchg_acquire(atomic_t *v, int new) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_xchg_acquire(v, new); } /** * atomic_xchg_release() - atomic exchange with release ordering * @v: pointer to atomic_t * @new: int value to assign * * Atomically updates @v to @new with release ordering. * * Unsafe to use in noinstr code; use raw_atomic_xchg_release() there. * * Return: The original value of @v. */ static __always_inline int atomic_xchg_release(atomic_t *v, int new) { kcsan_release(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_xchg_release(v, new); } /** * atomic_xchg_relaxed() - atomic exchange with relaxed ordering * @v: pointer to atomic_t * @new: int value to assign * * Atomically updates @v to @new with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic_xchg_relaxed() there. * * Return: The original value of @v. */ static __always_inline int atomic_xchg_relaxed(atomic_t *v, int new) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_xchg_relaxed(v, new); } /** * atomic_cmpxchg() - atomic compare and exchange with full ordering * @v: pointer to atomic_t * @old: int value to compare with * @new: int value to assign * * If (@v == @old), atomically updates @v to @new with full ordering. * * Unsafe to use in noinstr code; use raw_atomic_cmpxchg() there. * * Return: The original value of @v. */ static __always_inline int atomic_cmpxchg(atomic_t *v, int old, int new) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_cmpxchg(v, old, new); } /** * atomic_cmpxchg_acquire() - atomic compare and exchange with acquire ordering * @v: pointer to atomic_t * @old: int value to compare with * @new: int value to assign * * If (@v == @old), atomically updates @v to @new with acquire ordering. * * Unsafe to use in noinstr code; use raw_atomic_cmpxchg_acquire() there. * * Return: The original value of @v. */ static __always_inline int atomic_cmpxchg_acquire(atomic_t *v, int old, int new) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_cmpxchg_acquire(v, old, new); } /** * atomic_cmpxchg_release() - atomic compare and exchange with release ordering * @v: pointer to atomic_t * @old: int value to compare with * @new: int value to assign * * If (@v == @old), atomically updates @v to @new with release ordering. * * Unsafe to use in noinstr code; use raw_atomic_cmpxchg_release() there. * * Return: The original value of @v. */ static __always_inline int atomic_cmpxchg_release(atomic_t *v, int old, int new) { kcsan_release(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_cmpxchg_release(v, old, new); } /** * atomic_cmpxchg_relaxed() - atomic compare and exchange with relaxed ordering * @v: pointer to atomic_t * @old: int value to compare with * @new: int value to assign * * If (@v == @old), atomically updates @v to @new with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic_cmpxchg_relaxed() there. * * Return: The original value of @v. */ static __always_inline int atomic_cmpxchg_relaxed(atomic_t *v, int old, int new) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_cmpxchg_relaxed(v, old, new); } /** * atomic_try_cmpxchg() - atomic compare and exchange with full ordering * @v: pointer to atomic_t * @old: pointer to int value to compare with * @new: int value to assign * * If (@v == @old), atomically updates @v to @new with full ordering. * Otherwise, updates @old to the current value of @v. * * Unsafe to use in noinstr code; use raw_atomic_try_cmpxchg() there. * * Return: @true if the exchange occured, @false otherwise. */ static __always_inline bool atomic_try_cmpxchg(atomic_t *v, int *old, int new) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); instrument_atomic_read_write(old, sizeof(*old)); return raw_atomic_try_cmpxchg(v, old, new); } /** * atomic_try_cmpxchg_acquire() - atomic compare and exchange with acquire ordering * @v: pointer to atomic_t * @old: pointer to int value to compare with * @new: int value to assign * * If (@v == @old), atomically updates @v to @new with acquire ordering. * Otherwise, updates @old to the current value of @v. * * Unsafe to use in noinstr code; use raw_atomic_try_cmpxchg_acquire() there. * * Return: @true if the exchange occured, @false otherwise. */ static __always_inline bool atomic_try_cmpxchg_acquire(atomic_t *v, int *old, int new) { instrument_atomic_read_write(v, sizeof(*v)); instrument_atomic_read_write(old, sizeof(*old)); return raw_atomic_try_cmpxchg_acquire(v, old, new); } /** * atomic_try_cmpxchg_release() - atomic compare and exchange with release ordering * @v: pointer to atomic_t * @old: pointer to int value to compare with * @new: int value to assign * * If (@v == @old), atomically updates @v to @new with release ordering. * Otherwise, updates @old to the current value of @v. * * Unsafe to use in noinstr code; use raw_atomic_try_cmpxchg_release() there. * * Return: @true if the exchange occured, @false otherwise. */ static __always_inline bool atomic_try_cmpxchg_release(atomic_t *v, int *old, int new) { kcsan_release(); instrument_atomic_read_write(v, sizeof(*v)); instrument_atomic_read_write(old, sizeof(*old)); return raw_atomic_try_cmpxchg_release(v, old, new); } /** * atomic_try_cmpxchg_relaxed() - atomic compare and exchange with relaxed ordering * @v: pointer to atomic_t * @old: pointer to int value to compare with * @new: int value to assign * * If (@v == @old), atomically updates @v to @new with relaxed ordering. * Otherwise, updates @old to the current value of @v. * * Unsafe to use in noinstr code; use raw_atomic_try_cmpxchg_relaxed() there. * * Return: @true if the exchange occured, @false otherwise. */ static __always_inline bool atomic_try_cmpxchg_relaxed(atomic_t *v, int *old, int new) { instrument_atomic_read_write(v, sizeof(*v)); instrument_atomic_read_write(old, sizeof(*old)); return raw_atomic_try_cmpxchg_relaxed(v, old, new); } /** * atomic_sub_and_test() - atomic subtract and test if zero with full ordering * @i: int value to add * @v: pointer to atomic_t * * Atomically updates @v to (@v - @i) with full ordering. * * Unsafe to use in noinstr code; use raw_atomic_sub_and_test() there. * * Return: @true if the resulting value of @v is zero, @false otherwise. */ static __always_inline bool atomic_sub_and_test(int i, atomic_t *v) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_sub_and_test(i, v); } /** * atomic_dec_and_test() - atomic decrement and test if zero with full ordering * @v: pointer to atomic_t * * Atomically updates @v to (@v - 1) with full ordering. * * Unsafe to use in noinstr code; use raw_atomic_dec_and_test() there. * * Return: @true if the resulting value of @v is zero, @false otherwise. */ static __always_inline bool atomic_dec_and_test(atomic_t *v) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_dec_and_test(v); } /** * atomic_inc_and_test() - atomic increment and test if zero with full ordering * @v: pointer to atomic_t * * Atomically updates @v to (@v + 1) with full ordering. * * Unsafe to use in noinstr code; use raw_atomic_inc_and_test() there. * * Return: @true if the resulting value of @v is zero, @false otherwise. */ static __always_inline bool atomic_inc_and_test(atomic_t *v) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_inc_and_test(v); } /** * atomic_add_negative() - atomic add and test if negative with full ordering * @i: int value to add * @v: pointer to atomic_t * * Atomically updates @v to (@v + @i) with full ordering. * * Unsafe to use in noinstr code; use raw_atomic_add_negative() there. * * Return: @true if the resulting value of @v is negative, @false otherwise. */ static __always_inline bool atomic_add_negative(int i, atomic_t *v) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_add_negative(i, v); } /** * atomic_add_negative_acquire() - atomic add and test if negative with acquire ordering * @i: int value to add * @v: pointer to atomic_t * * Atomically updates @v to (@v + @i) with acquire ordering. * * Unsafe to use in noinstr code; use raw_atomic_add_negative_acquire() there. * * Return: @true if the resulting value of @v is negative, @false otherwise. */ static __always_inline bool atomic_add_negative_acquire(int i, atomic_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_add_negative_acquire(i, v); } /** * atomic_add_negative_release() - atomic add and test if negative with release ordering * @i: int value to add * @v: pointer to atomic_t * * Atomically updates @v to (@v + @i) with release ordering. * * Unsafe to use in noinstr code; use raw_atomic_add_negative_release() there. * * Return: @true if the resulting value of @v is negative, @false otherwise. */ static __always_inline bool atomic_add_negative_release(int i, atomic_t *v) { kcsan_release(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_add_negative_release(i, v); } /** * atomic_add_negative_relaxed() - atomic add and test if negative with relaxed ordering * @i: int value to add * @v: pointer to atomic_t * * Atomically updates @v to (@v + @i) with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic_add_negative_relaxed() there. * * Return: @true if the resulting value of @v is negative, @false otherwise. */ static __always_inline bool atomic_add_negative_relaxed(int i, atomic_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_add_negative_relaxed(i, v); } /** * atomic_fetch_add_unless() - atomic add unless value with full ordering * @v: pointer to atomic_t * @a: int value to add * @u: int value to compare with * * If (@v != @u), atomically updates @v to (@v + @a) with full ordering. * * Unsafe to use in noinstr code; use raw_atomic_fetch_add_unless() there. * * Return: The original value of @v. */ static __always_inline int atomic_fetch_add_unless(atomic_t *v, int a, int u) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_fetch_add_unless(v, a, u); } /** * atomic_add_unless() - atomic add unless value with full ordering * @v: pointer to atomic_t * @a: int value to add * @u: int value to compare with * * If (@v != @u), atomically updates @v to (@v + @a) with full ordering. * * Unsafe to use in noinstr code; use raw_atomic_add_unless() there. * * Return: @true if @v was updated, @false otherwise. */ static __always_inline bool atomic_add_unless(atomic_t *v, int a, int u) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_add_unless(v, a, u); } /** * atomic_inc_not_zero() - atomic increment unless zero with full ordering * @v: pointer to atomic_t * * If (@v != 0), atomically updates @v to (@v + 1) with full ordering. * * Unsafe to use in noinstr code; use raw_atomic_inc_not_zero() there. * * Return: @true if @v was updated, @false otherwise. */ static __always_inline bool atomic_inc_not_zero(atomic_t *v) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_inc_not_zero(v); } /** * atomic_inc_unless_negative() - atomic increment unless negative with full ordering * @v: pointer to atomic_t * * If (@v >= 0), atomically updates @v to (@v + 1) with full ordering. * * Unsafe to use in noinstr code; use raw_atomic_inc_unless_negative() there. * * Return: @true if @v was updated, @false otherwise. */ static __always_inline bool atomic_inc_unless_negative(atomic_t *v) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_inc_unless_negative(v); } /** * atomic_dec_unless_positive() - atomic decrement unless positive with full ordering * @v: pointer to atomic_t * * If (@v <= 0), atomically updates @v to (@v - 1) with full ordering. * * Unsafe to use in noinstr code; use raw_atomic_dec_unless_positive() there. * * Return: @true if @v was updated, @false otherwise. */ static __always_inline bool atomic_dec_unless_positive(atomic_t *v) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_dec_unless_positive(v); } /** * atomic_dec_if_positive() - atomic decrement if positive with full ordering * @v: pointer to atomic_t * * If (@v > 0), atomically updates @v to (@v - 1) with full ordering. * * Unsafe to use in noinstr code; use raw_atomic_dec_if_positive() there. * * Return: The old value of (@v - 1), regardless of whether @v was updated. */ static __always_inline int atomic_dec_if_positive(atomic_t *v) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_dec_if_positive(v); } /** * atomic64_read() - atomic load with relaxed ordering * @v: pointer to atomic64_t * * Atomically loads the value of @v with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic64_read() there. * * Return: The value loaded from @v. */ static __always_inline s64 atomic64_read(const atomic64_t *v) { instrument_atomic_read(v, sizeof(*v)); return raw_atomic64_read(v); } /** * atomic64_read_acquire() - atomic load with acquire ordering * @v: pointer to atomic64_t * * Atomically loads the value of @v with acquire ordering. * * Unsafe to use in noinstr code; use raw_atomic64_read_acquire() there. * * Return: The value loaded from @v. */ static __always_inline s64 atomic64_read_acquire(const atomic64_t *v) { instrument_atomic_read(v, sizeof(*v)); return raw_atomic64_read_acquire(v); } /** * atomic64_set() - atomic set with relaxed ordering * @v: pointer to atomic64_t * @i: s64 value to assign * * Atomically sets @v to @i with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic64_set() there. * * Return: Nothing. */ static __always_inline void atomic64_set(atomic64_t *v, s64 i) { instrument_atomic_write(v, sizeof(*v)); raw_atomic64_set(v, i); } /** * atomic64_set_release() - atomic set with release ordering * @v: pointer to atomic64_t * @i: s64 value to assign * * Atomically sets @v to @i with release ordering. * * Unsafe to use in noinstr code; use raw_atomic64_set_release() there. * * Return: Nothing. */ static __always_inline void atomic64_set_release(atomic64_t *v, s64 i) { kcsan_release(); instrument_atomic_write(v, sizeof(*v)); raw_atomic64_set_release(v, i); } /** * atomic64_add() - atomic add with relaxed ordering * @i: s64 value to add * @v: pointer to atomic64_t * * Atomically updates @v to (@v + @i) with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic64_add() there. * * Return: Nothing. */ static __always_inline void atomic64_add(s64 i, atomic64_t *v) { instrument_atomic_read_write(v, sizeof(*v)); raw_atomic64_add(i, v); } /** * atomic64_add_return() - atomic add with full ordering * @i: s64 value to add * @v: pointer to atomic64_t * * Atomically updates @v to (@v + @i) with full ordering. * * Unsafe to use in noinstr code; use raw_atomic64_add_return() there. * * Return: The updated value of @v. */ static __always_inline s64 atomic64_add_return(s64 i, atomic64_t *v) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic64_add_return(i, v); } /** * atomic64_add_return_acquire() - atomic add with acquire ordering * @i: s64 value to add * @v: pointer to atomic64_t * * Atomically updates @v to (@v + @i) with acquire ordering. * * Unsafe to use in noinstr code; use raw_atomic64_add_return_acquire() there. * * Return: The updated value of @v. */ static __always_inline s64 atomic64_add_return_acquire(s64 i, atomic64_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic64_add_return_acquire(i, v); } /** * atomic64_add_return_release() - atomic add with release ordering * @i: s64 value to add * @v: pointer to atomic64_t * * Atomically updates @v to (@v + @i) with release ordering. * * Unsafe to use in noinstr code; use raw_atomic64_add_return_release() there. * * Return: The updated value of @v. */ static __always_inline s64 atomic64_add_return_release(s64 i, atomic64_t *v) { kcsan_release(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic64_add_return_release(i, v); } /** * atomic64_add_return_relaxed() - atomic add with relaxed ordering * @i: s64 value to add * @v: pointer to atomic64_t * * Atomically updates @v to (@v + @i) with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic64_add_return_relaxed() there. * * Return: The updated value of @v. */ static __always_inline s64 atomic64_add_return_relaxed(s64 i, atomic64_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic64_add_return_relaxed(i, v); } /** * atomic64_fetch_add() - atomic add with full ordering * @i: s64 value to add * @v: pointer to atomic64_t * * Atomically updates @v to (@v + @i) with full ordering. * * Unsafe to use in noinstr code; use raw_atomic64_fetch_add() there. * * Return: The original value of @v. */ static __always_inline s64 atomic64_fetch_add(s64 i, atomic64_t *v) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic64_fetch_add(i, v); } /** * atomic64_fetch_add_acquire() - atomic add with acquire ordering * @i: s64 value to add * @v: pointer to atomic64_t * * Atomically updates @v to (@v + @i) with acquire ordering. * * Unsafe to use in noinstr code; use raw_atomic64_fetch_add_acquire() there. * * Return: The original value of @v. */ static __always_inline s64 atomic64_fetch_add_acquire(s64 i, atomic64_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic64_fetch_add_acquire(i, v); } /** * atomic64_fetch_add_release() - atomic add with release ordering * @i: s64 value to add * @v: pointer to atomic64_t * * Atomically updates @v to (@v + @i) with release ordering. * * Unsafe to use in noinstr code; use raw_atomic64_fetch_add_release() there. * * Return: The original value of @v. */ static __always_inline s64 atomic64_fetch_add_release(s64 i, atomic64_t *v) { kcsan_release(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic64_fetch_add_release(i, v); } /** * atomic64_fetch_add_relaxed() - atomic add with relaxed ordering * @i: s64 value to add * @v: pointer to atomic64_t * * Atomically updates @v to (@v + @i) with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic64_fetch_add_relaxed() there. * * Return: The original value of @v. */ static __always_inline s64 atomic64_fetch_add_relaxed(s64 i, atomic64_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic64_fetch_add_relaxed(i, v); } /** * atomic64_sub() - atomic subtract with relaxed ordering * @i: s64 value to subtract * @v: pointer to atomic64_t * * Atomically updates @v to (@v - @i) with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic64_sub() there. * * Return: Nothing. */ static __always_inline void atomic64_sub(s64 i, atomic64_t *v) { instrument_atomic_read_write(v, sizeof(*v)); raw_atomic64_sub(i, v); } /** * atomic64_sub_return() - atomic subtract with full ordering * @i: s64 value to subtract * @v: pointer to atomic64_t * * Atomically updates @v to (@v - @i) with full ordering. * * Unsafe to use in noinstr code; use raw_atomic64_sub_return() there. * * Return: The updated value of @v. */ static __always_inline s64 atomic64_sub_return(s64 i, atomic64_t *v) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic64_sub_return(i, v); } /** * atomic64_sub_return_acquire() - atomic subtract with acquire ordering * @i: s64 value to subtract * @v: pointer to atomic64_t * * Atomically updates @v to (@v - @i) with acquire ordering. * * Unsafe to use in noinstr code; use raw_atomic64_sub_return_acquire() there. * * Return: The updated value of @v. */ static __always_inline s64 atomic64_sub_return_acquire(s64 i, atomic64_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic64_sub_return_acquire(i, v); } /** * atomic64_sub_return_release() - atomic subtract with release ordering * @i: s64 value to subtract * @v: pointer to atomic64_t * * Atomically updates @v to (@v - @i) with release ordering. * * Unsafe to use in noinstr code; use raw_atomic64_sub_return_release() there. * * Return: The updated value of @v. */ static __always_inline s64 atomic64_sub_return_release(s64 i, atomic64_t *v) { kcsan_release(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic64_sub_return_release(i, v); } /** * atomic64_sub_return_relaxed() - atomic subtract with relaxed ordering * @i: s64 value to subtract * @v: pointer to atomic64_t * * Atomically updates @v to (@v - @i) with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic64_sub_return_relaxed() there. * * Return: The updated value of @v. */ static __always_inline s64 atomic64_sub_return_relaxed(s64 i, atomic64_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic64_sub_return_relaxed(i, v); } /** * atomic64_fetch_sub() - atomic subtract with full ordering * @i: s64 value to subtract * @v: pointer to atomic64_t * * Atomically updates @v to (@v - @i) with full ordering. * * Unsafe to use in noinstr code; use raw_atomic64_fetch_sub() there. * * Return: The original value of @v. */ static __always_inline s64 atomic64_fetch_sub(s64 i, atomic64_t *v) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic64_fetch_sub(i, v); } /** * atomic64_fetch_sub_acquire() - atomic subtract with acquire ordering * @i: s64 value to subtract * @v: pointer to atomic64_t * * Atomically updates @v to (@v - @i) with acquire ordering. * * Unsafe to use in noinstr code; use raw_atomic64_fetch_sub_acquire() there. * * Return: The original value of @v. */ static __always_inline s64 atomic64_fetch_sub_acquire(s64 i, atomic64_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic64_fetch_sub_acquire(i, v); } /** * atomic64_fetch_sub_release() - atomic subtract with release ordering * @i: s64 value to subtract * @v: pointer to atomic64_t * * Atomically updates @v to (@v - @i) with release ordering. * * Unsafe to use in noinstr code; use raw_atomic64_fetch_sub_release() there. * * Return: The original value of @v. */ static __always_inline s64 atomic64_fetch_sub_release(s64 i, atomic64_t *v) { kcsan_release(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic64_fetch_sub_release(i, v); } /** * atomic64_fetch_sub_relaxed() - atomic subtract with relaxed ordering * @i: s64 value to subtract * @v: pointer to atomic64_t * * Atomically updates @v to (@v - @i) with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic64_fetch_sub_relaxed() there. * * Return: The original value of @v. */ static __always_inline s64 atomic64_fetch_sub_relaxed(s64 i, atomic64_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic64_fetch_sub_relaxed(i, v); } /** * atomic64_inc() - atomic increment with relaxed ordering * @v: pointer to atomic64_t * * Atomically updates @v to (@v + 1) with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic64_inc() there. * * Return: Nothing. */ static __always_inline void atomic64_inc(atomic64_t *v) { instrument_atomic_read_write(v, sizeof(*v)); raw_atomic64_inc(v); } /** * atomic64_inc_return() - atomic increment with full ordering * @v: pointer to atomic64_t * * Atomically updates @v to (@v + 1) with full ordering. * * Unsafe to use in noinstr code; use raw_atomic64_inc_return() there. * * Return: The updated value of @v. */ static __always_inline s64 atomic64_inc_return(atomic64_t *v) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic64_inc_return(v); } /** * atomic64_inc_return_acquire() - atomic increment with acquire ordering * @v: pointer to atomic64_t * * Atomically updates @v to (@v + 1) with acquire ordering. * * Unsafe to use in noinstr code; use raw_atomic64_inc_return_acquire() there. * * Return: The updated value of @v. */ static __always_inline s64 atomic64_inc_return_acquire(atomic64_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic64_inc_return_acquire(v); } /** * atomic64_inc_return_release() - atomic increment with release ordering * @v: pointer to atomic64_t * * Atomically updates @v to (@v + 1) with release ordering. * * Unsafe to use in noinstr code; use raw_atomic64_inc_return_release() there. * * Return: The updated value of @v. */ static __always_inline s64 atomic64_inc_return_release(atomic64_t *v) { kcsan_release(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic64_inc_return_release(v); } /** * atomic64_inc_return_relaxed() - atomic increment with relaxed ordering * @v: pointer to atomic64_t * * Atomically updates @v to (@v + 1) with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic64_inc_return_relaxed() there. * * Return: The updated value of @v. */ static __always_inline s64 atomic64_inc_return_relaxed(atomic64_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic64_inc_return_relaxed(v); } /** * atomic64_fetch_inc() - atomic increment with full ordering * @v: pointer to atomic64_t * * Atomically updates @v to (@v + 1) with full ordering. * * Unsafe to use in noinstr code; use raw_atomic64_fetch_inc() there. * * Return: The original value of @v. */ static __always_inline s64 atomic64_fetch_inc(atomic64_t *v) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic64_fetch_inc(v); } /** * atomic64_fetch_inc_acquire() - atomic increment with acquire ordering * @v: pointer to atomic64_t * * Atomically updates @v to (@v + 1) with acquire ordering. * * Unsafe to use in noinstr code; use raw_atomic64_fetch_inc_acquire() there. * * Return: The original value of @v. */ static __always_inline s64 atomic64_fetch_inc_acquire(atomic64_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic64_fetch_inc_acquire(v); } /** * atomic64_fetch_inc_release() - atomic increment with release ordering * @v: pointer to atomic64_t * * Atomically updates @v to (@v + 1) with release ordering. * * Unsafe to use in noinstr code; use raw_atomic64_fetch_inc_release() there. * * Return: The original value of @v. */ static __always_inline s64 atomic64_fetch_inc_release(atomic64_t *v) { kcsan_release(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic64_fetch_inc_release(v); } /** * atomic64_fetch_inc_relaxed() - atomic increment with relaxed ordering * @v: pointer to atomic64_t * * Atomically updates @v to (@v + 1) with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic64_fetch_inc_relaxed() there. * * Return: The original value of @v. */ static __always_inline s64 atomic64_fetch_inc_relaxed(atomic64_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic64_fetch_inc_relaxed(v); } /** * atomic64_dec() - atomic decrement with relaxed ordering * @v: pointer to atomic64_t * * Atomically updates @v to (@v - 1) with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic64_dec() there. * * Return: Nothing. */ static __always_inline void atomic64_dec(atomic64_t *v) { instrument_atomic_read_write(v, sizeof(*v)); raw_atomic64_dec(v); } /** * atomic64_dec_return() - atomic decrement with full ordering * @v: pointer to atomic64_t * * Atomically updates @v to (@v - 1) with full ordering. * * Unsafe to use in noinstr code; use raw_atomic64_dec_return() there. * * Return: The updated value of @v. */ static __always_inline s64 atomic64_dec_return(atomic64_t *v) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic64_dec_return(v); } /** * atomic64_dec_return_acquire() - atomic decrement with acquire ordering * @v: pointer to atomic64_t * * Atomically updates @v to (@v - 1) with acquire ordering. * * Unsafe to use in noinstr code; use raw_atomic64_dec_return_acquire() there. * * Return: The updated value of @v. */ static __always_inline s64 atomic64_dec_return_acquire(atomic64_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic64_dec_return_acquire(v); } /** * atomic64_dec_return_release() - atomic decrement with release ordering * @v: pointer to atomic64_t * * Atomically updates @v to (@v - 1) with release ordering. * * Unsafe to use in noinstr code; use raw_atomic64_dec_return_release() there. * * Return: The updated value of @v. */ static __always_inline s64 atomic64_dec_return_release(atomic64_t *v) { kcsan_release(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic64_dec_return_release(v); } /** * atomic64_dec_return_relaxed() - atomic decrement with relaxed ordering * @v: pointer to atomic64_t * * Atomically updates @v to (@v - 1) with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic64_dec_return_relaxed() there. * * Return: The updated value of @v. */ static __always_inline s64 atomic64_dec_return_relaxed(atomic64_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic64_dec_return_relaxed(v); } /** * atomic64_fetch_dec() - atomic decrement with full ordering * @v: pointer to atomic64_t * * Atomically updates @v to (@v - 1) with full ordering. * * Unsafe to use in noinstr code; use raw_atomic64_fetch_dec() there. * * Return: The original value of @v. */ static __always_inline s64 atomic64_fetch_dec(atomic64_t *v) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic64_fetch_dec(v); } /** * atomic64_fetch_dec_acquire() - atomic decrement with acquire ordering * @v: pointer to atomic64_t * * Atomically updates @v to (@v - 1) with acquire ordering. * * Unsafe to use in noinstr code; use raw_atomic64_fetch_dec_acquire() there. * * Return: The original value of @v. */ static __always_inline s64 atomic64_fetch_dec_acquire(atomic64_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic64_fetch_dec_acquire(v); } /** * atomic64_fetch_dec_release() - atomic decrement with release ordering * @v: pointer to atomic64_t * * Atomically updates @v to (@v - 1) with release ordering. * * Unsafe to use in noinstr code; use raw_atomic64_fetch_dec_release() there. * * Return: The original value of @v. */ static __always_inline s64 atomic64_fetch_dec_release(atomic64_t *v) { kcsan_release(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic64_fetch_dec_release(v); } /** * atomic64_fetch_dec_relaxed() - atomic decrement with relaxed ordering * @v: pointer to atomic64_t * * Atomically updates @v to (@v - 1) with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic64_fetch_dec_relaxed() there. * * Return: The original value of @v. */ static __always_inline s64 atomic64_fetch_dec_relaxed(atomic64_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic64_fetch_dec_relaxed(v); } /** * atomic64_and() - atomic bitwise AND with relaxed ordering * @i: s64 value * @v: pointer to atomic64_t * * Atomically updates @v to (@v & @i) with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic64_and() there. * * Return: Nothing. */ static __always_inline void atomic64_and(s64 i, atomic64_t *v) { instrument_atomic_read_write(v, sizeof(*v)); raw_atomic64_and(i, v); } /** * atomic64_fetch_and() - atomic bitwise AND with full ordering * @i: s64 value * @v: pointer to atomic64_t * * Atomically updates @v to (@v & @i) with full ordering. * * Unsafe to use in noinstr code; use raw_atomic64_fetch_and() there. * * Return: The original value of @v. */ static __always_inline s64 atomic64_fetch_and(s64 i, atomic64_t *v) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic64_fetch_and(i, v); } /** * atomic64_fetch_and_acquire() - atomic bitwise AND with acquire ordering * @i: s64 value * @v: pointer to atomic64_t * * Atomically updates @v to (@v & @i) with acquire ordering. * * Unsafe to use in noinstr code; use raw_atomic64_fetch_and_acquire() there. * * Return: The original value of @v. */ static __always_inline s64 atomic64_fetch_and_acquire(s64 i, atomic64_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic64_fetch_and_acquire(i, v); } /** * atomic64_fetch_and_release() - atomic bitwise AND with release ordering * @i: s64 value * @v: pointer to atomic64_t * * Atomically updates @v to (@v & @i) with release ordering. * * Unsafe to use in noinstr code; use raw_atomic64_fetch_and_release() there. * * Return: The original value of @v. */ static __always_inline s64 atomic64_fetch_and_release(s64 i, atomic64_t *v) { kcsan_release(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic64_fetch_and_release(i, v); } /** * atomic64_fetch_and_relaxed() - atomic bitwise AND with relaxed ordering * @i: s64 value * @v: pointer to atomic64_t * * Atomically updates @v to (@v & @i) with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic64_fetch_and_relaxed() there. * * Return: The original value of @v. */ static __always_inline s64 atomic64_fetch_and_relaxed(s64 i, atomic64_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic64_fetch_and_relaxed(i, v); } /** * atomic64_andnot() - atomic bitwise AND NOT with relaxed ordering * @i: s64 value * @v: pointer to atomic64_t * * Atomically updates @v to (@v & ~@i) with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic64_andnot() there. * * Return: Nothing. */ static __always_inline void atomic64_andnot(s64 i, atomic64_t *v) { instrument_atomic_read_write(v, sizeof(*v)); raw_atomic64_andnot(i, v); } /** * atomic64_fetch_andnot() - atomic bitwise AND NOT with full ordering * @i: s64 value * @v: pointer to atomic64_t * * Atomically updates @v to (@v & ~@i) with full ordering. * * Unsafe to use in noinstr code; use raw_atomic64_fetch_andnot() there. * * Return: The original value of @v. */ static __always_inline s64 atomic64_fetch_andnot(s64 i, atomic64_t *v) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic64_fetch_andnot(i, v); } /** * atomic64_fetch_andnot_acquire() - atomic bitwise AND NOT with acquire ordering * @i: s64 value * @v: pointer to atomic64_t * * Atomically updates @v to (@v & ~@i) with acquire ordering. * * Unsafe to use in noinstr code; use raw_atomic64_fetch_andnot_acquire() there. * * Return: The original value of @v. */ static __always_inline s64 atomic64_fetch_andnot_acquire(s64 i, atomic64_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic64_fetch_andnot_acquire(i, v); } /** * atomic64_fetch_andnot_release() - atomic bitwise AND NOT with release ordering * @i: s64 value * @v: pointer to atomic64_t * * Atomically updates @v to (@v & ~@i) with release ordering. * * Unsafe to use in noinstr code; use raw_atomic64_fetch_andnot_release() there. * * Return: The original value of @v. */ static __always_inline s64 atomic64_fetch_andnot_release(s64 i, atomic64_t *v) { kcsan_release(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic64_fetch_andnot_release(i, v); } /** * atomic64_fetch_andnot_relaxed() - atomic bitwise AND NOT with relaxed ordering * @i: s64 value * @v: pointer to atomic64_t * * Atomically updates @v to (@v & ~@i) with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic64_fetch_andnot_relaxed() there. * * Return: The original value of @v. */ static __always_inline s64 atomic64_fetch_andnot_relaxed(s64 i, atomic64_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic64_fetch_andnot_relaxed(i, v); } /** * atomic64_or() - atomic bitwise OR with relaxed ordering * @i: s64 value * @v: pointer to atomic64_t * * Atomically updates @v to (@v | @i) with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic64_or() there. * * Return: Nothing. */ static __always_inline void atomic64_or(s64 i, atomic64_t *v) { instrument_atomic_read_write(v, sizeof(*v)); raw_atomic64_or(i, v); } /** * atomic64_fetch_or() - atomic bitwise OR with full ordering * @i: s64 value * @v: pointer to atomic64_t * * Atomically updates @v to (@v | @i) with full ordering. * * Unsafe to use in noinstr code; use raw_atomic64_fetch_or() there. * * Return: The original value of @v. */ static __always_inline s64 atomic64_fetch_or(s64 i, atomic64_t *v) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic64_fetch_or(i, v); } /** * atomic64_fetch_or_acquire() - atomic bitwise OR with acquire ordering * @i: s64 value * @v: pointer to atomic64_t * * Atomically updates @v to (@v | @i) with acquire ordering. * * Unsafe to use in noinstr code; use raw_atomic64_fetch_or_acquire() there. * * Return: The original value of @v. */ static __always_inline s64 atomic64_fetch_or_acquire(s64 i, atomic64_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic64_fetch_or_acquire(i, v); } /** * atomic64_fetch_or_release() - atomic bitwise OR with release ordering * @i: s64 value * @v: pointer to atomic64_t * * Atomically updates @v to (@v | @i) with release ordering. * * Unsafe to use in noinstr code; use raw_atomic64_fetch_or_release() there. * * Return: The original value of @v. */ static __always_inline s64 atomic64_fetch_or_release(s64 i, atomic64_t *v) { kcsan_release(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic64_fetch_or_release(i, v); } /** * atomic64_fetch_or_relaxed() - atomic bitwise OR with relaxed ordering * @i: s64 value * @v: pointer to atomic64_t * * Atomically updates @v to (@v | @i) with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic64_fetch_or_relaxed() there. * * Return: The original value of @v. */ static __always_inline s64 atomic64_fetch_or_relaxed(s64 i, atomic64_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic64_fetch_or_relaxed(i, v); } /** * atomic64_xor() - atomic bitwise XOR with relaxed ordering * @i: s64 value * @v: pointer to atomic64_t * * Atomically updates @v to (@v ^ @i) with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic64_xor() there. * * Return: Nothing. */ static __always_inline void atomic64_xor(s64 i, atomic64_t *v) { instrument_atomic_read_write(v, sizeof(*v)); raw_atomic64_xor(i, v); } /** * atomic64_fetch_xor() - atomic bitwise XOR with full ordering * @i: s64 value * @v: pointer to atomic64_t * * Atomically updates @v to (@v ^ @i) with full ordering. * * Unsafe to use in noinstr code; use raw_atomic64_fetch_xor() there. * * Return: The original value of @v. */ static __always_inline s64 atomic64_fetch_xor(s64 i, atomic64_t *v) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic64_fetch_xor(i, v); } /** * atomic64_fetch_xor_acquire() - atomic bitwise XOR with acquire ordering * @i: s64 value * @v: pointer to atomic64_t * * Atomically updates @v to (@v ^ @i) with acquire ordering. * * Unsafe to use in noinstr code; use raw_atomic64_fetch_xor_acquire() there. * * Return: The original value of @v. */ static __always_inline s64 atomic64_fetch_xor_acquire(s64 i, atomic64_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic64_fetch_xor_acquire(i, v); } /** * atomic64_fetch_xor_release() - atomic bitwise XOR with release ordering * @i: s64 value * @v: pointer to atomic64_t * * Atomically updates @v to (@v ^ @i) with release ordering. * * Unsafe to use in noinstr code; use raw_atomic64_fetch_xor_release() there. * * Return: The original value of @v. */ static __always_inline s64 atomic64_fetch_xor_release(s64 i, atomic64_t *v) { kcsan_release(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic64_fetch_xor_release(i, v); } /** * atomic64_fetch_xor_relaxed() - atomic bitwise XOR with relaxed ordering * @i: s64 value * @v: pointer to atomic64_t * * Atomically updates @v to (@v ^ @i) with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic64_fetch_xor_relaxed() there. * * Return: The original value of @v. */ static __always_inline s64 atomic64_fetch_xor_relaxed(s64 i, atomic64_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic64_fetch_xor_relaxed(i, v); } /** * atomic64_xchg() - atomic exchange with full ordering * @v: pointer to atomic64_t * @new: s64 value to assign * * Atomically updates @v to @new with full ordering. * * Unsafe to use in noinstr code; use raw_atomic64_xchg() there. * * Return: The original value of @v. */ static __always_inline s64 atomic64_xchg(atomic64_t *v, s64 new) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic64_xchg(v, new); } /** * atomic64_xchg_acquire() - atomic exchange with acquire ordering * @v: pointer to atomic64_t * @new: s64 value to assign * * Atomically updates @v to @new with acquire ordering. * * Unsafe to use in noinstr code; use raw_atomic64_xchg_acquire() there. * * Return: The original value of @v. */ static __always_inline s64 atomic64_xchg_acquire(atomic64_t *v, s64 new) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic64_xchg_acquire(v, new); } /** * atomic64_xchg_release() - atomic exchange with release ordering * @v: pointer to atomic64_t * @new: s64 value to assign * * Atomically updates @v to @new with release ordering. * * Unsafe to use in noinstr code; use raw_atomic64_xchg_release() there. * * Return: The original value of @v. */ static __always_inline s64 atomic64_xchg_release(atomic64_t *v, s64 new) { kcsan_release(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic64_xchg_release(v, new); } /** * atomic64_xchg_relaxed() - atomic exchange with relaxed ordering * @v: pointer to atomic64_t * @new: s64 value to assign * * Atomically updates @v to @new with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic64_xchg_relaxed() there. * * Return: The original value of @v. */ static __always_inline s64 atomic64_xchg_relaxed(atomic64_t *v, s64 new) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic64_xchg_relaxed(v, new); } /** * atomic64_cmpxchg() - atomic compare and exchange with full ordering * @v: pointer to atomic64_t * @old: s64 value to compare with * @new: s64 value to assign * * If (@v == @old), atomically updates @v to @new with full ordering. * * Unsafe to use in noinstr code; use raw_atomic64_cmpxchg() there. * * Return: The original value of @v. */ static __always_inline s64 atomic64_cmpxchg(atomic64_t *v, s64 old, s64 new) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic64_cmpxchg(v, old, new); } /** * atomic64_cmpxchg_acquire() - atomic compare and exchange with acquire ordering * @v: pointer to atomic64_t * @old: s64 value to compare with * @new: s64 value to assign * * If (@v == @old), atomically updates @v to @new with acquire ordering. * * Unsafe to use in noinstr code; use raw_atomic64_cmpxchg_acquire() there. * * Return: The original value of @v. */ static __always_inline s64 atomic64_cmpxchg_acquire(atomic64_t *v, s64 old, s64 new) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic64_cmpxchg_acquire(v, old, new); } /** * atomic64_cmpxchg_release() - atomic compare and exchange with release ordering * @v: pointer to atomic64_t * @old: s64 value to compare with * @new: s64 value to assign * * If (@v == @old), atomically updates @v to @new with release ordering. * * Unsafe to use in noinstr code; use raw_atomic64_cmpxchg_release() there. * * Return: The original value of @v. */ static __always_inline s64 atomic64_cmpxchg_release(atomic64_t *v, s64 old, s64 new) { kcsan_release(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic64_cmpxchg_release(v, old, new); } /** * atomic64_cmpxchg_relaxed() - atomic compare and exchange with relaxed ordering * @v: pointer to atomic64_t * @old: s64 value to compare with * @new: s64 value to assign * * If (@v == @old), atomically updates @v to @new with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic64_cmpxchg_relaxed() there. * * Return: The original value of @v. */ static __always_inline s64 atomic64_cmpxchg_relaxed(atomic64_t *v, s64 old, s64 new) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic64_cmpxchg_relaxed(v, old, new); } /** * atomic64_try_cmpxchg() - atomic compare and exchange with full ordering * @v: pointer to atomic64_t * @old: pointer to s64 value to compare with * @new: s64 value to assign * * If (@v == @old), atomically updates @v to @new with full ordering. * Otherwise, updates @old to the current value of @v. * * Unsafe to use in noinstr code; use raw_atomic64_try_cmpxchg() there. * * Return: @true if the exchange occured, @false otherwise. */ static __always_inline bool atomic64_try_cmpxchg(atomic64_t *v, s64 *old, s64 new) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); instrument_atomic_read_write(old, sizeof(*old)); return raw_atomic64_try_cmpxchg(v, old, new); } /** * atomic64_try_cmpxchg_acquire() - atomic compare and exchange with acquire ordering * @v: pointer to atomic64_t * @old: pointer to s64 value to compare with * @new: s64 value to assign * * If (@v == @old), atomically updates @v to @new with acquire ordering. * Otherwise, updates @old to the current value of @v. * * Unsafe to use in noinstr code; use raw_atomic64_try_cmpxchg_acquire() there. * * Return: @true if the exchange occured, @false otherwise. */ static __always_inline bool atomic64_try_cmpxchg_acquire(atomic64_t *v, s64 *old, s64 new) { instrument_atomic_read_write(v, sizeof(*v)); instrument_atomic_read_write(old, sizeof(*old)); return raw_atomic64_try_cmpxchg_acquire(v, old, new); } /** * atomic64_try_cmpxchg_release() - atomic compare and exchange with release ordering * @v: pointer to atomic64_t * @old: pointer to s64 value to compare with * @new: s64 value to assign * * If (@v == @old), atomically updates @v to @new with release ordering. * Otherwise, updates @old to the current value of @v. * * Unsafe to use in noinstr code; use raw_atomic64_try_cmpxchg_release() there. * * Return: @true if the exchange occured, @false otherwise. */ static __always_inline bool atomic64_try_cmpxchg_release(atomic64_t *v, s64 *old, s64 new) { kcsan_release(); instrument_atomic_read_write(v, sizeof(*v)); instrument_atomic_read_write(old, sizeof(*old)); return raw_atomic64_try_cmpxchg_release(v, old, new); } /** * atomic64_try_cmpxchg_relaxed() - atomic compare and exchange with relaxed ordering * @v: pointer to atomic64_t * @old: pointer to s64 value to compare with * @new: s64 value to assign * * If (@v == @old), atomically updates @v to @new with relaxed ordering. * Otherwise, updates @old to the current value of @v. * * Unsafe to use in noinstr code; use raw_atomic64_try_cmpxchg_relaxed() there. * * Return: @true if the exchange occured, @false otherwise. */ static __always_inline bool atomic64_try_cmpxchg_relaxed(atomic64_t *v, s64 *old, s64 new) { instrument_atomic_read_write(v, sizeof(*v)); instrument_atomic_read_write(old, sizeof(*old)); return raw_atomic64_try_cmpxchg_relaxed(v, old, new); } /** * atomic64_sub_and_test() - atomic subtract and test if zero with full ordering * @i: s64 value to add * @v: pointer to atomic64_t * * Atomically updates @v to (@v - @i) with full ordering. * * Unsafe to use in noinstr code; use raw_atomic64_sub_and_test() there. * * Return: @true if the resulting value of @v is zero, @false otherwise. */ static __always_inline bool atomic64_sub_and_test(s64 i, atomic64_t *v) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic64_sub_and_test(i, v); } /** * atomic64_dec_and_test() - atomic decrement and test if zero with full ordering * @v: pointer to atomic64_t * * Atomically updates @v to (@v - 1) with full ordering. * * Unsafe to use in noinstr code; use raw_atomic64_dec_and_test() there. * * Return: @true if the resulting value of @v is zero, @false otherwise. */ static __always_inline bool atomic64_dec_and_test(atomic64_t *v) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic64_dec_and_test(v); } /** * atomic64_inc_and_test() - atomic increment and test if zero with full ordering * @v: pointer to atomic64_t * * Atomically updates @v to (@v + 1) with full ordering. * * Unsafe to use in noinstr code; use raw_atomic64_inc_and_test() there. * * Return: @true if the resulting value of @v is zero, @false otherwise. */ static __always_inline bool atomic64_inc_and_test(atomic64_t *v) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic64_inc_and_test(v); } /** * atomic64_add_negative() - atomic add and test if negative with full ordering * @i: s64 value to add * @v: pointer to atomic64_t * * Atomically updates @v to (@v + @i) with full ordering. * * Unsafe to use in noinstr code; use raw_atomic64_add_negative() there. * * Return: @true if the resulting value of @v is negative, @false otherwise. */ static __always_inline bool atomic64_add_negative(s64 i, atomic64_t *v) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic64_add_negative(i, v); } /** * atomic64_add_negative_acquire() - atomic add and test if negative with acquire ordering * @i: s64 value to add * @v: pointer to atomic64_t * * Atomically updates @v to (@v + @i) with acquire ordering. * * Unsafe to use in noinstr code; use raw_atomic64_add_negative_acquire() there. * * Return: @true if the resulting value of @v is negative, @false otherwise. */ static __always_inline bool atomic64_add_negative_acquire(s64 i, atomic64_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic64_add_negative_acquire(i, v); } /** * atomic64_add_negative_release() - atomic add and test if negative with release ordering * @i: s64 value to add * @v: pointer to atomic64_t * * Atomically updates @v to (@v + @i) with release ordering. * * Unsafe to use in noinstr code; use raw_atomic64_add_negative_release() there. * * Return: @true if the resulting value of @v is negative, @false otherwise. */ static __always_inline bool atomic64_add_negative_release(s64 i, atomic64_t *v) { kcsan_release(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic64_add_negative_release(i, v); } /** * atomic64_add_negative_relaxed() - atomic add and test if negative with relaxed ordering * @i: s64 value to add * @v: pointer to atomic64_t * * Atomically updates @v to (@v + @i) with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic64_add_negative_relaxed() there. * * Return: @true if the resulting value of @v is negative, @false otherwise. */ static __always_inline bool atomic64_add_negative_relaxed(s64 i, atomic64_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic64_add_negative_relaxed(i, v); } /** * atomic64_fetch_add_unless() - atomic add unless value with full ordering * @v: pointer to atomic64_t * @a: s64 value to add * @u: s64 value to compare with * * If (@v != @u), atomically updates @v to (@v + @a) with full ordering. * * Unsafe to use in noinstr code; use raw_atomic64_fetch_add_unless() there. * * Return: The original value of @v. */ static __always_inline s64 atomic64_fetch_add_unless(atomic64_t *v, s64 a, s64 u) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic64_fetch_add_unless(v, a, u); } /** * atomic64_add_unless() - atomic add unless value with full ordering * @v: pointer to atomic64_t * @a: s64 value to add * @u: s64 value to compare with * * If (@v != @u), atomically updates @v to (@v + @a) with full ordering. * * Unsafe to use in noinstr code; use raw_atomic64_add_unless() there. * * Return: @true if @v was updated, @false otherwise. */ static __always_inline bool atomic64_add_unless(atomic64_t *v, s64 a, s64 u) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic64_add_unless(v, a, u); } /** * atomic64_inc_not_zero() - atomic increment unless zero with full ordering * @v: pointer to atomic64_t * * If (@v != 0), atomically updates @v to (@v + 1) with full ordering. * * Unsafe to use in noinstr code; use raw_atomic64_inc_not_zero() there. * * Return: @true if @v was updated, @false otherwise. */ static __always_inline bool atomic64_inc_not_zero(atomic64_t *v) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic64_inc_not_zero(v); } /** * atomic64_inc_unless_negative() - atomic increment unless negative with full ordering * @v: pointer to atomic64_t * * If (@v >= 0), atomically updates @v to (@v + 1) with full ordering. * * Unsafe to use in noinstr code; use raw_atomic64_inc_unless_negative() there. * * Return: @true if @v was updated, @false otherwise. */ static __always_inline bool atomic64_inc_unless_negative(atomic64_t *v) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic64_inc_unless_negative(v); } /** * atomic64_dec_unless_positive() - atomic decrement unless positive with full ordering * @v: pointer to atomic64_t * * If (@v <= 0), atomically updates @v to (@v - 1) with full ordering. * * Unsafe to use in noinstr code; use raw_atomic64_dec_unless_positive() there. * * Return: @true if @v was updated, @false otherwise. */ static __always_inline bool atomic64_dec_unless_positive(atomic64_t *v) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic64_dec_unless_positive(v); } /** * atomic64_dec_if_positive() - atomic decrement if positive with full ordering * @v: pointer to atomic64_t * * If (@v > 0), atomically updates @v to (@v - 1) with full ordering. * * Unsafe to use in noinstr code; use raw_atomic64_dec_if_positive() there. * * Return: The old value of (@v - 1), regardless of whether @v was updated. */ static __always_inline s64 atomic64_dec_if_positive(atomic64_t *v) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic64_dec_if_positive(v); } /** * atomic_long_read() - atomic load with relaxed ordering * @v: pointer to atomic_long_t * * Atomically loads the value of @v with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_read() there. * * Return: The value loaded from @v. */ static __always_inline long atomic_long_read(const atomic_long_t *v) { instrument_atomic_read(v, sizeof(*v)); return raw_atomic_long_read(v); } /** * atomic_long_read_acquire() - atomic load with acquire ordering * @v: pointer to atomic_long_t * * Atomically loads the value of @v with acquire ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_read_acquire() there. * * Return: The value loaded from @v. */ static __always_inline long atomic_long_read_acquire(const atomic_long_t *v) { instrument_atomic_read(v, sizeof(*v)); return raw_atomic_long_read_acquire(v); } /** * atomic_long_set() - atomic set with relaxed ordering * @v: pointer to atomic_long_t * @i: long value to assign * * Atomically sets @v to @i with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_set() there. * * Return: Nothing. */ static __always_inline void atomic_long_set(atomic_long_t *v, long i) { instrument_atomic_write(v, sizeof(*v)); raw_atomic_long_set(v, i); } /** * atomic_long_set_release() - atomic set with release ordering * @v: pointer to atomic_long_t * @i: long value to assign * * Atomically sets @v to @i with release ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_set_release() there. * * Return: Nothing. */ static __always_inline void atomic_long_set_release(atomic_long_t *v, long i) { kcsan_release(); instrument_atomic_write(v, sizeof(*v)); raw_atomic_long_set_release(v, i); } /** * atomic_long_add() - atomic add with relaxed ordering * @i: long value to add * @v: pointer to atomic_long_t * * Atomically updates @v to (@v + @i) with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_add() there. * * Return: Nothing. */ static __always_inline void atomic_long_add(long i, atomic_long_t *v) { instrument_atomic_read_write(v, sizeof(*v)); raw_atomic_long_add(i, v); } /** * atomic_long_add_return() - atomic add with full ordering * @i: long value to add * @v: pointer to atomic_long_t * * Atomically updates @v to (@v + @i) with full ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_add_return() there. * * Return: The updated value of @v. */ static __always_inline long atomic_long_add_return(long i, atomic_long_t *v) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_add_return(i, v); } /** * atomic_long_add_return_acquire() - atomic add with acquire ordering * @i: long value to add * @v: pointer to atomic_long_t * * Atomically updates @v to (@v + @i) with acquire ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_add_return_acquire() there. * * Return: The updated value of @v. */ static __always_inline long atomic_long_add_return_acquire(long i, atomic_long_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_add_return_acquire(i, v); } /** * atomic_long_add_return_release() - atomic add with release ordering * @i: long value to add * @v: pointer to atomic_long_t * * Atomically updates @v to (@v + @i) with release ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_add_return_release() there. * * Return: The updated value of @v. */ static __always_inline long atomic_long_add_return_release(long i, atomic_long_t *v) { kcsan_release(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_add_return_release(i, v); } /** * atomic_long_add_return_relaxed() - atomic add with relaxed ordering * @i: long value to add * @v: pointer to atomic_long_t * * Atomically updates @v to (@v + @i) with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_add_return_relaxed() there. * * Return: The updated value of @v. */ static __always_inline long atomic_long_add_return_relaxed(long i, atomic_long_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_add_return_relaxed(i, v); } /** * atomic_long_fetch_add() - atomic add with full ordering * @i: long value to add * @v: pointer to atomic_long_t * * Atomically updates @v to (@v + @i) with full ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_fetch_add() there. * * Return: The original value of @v. */ static __always_inline long atomic_long_fetch_add(long i, atomic_long_t *v) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_fetch_add(i, v); } /** * atomic_long_fetch_add_acquire() - atomic add with acquire ordering * @i: long value to add * @v: pointer to atomic_long_t * * Atomically updates @v to (@v + @i) with acquire ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_fetch_add_acquire() there. * * Return: The original value of @v. */ static __always_inline long atomic_long_fetch_add_acquire(long i, atomic_long_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_fetch_add_acquire(i, v); } /** * atomic_long_fetch_add_release() - atomic add with release ordering * @i: long value to add * @v: pointer to atomic_long_t * * Atomically updates @v to (@v + @i) with release ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_fetch_add_release() there. * * Return: The original value of @v. */ static __always_inline long atomic_long_fetch_add_release(long i, atomic_long_t *v) { kcsan_release(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_fetch_add_release(i, v); } /** * atomic_long_fetch_add_relaxed() - atomic add with relaxed ordering * @i: long value to add * @v: pointer to atomic_long_t * * Atomically updates @v to (@v + @i) with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_fetch_add_relaxed() there. * * Return: The original value of @v. */ static __always_inline long atomic_long_fetch_add_relaxed(long i, atomic_long_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_fetch_add_relaxed(i, v); } /** * atomic_long_sub() - atomic subtract with relaxed ordering * @i: long value to subtract * @v: pointer to atomic_long_t * * Atomically updates @v to (@v - @i) with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_sub() there. * * Return: Nothing. */ static __always_inline void atomic_long_sub(long i, atomic_long_t *v) { instrument_atomic_read_write(v, sizeof(*v)); raw_atomic_long_sub(i, v); } /** * atomic_long_sub_return() - atomic subtract with full ordering * @i: long value to subtract * @v: pointer to atomic_long_t * * Atomically updates @v to (@v - @i) with full ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_sub_return() there. * * Return: The updated value of @v. */ static __always_inline long atomic_long_sub_return(long i, atomic_long_t *v) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_sub_return(i, v); } /** * atomic_long_sub_return_acquire() - atomic subtract with acquire ordering * @i: long value to subtract * @v: pointer to atomic_long_t * * Atomically updates @v to (@v - @i) with acquire ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_sub_return_acquire() there. * * Return: The updated value of @v. */ static __always_inline long atomic_long_sub_return_acquire(long i, atomic_long_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_sub_return_acquire(i, v); } /** * atomic_long_sub_return_release() - atomic subtract with release ordering * @i: long value to subtract * @v: pointer to atomic_long_t * * Atomically updates @v to (@v - @i) with release ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_sub_return_release() there. * * Return: The updated value of @v. */ static __always_inline long atomic_long_sub_return_release(long i, atomic_long_t *v) { kcsan_release(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_sub_return_release(i, v); } /** * atomic_long_sub_return_relaxed() - atomic subtract with relaxed ordering * @i: long value to subtract * @v: pointer to atomic_long_t * * Atomically updates @v to (@v - @i) with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_sub_return_relaxed() there. * * Return: The updated value of @v. */ static __always_inline long atomic_long_sub_return_relaxed(long i, atomic_long_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_sub_return_relaxed(i, v); } /** * atomic_long_fetch_sub() - atomic subtract with full ordering * @i: long value to subtract * @v: pointer to atomic_long_t * * Atomically updates @v to (@v - @i) with full ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_fetch_sub() there. * * Return: The original value of @v. */ static __always_inline long atomic_long_fetch_sub(long i, atomic_long_t *v) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_fetch_sub(i, v); } /** * atomic_long_fetch_sub_acquire() - atomic subtract with acquire ordering * @i: long value to subtract * @v: pointer to atomic_long_t * * Atomically updates @v to (@v - @i) with acquire ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_fetch_sub_acquire() there. * * Return: The original value of @v. */ static __always_inline long atomic_long_fetch_sub_acquire(long i, atomic_long_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_fetch_sub_acquire(i, v); } /** * atomic_long_fetch_sub_release() - atomic subtract with release ordering * @i: long value to subtract * @v: pointer to atomic_long_t * * Atomically updates @v to (@v - @i) with release ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_fetch_sub_release() there. * * Return: The original value of @v. */ static __always_inline long atomic_long_fetch_sub_release(long i, atomic_long_t *v) { kcsan_release(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_fetch_sub_release(i, v); } /** * atomic_long_fetch_sub_relaxed() - atomic subtract with relaxed ordering * @i: long value to subtract * @v: pointer to atomic_long_t * * Atomically updates @v to (@v - @i) with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_fetch_sub_relaxed() there. * * Return: The original value of @v. */ static __always_inline long atomic_long_fetch_sub_relaxed(long i, atomic_long_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_fetch_sub_relaxed(i, v); } /** * atomic_long_inc() - atomic increment with relaxed ordering * @v: pointer to atomic_long_t * * Atomically updates @v to (@v + 1) with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_inc() there. * * Return: Nothing. */ static __always_inline void atomic_long_inc(atomic_long_t *v) { instrument_atomic_read_write(v, sizeof(*v)); raw_atomic_long_inc(v); } /** * atomic_long_inc_return() - atomic increment with full ordering * @v: pointer to atomic_long_t * * Atomically updates @v to (@v + 1) with full ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_inc_return() there. * * Return: The updated value of @v. */ static __always_inline long atomic_long_inc_return(atomic_long_t *v) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_inc_return(v); } /** * atomic_long_inc_return_acquire() - atomic increment with acquire ordering * @v: pointer to atomic_long_t * * Atomically updates @v to (@v + 1) with acquire ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_inc_return_acquire() there. * * Return: The updated value of @v. */ static __always_inline long atomic_long_inc_return_acquire(atomic_long_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_inc_return_acquire(v); } /** * atomic_long_inc_return_release() - atomic increment with release ordering * @v: pointer to atomic_long_t * * Atomically updates @v to (@v + 1) with release ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_inc_return_release() there. * * Return: The updated value of @v. */ static __always_inline long atomic_long_inc_return_release(atomic_long_t *v) { kcsan_release(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_inc_return_release(v); } /** * atomic_long_inc_return_relaxed() - atomic increment with relaxed ordering * @v: pointer to atomic_long_t * * Atomically updates @v to (@v + 1) with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_inc_return_relaxed() there. * * Return: The updated value of @v. */ static __always_inline long atomic_long_inc_return_relaxed(atomic_long_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_inc_return_relaxed(v); } /** * atomic_long_fetch_inc() - atomic increment with full ordering * @v: pointer to atomic_long_t * * Atomically updates @v to (@v + 1) with full ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_fetch_inc() there. * * Return: The original value of @v. */ static __always_inline long atomic_long_fetch_inc(atomic_long_t *v) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_fetch_inc(v); } /** * atomic_long_fetch_inc_acquire() - atomic increment with acquire ordering * @v: pointer to atomic_long_t * * Atomically updates @v to (@v + 1) with acquire ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_fetch_inc_acquire() there. * * Return: The original value of @v. */ static __always_inline long atomic_long_fetch_inc_acquire(atomic_long_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_fetch_inc_acquire(v); } /** * atomic_long_fetch_inc_release() - atomic increment with release ordering * @v: pointer to atomic_long_t * * Atomically updates @v to (@v + 1) with release ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_fetch_inc_release() there. * * Return: The original value of @v. */ static __always_inline long atomic_long_fetch_inc_release(atomic_long_t *v) { kcsan_release(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_fetch_inc_release(v); } /** * atomic_long_fetch_inc_relaxed() - atomic increment with relaxed ordering * @v: pointer to atomic_long_t * * Atomically updates @v to (@v + 1) with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_fetch_inc_relaxed() there. * * Return: The original value of @v. */ static __always_inline long atomic_long_fetch_inc_relaxed(atomic_long_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_fetch_inc_relaxed(v); } /** * atomic_long_dec() - atomic decrement with relaxed ordering * @v: pointer to atomic_long_t * * Atomically updates @v to (@v - 1) with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_dec() there. * * Return: Nothing. */ static __always_inline void atomic_long_dec(atomic_long_t *v) { instrument_atomic_read_write(v, sizeof(*v)); raw_atomic_long_dec(v); } /** * atomic_long_dec_return() - atomic decrement with full ordering * @v: pointer to atomic_long_t * * Atomically updates @v to (@v - 1) with full ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_dec_return() there. * * Return: The updated value of @v. */ static __always_inline long atomic_long_dec_return(atomic_long_t *v) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_dec_return(v); } /** * atomic_long_dec_return_acquire() - atomic decrement with acquire ordering * @v: pointer to atomic_long_t * * Atomically updates @v to (@v - 1) with acquire ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_dec_return_acquire() there. * * Return: The updated value of @v. */ static __always_inline long atomic_long_dec_return_acquire(atomic_long_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_dec_return_acquire(v); } /** * atomic_long_dec_return_release() - atomic decrement with release ordering * @v: pointer to atomic_long_t * * Atomically updates @v to (@v - 1) with release ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_dec_return_release() there. * * Return: The updated value of @v. */ static __always_inline long atomic_long_dec_return_release(atomic_long_t *v) { kcsan_release(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_dec_return_release(v); } /** * atomic_long_dec_return_relaxed() - atomic decrement with relaxed ordering * @v: pointer to atomic_long_t * * Atomically updates @v to (@v - 1) with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_dec_return_relaxed() there. * * Return: The updated value of @v. */ static __always_inline long atomic_long_dec_return_relaxed(atomic_long_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_dec_return_relaxed(v); } /** * atomic_long_fetch_dec() - atomic decrement with full ordering * @v: pointer to atomic_long_t * * Atomically updates @v to (@v - 1) with full ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_fetch_dec() there. * * Return: The original value of @v. */ static __always_inline long atomic_long_fetch_dec(atomic_long_t *v) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_fetch_dec(v); } /** * atomic_long_fetch_dec_acquire() - atomic decrement with acquire ordering * @v: pointer to atomic_long_t * * Atomically updates @v to (@v - 1) with acquire ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_fetch_dec_acquire() there. * * Return: The original value of @v. */ static __always_inline long atomic_long_fetch_dec_acquire(atomic_long_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_fetch_dec_acquire(v); } /** * atomic_long_fetch_dec_release() - atomic decrement with release ordering * @v: pointer to atomic_long_t * * Atomically updates @v to (@v - 1) with release ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_fetch_dec_release() there. * * Return: The original value of @v. */ static __always_inline long atomic_long_fetch_dec_release(atomic_long_t *v) { kcsan_release(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_fetch_dec_release(v); } /** * atomic_long_fetch_dec_relaxed() - atomic decrement with relaxed ordering * @v: pointer to atomic_long_t * * Atomically updates @v to (@v - 1) with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_fetch_dec_relaxed() there. * * Return: The original value of @v. */ static __always_inline long atomic_long_fetch_dec_relaxed(atomic_long_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_fetch_dec_relaxed(v); } /** * atomic_long_and() - atomic bitwise AND with relaxed ordering * @i: long value * @v: pointer to atomic_long_t * * Atomically updates @v to (@v & @i) with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_and() there. * * Return: Nothing. */ static __always_inline void atomic_long_and(long i, atomic_long_t *v) { instrument_atomic_read_write(v, sizeof(*v)); raw_atomic_long_and(i, v); } /** * atomic_long_fetch_and() - atomic bitwise AND with full ordering * @i: long value * @v: pointer to atomic_long_t * * Atomically updates @v to (@v & @i) with full ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_fetch_and() there. * * Return: The original value of @v. */ static __always_inline long atomic_long_fetch_and(long i, atomic_long_t *v) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_fetch_and(i, v); } /** * atomic_long_fetch_and_acquire() - atomic bitwise AND with acquire ordering * @i: long value * @v: pointer to atomic_long_t * * Atomically updates @v to (@v & @i) with acquire ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_fetch_and_acquire() there. * * Return: The original value of @v. */ static __always_inline long atomic_long_fetch_and_acquire(long i, atomic_long_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_fetch_and_acquire(i, v); } /** * atomic_long_fetch_and_release() - atomic bitwise AND with release ordering * @i: long value * @v: pointer to atomic_long_t * * Atomically updates @v to (@v & @i) with release ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_fetch_and_release() there. * * Return: The original value of @v. */ static __always_inline long atomic_long_fetch_and_release(long i, atomic_long_t *v) { kcsan_release(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_fetch_and_release(i, v); } /** * atomic_long_fetch_and_relaxed() - atomic bitwise AND with relaxed ordering * @i: long value * @v: pointer to atomic_long_t * * Atomically updates @v to (@v & @i) with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_fetch_and_relaxed() there. * * Return: The original value of @v. */ static __always_inline long atomic_long_fetch_and_relaxed(long i, atomic_long_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_fetch_and_relaxed(i, v); } /** * atomic_long_andnot() - atomic bitwise AND NOT with relaxed ordering * @i: long value * @v: pointer to atomic_long_t * * Atomically updates @v to (@v & ~@i) with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_andnot() there. * * Return: Nothing. */ static __always_inline void atomic_long_andnot(long i, atomic_long_t *v) { instrument_atomic_read_write(v, sizeof(*v)); raw_atomic_long_andnot(i, v); } /** * atomic_long_fetch_andnot() - atomic bitwise AND NOT with full ordering * @i: long value * @v: pointer to atomic_long_t * * Atomically updates @v to (@v & ~@i) with full ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_fetch_andnot() there. * * Return: The original value of @v. */ static __always_inline long atomic_long_fetch_andnot(long i, atomic_long_t *v) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_fetch_andnot(i, v); } /** * atomic_long_fetch_andnot_acquire() - atomic bitwise AND NOT with acquire ordering * @i: long value * @v: pointer to atomic_long_t * * Atomically updates @v to (@v & ~@i) with acquire ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_fetch_andnot_acquire() there. * * Return: The original value of @v. */ static __always_inline long atomic_long_fetch_andnot_acquire(long i, atomic_long_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_fetch_andnot_acquire(i, v); } /** * atomic_long_fetch_andnot_release() - atomic bitwise AND NOT with release ordering * @i: long value * @v: pointer to atomic_long_t * * Atomically updates @v to (@v & ~@i) with release ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_fetch_andnot_release() there. * * Return: The original value of @v. */ static __always_inline long atomic_long_fetch_andnot_release(long i, atomic_long_t *v) { kcsan_release(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_fetch_andnot_release(i, v); } /** * atomic_long_fetch_andnot_relaxed() - atomic bitwise AND NOT with relaxed ordering * @i: long value * @v: pointer to atomic_long_t * * Atomically updates @v to (@v & ~@i) with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_fetch_andnot_relaxed() there. * * Return: The original value of @v. */ static __always_inline long atomic_long_fetch_andnot_relaxed(long i, atomic_long_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_fetch_andnot_relaxed(i, v); } /** * atomic_long_or() - atomic bitwise OR with relaxed ordering * @i: long value * @v: pointer to atomic_long_t * * Atomically updates @v to (@v | @i) with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_or() there. * * Return: Nothing. */ static __always_inline void atomic_long_or(long i, atomic_long_t *v) { instrument_atomic_read_write(v, sizeof(*v)); raw_atomic_long_or(i, v); } /** * atomic_long_fetch_or() - atomic bitwise OR with full ordering * @i: long value * @v: pointer to atomic_long_t * * Atomically updates @v to (@v | @i) with full ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_fetch_or() there. * * Return: The original value of @v. */ static __always_inline long atomic_long_fetch_or(long i, atomic_long_t *v) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_fetch_or(i, v); } /** * atomic_long_fetch_or_acquire() - atomic bitwise OR with acquire ordering * @i: long value * @v: pointer to atomic_long_t * * Atomically updates @v to (@v | @i) with acquire ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_fetch_or_acquire() there. * * Return: The original value of @v. */ static __always_inline long atomic_long_fetch_or_acquire(long i, atomic_long_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_fetch_or_acquire(i, v); } /** * atomic_long_fetch_or_release() - atomic bitwise OR with release ordering * @i: long value * @v: pointer to atomic_long_t * * Atomically updates @v to (@v | @i) with release ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_fetch_or_release() there. * * Return: The original value of @v. */ static __always_inline long atomic_long_fetch_or_release(long i, atomic_long_t *v) { kcsan_release(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_fetch_or_release(i, v); } /** * atomic_long_fetch_or_relaxed() - atomic bitwise OR with relaxed ordering * @i: long value * @v: pointer to atomic_long_t * * Atomically updates @v to (@v | @i) with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_fetch_or_relaxed() there. * * Return: The original value of @v. */ static __always_inline long atomic_long_fetch_or_relaxed(long i, atomic_long_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_fetch_or_relaxed(i, v); } /** * atomic_long_xor() - atomic bitwise XOR with relaxed ordering * @i: long value * @v: pointer to atomic_long_t * * Atomically updates @v to (@v ^ @i) with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_xor() there. * * Return: Nothing. */ static __always_inline void atomic_long_xor(long i, atomic_long_t *v) { instrument_atomic_read_write(v, sizeof(*v)); raw_atomic_long_xor(i, v); } /** * atomic_long_fetch_xor() - atomic bitwise XOR with full ordering * @i: long value * @v: pointer to atomic_long_t * * Atomically updates @v to (@v ^ @i) with full ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_fetch_xor() there. * * Return: The original value of @v. */ static __always_inline long atomic_long_fetch_xor(long i, atomic_long_t *v) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_fetch_xor(i, v); } /** * atomic_long_fetch_xor_acquire() - atomic bitwise XOR with acquire ordering * @i: long value * @v: pointer to atomic_long_t * * Atomically updates @v to (@v ^ @i) with acquire ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_fetch_xor_acquire() there. * * Return: The original value of @v. */ static __always_inline long atomic_long_fetch_xor_acquire(long i, atomic_long_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_fetch_xor_acquire(i, v); } /** * atomic_long_fetch_xor_release() - atomic bitwise XOR with release ordering * @i: long value * @v: pointer to atomic_long_t * * Atomically updates @v to (@v ^ @i) with release ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_fetch_xor_release() there. * * Return: The original value of @v. */ static __always_inline long atomic_long_fetch_xor_release(long i, atomic_long_t *v) { kcsan_release(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_fetch_xor_release(i, v); } /** * atomic_long_fetch_xor_relaxed() - atomic bitwise XOR with relaxed ordering * @i: long value * @v: pointer to atomic_long_t * * Atomically updates @v to (@v ^ @i) with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_fetch_xor_relaxed() there. * * Return: The original value of @v. */ static __always_inline long atomic_long_fetch_xor_relaxed(long i, atomic_long_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_fetch_xor_relaxed(i, v); } /** * atomic_long_xchg() - atomic exchange with full ordering * @v: pointer to atomic_long_t * @new: long value to assign * * Atomically updates @v to @new with full ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_xchg() there. * * Return: The original value of @v. */ static __always_inline long atomic_long_xchg(atomic_long_t *v, long new) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_xchg(v, new); } /** * atomic_long_xchg_acquire() - atomic exchange with acquire ordering * @v: pointer to atomic_long_t * @new: long value to assign * * Atomically updates @v to @new with acquire ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_xchg_acquire() there. * * Return: The original value of @v. */ static __always_inline long atomic_long_xchg_acquire(atomic_long_t *v, long new) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_xchg_acquire(v, new); } /** * atomic_long_xchg_release() - atomic exchange with release ordering * @v: pointer to atomic_long_t * @new: long value to assign * * Atomically updates @v to @new with release ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_xchg_release() there. * * Return: The original value of @v. */ static __always_inline long atomic_long_xchg_release(atomic_long_t *v, long new) { kcsan_release(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_xchg_release(v, new); } /** * atomic_long_xchg_relaxed() - atomic exchange with relaxed ordering * @v: pointer to atomic_long_t * @new: long value to assign * * Atomically updates @v to @new with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_xchg_relaxed() there. * * Return: The original value of @v. */ static __always_inline long atomic_long_xchg_relaxed(atomic_long_t *v, long new) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_xchg_relaxed(v, new); } /** * atomic_long_cmpxchg() - atomic compare and exchange with full ordering * @v: pointer to atomic_long_t * @old: long value to compare with * @new: long value to assign * * If (@v == @old), atomically updates @v to @new with full ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_cmpxchg() there. * * Return: The original value of @v. */ static __always_inline long atomic_long_cmpxchg(atomic_long_t *v, long old, long new) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_cmpxchg(v, old, new); } /** * atomic_long_cmpxchg_acquire() - atomic compare and exchange with acquire ordering * @v: pointer to atomic_long_t * @old: long value to compare with * @new: long value to assign * * If (@v == @old), atomically updates @v to @new with acquire ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_cmpxchg_acquire() there. * * Return: The original value of @v. */ static __always_inline long atomic_long_cmpxchg_acquire(atomic_long_t *v, long old, long new) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_cmpxchg_acquire(v, old, new); } /** * atomic_long_cmpxchg_release() - atomic compare and exchange with release ordering * @v: pointer to atomic_long_t * @old: long value to compare with * @new: long value to assign * * If (@v == @old), atomically updates @v to @new with release ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_cmpxchg_release() there. * * Return: The original value of @v. */ static __always_inline long atomic_long_cmpxchg_release(atomic_long_t *v, long old, long new) { kcsan_release(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_cmpxchg_release(v, old, new); } /** * atomic_long_cmpxchg_relaxed() - atomic compare and exchange with relaxed ordering * @v: pointer to atomic_long_t * @old: long value to compare with * @new: long value to assign * * If (@v == @old), atomically updates @v to @new with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_cmpxchg_relaxed() there. * * Return: The original value of @v. */ static __always_inline long atomic_long_cmpxchg_relaxed(atomic_long_t *v, long old, long new) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_cmpxchg_relaxed(v, old, new); } /** * atomic_long_try_cmpxchg() - atomic compare and exchange with full ordering * @v: pointer to atomic_long_t * @old: pointer to long value to compare with * @new: long value to assign * * If (@v == @old), atomically updates @v to @new with full ordering. * Otherwise, updates @old to the current value of @v. * * Unsafe to use in noinstr code; use raw_atomic_long_try_cmpxchg() there. * * Return: @true if the exchange occured, @false otherwise. */ static __always_inline bool atomic_long_try_cmpxchg(atomic_long_t *v, long *old, long new) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); instrument_atomic_read_write(old, sizeof(*old)); return raw_atomic_long_try_cmpxchg(v, old, new); } /** * atomic_long_try_cmpxchg_acquire() - atomic compare and exchange with acquire ordering * @v: pointer to atomic_long_t * @old: pointer to long value to compare with * @new: long value to assign * * If (@v == @old), atomically updates @v to @new with acquire ordering. * Otherwise, updates @old to the current value of @v. * * Unsafe to use in noinstr code; use raw_atomic_long_try_cmpxchg_acquire() there. * * Return: @true if the exchange occured, @false otherwise. */ static __always_inline bool atomic_long_try_cmpxchg_acquire(atomic_long_t *v, long *old, long new) { instrument_atomic_read_write(v, sizeof(*v)); instrument_atomic_read_write(old, sizeof(*old)); return raw_atomic_long_try_cmpxchg_acquire(v, old, new); } /** * atomic_long_try_cmpxchg_release() - atomic compare and exchange with release ordering * @v: pointer to atomic_long_t * @old: pointer to long value to compare with * @new: long value to assign * * If (@v == @old), atomically updates @v to @new with release ordering. * Otherwise, updates @old to the current value of @v. * * Unsafe to use in noinstr code; use raw_atomic_long_try_cmpxchg_release() there. * * Return: @true if the exchange occured, @false otherwise. */ static __always_inline bool atomic_long_try_cmpxchg_release(atomic_long_t *v, long *old, long new) { kcsan_release(); instrument_atomic_read_write(v, sizeof(*v)); instrument_atomic_read_write(old, sizeof(*old)); return raw_atomic_long_try_cmpxchg_release(v, old, new); } /** * atomic_long_try_cmpxchg_relaxed() - atomic compare and exchange with relaxed ordering * @v: pointer to atomic_long_t * @old: pointer to long value to compare with * @new: long value to assign * * If (@v == @old), atomically updates @v to @new with relaxed ordering. * Otherwise, updates @old to the current value of @v. * * Unsafe to use in noinstr code; use raw_atomic_long_try_cmpxchg_relaxed() there. * * Return: @true if the exchange occured, @false otherwise. */ static __always_inline bool atomic_long_try_cmpxchg_relaxed(atomic_long_t *v, long *old, long new) { instrument_atomic_read_write(v, sizeof(*v)); instrument_atomic_read_write(old, sizeof(*old)); return raw_atomic_long_try_cmpxchg_relaxed(v, old, new); } /** * atomic_long_sub_and_test() - atomic subtract and test if zero with full ordering * @i: long value to add * @v: pointer to atomic_long_t * * Atomically updates @v to (@v - @i) with full ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_sub_and_test() there. * * Return: @true if the resulting value of @v is zero, @false otherwise. */ static __always_inline bool atomic_long_sub_and_test(long i, atomic_long_t *v) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_sub_and_test(i, v); } /** * atomic_long_dec_and_test() - atomic decrement and test if zero with full ordering * @v: pointer to atomic_long_t * * Atomically updates @v to (@v - 1) with full ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_dec_and_test() there. * * Return: @true if the resulting value of @v is zero, @false otherwise. */ static __always_inline bool atomic_long_dec_and_test(atomic_long_t *v) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_dec_and_test(v); } /** * atomic_long_inc_and_test() - atomic increment and test if zero with full ordering * @v: pointer to atomic_long_t * * Atomically updates @v to (@v + 1) with full ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_inc_and_test() there. * * Return: @true if the resulting value of @v is zero, @false otherwise. */ static __always_inline bool atomic_long_inc_and_test(atomic_long_t *v) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_inc_and_test(v); } /** * atomic_long_add_negative() - atomic add and test if negative with full ordering * @i: long value to add * @v: pointer to atomic_long_t * * Atomically updates @v to (@v + @i) with full ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_add_negative() there. * * Return: @true if the resulting value of @v is negative, @false otherwise. */ static __always_inline bool atomic_long_add_negative(long i, atomic_long_t *v) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_add_negative(i, v); } /** * atomic_long_add_negative_acquire() - atomic add and test if negative with acquire ordering * @i: long value to add * @v: pointer to atomic_long_t * * Atomically updates @v to (@v + @i) with acquire ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_add_negative_acquire() there. * * Return: @true if the resulting value of @v is negative, @false otherwise. */ static __always_inline bool atomic_long_add_negative_acquire(long i, atomic_long_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_add_negative_acquire(i, v); } /** * atomic_long_add_negative_release() - atomic add and test if negative with release ordering * @i: long value to add * @v: pointer to atomic_long_t * * Atomically updates @v to (@v + @i) with release ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_add_negative_release() there. * * Return: @true if the resulting value of @v is negative, @false otherwise. */ static __always_inline bool atomic_long_add_negative_release(long i, atomic_long_t *v) { kcsan_release(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_add_negative_release(i, v); } /** * atomic_long_add_negative_relaxed() - atomic add and test if negative with relaxed ordering * @i: long value to add * @v: pointer to atomic_long_t * * Atomically updates @v to (@v + @i) with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_add_negative_relaxed() there. * * Return: @true if the resulting value of @v is negative, @false otherwise. */ static __always_inline bool atomic_long_add_negative_relaxed(long i, atomic_long_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_add_negative_relaxed(i, v); } /** * atomic_long_fetch_add_unless() - atomic add unless value with full ordering * @v: pointer to atomic_long_t * @a: long value to add * @u: long value to compare with * * If (@v != @u), atomically updates @v to (@v + @a) with full ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_fetch_add_unless() there. * * Return: The original value of @v. */ static __always_inline long atomic_long_fetch_add_unless(atomic_long_t *v, long a, long u) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_fetch_add_unless(v, a, u); } /** * atomic_long_add_unless() - atomic add unless value with full ordering * @v: pointer to atomic_long_t * @a: long value to add * @u: long value to compare with * * If (@v != @u), atomically updates @v to (@v + @a) with full ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_add_unless() there. * * Return: @true if @v was updated, @false otherwise. */ static __always_inline bool atomic_long_add_unless(atomic_long_t *v, long a, long u) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_add_unless(v, a, u); } /** * atomic_long_inc_not_zero() - atomic increment unless zero with full ordering * @v: pointer to atomic_long_t * * If (@v != 0), atomically updates @v to (@v + 1) with full ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_inc_not_zero() there. * * Return: @true if @v was updated, @false otherwise. */ static __always_inline bool atomic_long_inc_not_zero(atomic_long_t *v) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_inc_not_zero(v); } /** * atomic_long_inc_unless_negative() - atomic increment unless negative with full ordering * @v: pointer to atomic_long_t * * If (@v >= 0), atomically updates @v to (@v + 1) with full ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_inc_unless_negative() there. * * Return: @true if @v was updated, @false otherwise. */ static __always_inline bool atomic_long_inc_unless_negative(atomic_long_t *v) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_inc_unless_negative(v); } /** * atomic_long_dec_unless_positive() - atomic decrement unless positive with full ordering * @v: pointer to atomic_long_t * * If (@v <= 0), atomically updates @v to (@v - 1) with full ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_dec_unless_positive() there. * * Return: @true if @v was updated, @false otherwise. */ static __always_inline bool atomic_long_dec_unless_positive(atomic_long_t *v) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_dec_unless_positive(v); } /** * atomic_long_dec_if_positive() - atomic decrement if positive with full ordering * @v: pointer to atomic_long_t * * If (@v > 0), atomically updates @v to (@v - 1) with full ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_dec_if_positive() there. * * Return: The old value of (@v - 1), regardless of whether @v was updated. */ static __always_inline long atomic_long_dec_if_positive(atomic_long_t *v) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_dec_if_positive(v); } #define xchg(ptr, ...) \ ({ \ typeof(ptr) __ai_ptr = (ptr); \ kcsan_mb(); \ instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \ raw_xchg(__ai_ptr, __VA_ARGS__); \ }) #define xchg_acquire(ptr, ...) \ ({ \ typeof(ptr) __ai_ptr = (ptr); \ instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \ raw_xchg_acquire(__ai_ptr, __VA_ARGS__); \ }) #define xchg_release(ptr, ...) \ ({ \ typeof(ptr) __ai_ptr = (ptr); \ kcsan_release(); \ instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \ raw_xchg_release(__ai_ptr, __VA_ARGS__); \ }) #define xchg_relaxed(ptr, ...) \ ({ \ typeof(ptr) __ai_ptr = (ptr); \ instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \ raw_xchg_relaxed(__ai_ptr, __VA_ARGS__); \ }) #define cmpxchg(ptr, ...) \ ({ \ typeof(ptr) __ai_ptr = (ptr); \ kcsan_mb(); \ instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \ raw_cmpxchg(__ai_ptr, __VA_ARGS__); \ }) #define cmpxchg_acquire(ptr, ...) \ ({ \ typeof(ptr) __ai_ptr = (ptr); \ instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \ raw_cmpxchg_acquire(__ai_ptr, __VA_ARGS__); \ }) #define cmpxchg_release(ptr, ...) \ ({ \ typeof(ptr) __ai_ptr = (ptr); \ kcsan_release(); \ instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \ raw_cmpxchg_release(__ai_ptr, __VA_ARGS__); \ }) #define cmpxchg_relaxed(ptr, ...) \ ({ \ typeof(ptr) __ai_ptr = (ptr); \ instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \ raw_cmpxchg_relaxed(__ai_ptr, __VA_ARGS__); \ }) #define cmpxchg64(ptr, ...) \ ({ \ typeof(ptr) __ai_ptr = (ptr); \ kcsan_mb(); \ instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \ raw_cmpxchg64(__ai_ptr, __VA_ARGS__); \ }) #define cmpxchg64_acquire(ptr, ...) \ ({ \ typeof(ptr) __ai_ptr = (ptr); \ instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \ raw_cmpxchg64_acquire(__ai_ptr, __VA_ARGS__); \ }) #define cmpxchg64_release(ptr, ...) \ ({ \ typeof(ptr) __ai_ptr = (ptr); \ kcsan_release(); \ instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \ raw_cmpxchg64_release(__ai_ptr, __VA_ARGS__); \ }) #define cmpxchg64_relaxed(ptr, ...) \ ({ \ typeof(ptr) __ai_ptr = (ptr); \ instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \ raw_cmpxchg64_relaxed(__ai_ptr, __VA_ARGS__); \ }) #define cmpxchg128(ptr, ...) \ ({ \ typeof(ptr) __ai_ptr = (ptr); \ kcsan_mb(); \ instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \ raw_cmpxchg128(__ai_ptr, __VA_ARGS__); \ }) #define cmpxchg128_acquire(ptr, ...) \ ({ \ typeof(ptr) __ai_ptr = (ptr); \ instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \ raw_cmpxchg128_acquire(__ai_ptr, __VA_ARGS__); \ }) #define cmpxchg128_release(ptr, ...) \ ({ \ typeof(ptr) __ai_ptr = (ptr); \ kcsan_release(); \ instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \ raw_cmpxchg128_release(__ai_ptr, __VA_ARGS__); \ }) #define cmpxchg128_relaxed(ptr, ...) \ ({ \ typeof(ptr) __ai_ptr = (ptr); \ instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \ raw_cmpxchg128_relaxed(__ai_ptr, __VA_ARGS__); \ }) #define try_cmpxchg(ptr, oldp, ...) \ ({ \ typeof(ptr) __ai_ptr = (ptr); \ typeof(oldp) __ai_oldp = (oldp); \ kcsan_mb(); \ instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \ instrument_read_write(__ai_oldp, sizeof(*__ai_oldp)); \ raw_try_cmpxchg(__ai_ptr, __ai_oldp, __VA_ARGS__); \ }) #define try_cmpxchg_acquire(ptr, oldp, ...) \ ({ \ typeof(ptr) __ai_ptr = (ptr); \ typeof(oldp) __ai_oldp = (oldp); \ instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \ instrument_read_write(__ai_oldp, sizeof(*__ai_oldp)); \ raw_try_cmpxchg_acquire(__ai_ptr, __ai_oldp, __VA_ARGS__); \ }) #define try_cmpxchg_release(ptr, oldp, ...) \ ({ \ typeof(ptr) __ai_ptr = (ptr); \ typeof(oldp) __ai_oldp = (oldp); \ kcsan_release(); \ instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \ instrument_read_write(__ai_oldp, sizeof(*__ai_oldp)); \ raw_try_cmpxchg_release(__ai_ptr, __ai_oldp, __VA_ARGS__); \ }) #define try_cmpxchg_relaxed(ptr, oldp, ...) \ ({ \ typeof(ptr) __ai_ptr = (ptr); \ typeof(oldp) __ai_oldp = (oldp); \ instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \ instrument_read_write(__ai_oldp, sizeof(*__ai_oldp)); \ raw_try_cmpxchg_relaxed(__ai_ptr, __ai_oldp, __VA_ARGS__); \ }) #define try_cmpxchg64(ptr, oldp, ...) \ ({ \ typeof(ptr) __ai_ptr = (ptr); \ typeof(oldp) __ai_oldp = (oldp); \ kcsan_mb(); \ instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \ instrument_read_write(__ai_oldp, sizeof(*__ai_oldp)); \ raw_try_cmpxchg64(__ai_ptr, __ai_oldp, __VA_ARGS__); \ }) #define try_cmpxchg64_acquire(ptr, oldp, ...) \ ({ \ typeof(ptr) __ai_ptr = (ptr); \ typeof(oldp) __ai_oldp = (oldp); \ instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \ instrument_read_write(__ai_oldp, sizeof(*__ai_oldp)); \ raw_try_cmpxchg64_acquire(__ai_ptr, __ai_oldp, __VA_ARGS__); \ }) #define try_cmpxchg64_release(ptr, oldp, ...) \ ({ \ typeof(ptr) __ai_ptr = (ptr); \ typeof(oldp) __ai_oldp = (oldp); \ kcsan_release(); \ instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \ instrument_read_write(__ai_oldp, sizeof(*__ai_oldp)); \ raw_try_cmpxchg64_release(__ai_ptr, __ai_oldp, __VA_ARGS__); \ }) #define try_cmpxchg64_relaxed(ptr, oldp, ...) \ ({ \ typeof(ptr) __ai_ptr = (ptr); \ typeof(oldp) __ai_oldp = (oldp); \ instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \ instrument_read_write(__ai_oldp, sizeof(*__ai_oldp)); \ raw_try_cmpxchg64_relaxed(__ai_ptr, __ai_oldp, __VA_ARGS__); \ }) #define try_cmpxchg128(ptr, oldp, ...) \ ({ \ typeof(ptr) __ai_ptr = (ptr); \ typeof(oldp) __ai_oldp = (oldp); \ kcsan_mb(); \ instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \ instrument_read_write(__ai_oldp, sizeof(*__ai_oldp)); \ raw_try_cmpxchg128(__ai_ptr, __ai_oldp, __VA_ARGS__); \ }) #define try_cmpxchg128_acquire(ptr, oldp, ...) \ ({ \ typeof(ptr) __ai_ptr = (ptr); \ typeof(oldp) __ai_oldp = (oldp); \ instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \ instrument_read_write(__ai_oldp, sizeof(*__ai_oldp)); \ raw_try_cmpxchg128_acquire(__ai_ptr, __ai_oldp, __VA_ARGS__); \ }) #define try_cmpxchg128_release(ptr, oldp, ...) \ ({ \ typeof(ptr) __ai_ptr = (ptr); \ typeof(oldp) __ai_oldp = (oldp); \ kcsan_release(); \ instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \ instrument_read_write(__ai_oldp, sizeof(*__ai_oldp)); \ raw_try_cmpxchg128_release(__ai_ptr, __ai_oldp, __VA_ARGS__); \ }) #define try_cmpxchg128_relaxed(ptr, oldp, ...) \ ({ \ typeof(ptr) __ai_ptr = (ptr); \ typeof(oldp) __ai_oldp = (oldp); \ instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \ instrument_read_write(__ai_oldp, sizeof(*__ai_oldp)); \ raw_try_cmpxchg128_relaxed(__ai_ptr, __ai_oldp, __VA_ARGS__); \ }) #define cmpxchg_local(ptr, ...) \ ({ \ typeof(ptr) __ai_ptr = (ptr); \ instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \ raw_cmpxchg_local(__ai_ptr, __VA_ARGS__); \ }) #define cmpxchg64_local(ptr, ...) \ ({ \ typeof(ptr) __ai_ptr = (ptr); \ instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \ raw_cmpxchg64_local(__ai_ptr, __VA_ARGS__); \ }) #define cmpxchg128_local(ptr, ...) \ ({ \ typeof(ptr) __ai_ptr = (ptr); \ instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \ raw_cmpxchg128_local(__ai_ptr, __VA_ARGS__); \ }) #define sync_cmpxchg(ptr, ...) \ ({ \ typeof(ptr) __ai_ptr = (ptr); \ kcsan_mb(); \ instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \ raw_sync_cmpxchg(__ai_ptr, __VA_ARGS__); \ }) #define try_cmpxchg_local(ptr, oldp, ...) \ ({ \ typeof(ptr) __ai_ptr = (ptr); \ typeof(oldp) __ai_oldp = (oldp); \ instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \ instrument_read_write(__ai_oldp, sizeof(*__ai_oldp)); \ raw_try_cmpxchg_local(__ai_ptr, __ai_oldp, __VA_ARGS__); \ }) #define try_cmpxchg64_local(ptr, oldp, ...) \ ({ \ typeof(ptr) __ai_ptr = (ptr); \ typeof(oldp) __ai_oldp = (oldp); \ instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \ instrument_read_write(__ai_oldp, sizeof(*__ai_oldp)); \ raw_try_cmpxchg64_local(__ai_ptr, __ai_oldp, __VA_ARGS__); \ }) #define try_cmpxchg128_local(ptr, oldp, ...) \ ({ \ typeof(ptr) __ai_ptr = (ptr); \ typeof(oldp) __ai_oldp = (oldp); \ instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \ instrument_read_write(__ai_oldp, sizeof(*__ai_oldp)); \ raw_try_cmpxchg128_local(__ai_ptr, __ai_oldp, __VA_ARGS__); \ }) #define sync_try_cmpxchg(ptr, ...) \ ({ \ typeof(ptr) __ai_ptr = (ptr); \ kcsan_mb(); \ instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \ raw_sync_try_cmpxchg(__ai_ptr, __VA_ARGS__); \ }) #endif /* _LINUX_ATOMIC_INSTRUMENTED_H */ // 2cc4bc990fef44d3836ec108f11b610f3f438184 |
5 3 7 7 7 5 6 16 1 2 9 5 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 | // SPDX-License-Identifier: GPL-2.0 /* * linux/kernel/acct.c * * BSD Process Accounting for Linux * * Author: Marco van Wieringen <mvw@planets.elm.net> * * Some code based on ideas and code from: * Thomas K. Dyas <tdyas@eden.rutgers.edu> * * This file implements BSD-style process accounting. Whenever any * process exits, an accounting record of type "struct acct" is * written to the file specified with the acct() system call. It is * up to user-level programs to do useful things with the accounting * log. The kernel just provides the raw accounting information. * * (C) Copyright 1995 - 1997 Marco van Wieringen - ELM Consultancy B.V. * * Plugged two leaks. 1) It didn't return acct_file into the free_filps if * the file happened to be read-only. 2) If the accounting was suspended * due to the lack of space it happily allowed to reopen it and completely * lost the old acct_file. 3/10/98, Al Viro. * * Now we silently close acct_file on attempt to reopen. Cleaned sys_acct(). * XTerms and EMACS are manifestations of pure evil. 21/10/98, AV. * * Fixed a nasty interaction with sys_umount(). If the accounting * was suspeneded we failed to stop it on umount(). Messy. * Another one: remount to readonly didn't stop accounting. * Question: what should we do if we have CAP_SYS_ADMIN but not * CAP_SYS_PACCT? Current code does the following: umount returns -EBUSY * unless we are messing with the root. In that case we are getting a * real mess with do_remount_sb(). 9/11/98, AV. * * Fixed a bunch of races (and pair of leaks). Probably not the best way, * but this one obviously doesn't introduce deadlocks. Later. BTW, found * one race (and leak) in BSD implementation. * OK, that's better. ANOTHER race and leak in BSD variant. There always * is one more bug... 10/11/98, AV. * * Oh, fsck... Oopsable SMP race in do_process_acct() - we must hold * ->mmap_lock to walk the vma list of current->mm. Nasty, since it leaks * a struct file opened for write. Fixed. 2/6/2000, AV. */ #include <linux/mm.h> #include <linux/slab.h> #include <linux/acct.h> #include <linux/capability.h> #include <linux/file.h> #include <linux/tty.h> #include <linux/security.h> #include <linux/vfs.h> #include <linux/jiffies.h> #include <linux/times.h> #include <linux/syscalls.h> #include <linux/mount.h> #include <linux/uaccess.h> #include <linux/sched/cputime.h> #include <asm/div64.h> #include <linux/pid_namespace.h> #include <linux/fs_pin.h> /* * These constants control the amount of freespace that suspend and * resume the process accounting system, and the time delay between * each check. * Turned into sysctl-controllable parameters. AV, 12/11/98 */ static int acct_parm[3] = {4, 2, 30}; #define RESUME (acct_parm[0]) /* >foo% free space - resume */ #define SUSPEND (acct_parm[1]) /* <foo% free space - suspend */ #define ACCT_TIMEOUT (acct_parm[2]) /* foo second timeout between checks */ #ifdef CONFIG_SYSCTL static struct ctl_table kern_acct_table[] = { { .procname = "acct", .data = &acct_parm, .maxlen = 3*sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, { } }; static __init int kernel_acct_sysctls_init(void) { register_sysctl_init("kernel", kern_acct_table); return 0; } late_initcall(kernel_acct_sysctls_init); #endif /* CONFIG_SYSCTL */ /* * External references and all of the globals. */ struct bsd_acct_struct { struct fs_pin pin; atomic_long_t count; struct rcu_head rcu; struct mutex lock; int active; unsigned long needcheck; struct file *file; struct pid_namespace *ns; struct work_struct work; struct completion done; }; static void do_acct_process(struct bsd_acct_struct *acct); /* * Check the amount of free space and suspend/resume accordingly. */ static int check_free_space(struct bsd_acct_struct *acct) { struct kstatfs sbuf; if (time_is_after_jiffies(acct->needcheck)) goto out; /* May block */ if (vfs_statfs(&acct->file->f_path, &sbuf)) goto out; if (acct->active) { u64 suspend = sbuf.f_blocks * SUSPEND; do_div(suspend, 100); if (sbuf.f_bavail <= suspend) { acct->active = 0; pr_info("Process accounting paused\n"); } } else { u64 resume = sbuf.f_blocks * RESUME; do_div(resume, 100); if (sbuf.f_bavail >= resume) { acct->active = 1; pr_info("Process accounting resumed\n"); } } acct->needcheck = jiffies + ACCT_TIMEOUT*HZ; out: return acct->active; } static void acct_put(struct bsd_acct_struct *p) { if (atomic_long_dec_and_test(&p->count)) kfree_rcu(p, rcu); } static inline struct bsd_acct_struct *to_acct(struct fs_pin *p) { return p ? container_of(p, struct bsd_acct_struct, pin) : NULL; } static struct bsd_acct_struct *acct_get(struct pid_namespace *ns) { struct bsd_acct_struct *res; again: smp_rmb(); rcu_read_lock(); res = to_acct(READ_ONCE(ns->bacct)); if (!res) { rcu_read_unlock(); return NULL; } if (!atomic_long_inc_not_zero(&res->count)) { rcu_read_unlock(); cpu_relax(); goto again; } rcu_read_unlock(); mutex_lock(&res->lock); if (res != to_acct(READ_ONCE(ns->bacct))) { mutex_unlock(&res->lock); acct_put(res); goto again; } return res; } static void acct_pin_kill(struct fs_pin *pin) { struct bsd_acct_struct *acct = to_acct(pin); mutex_lock(&acct->lock); do_acct_process(acct); schedule_work(&acct->work); wait_for_completion(&acct->done); cmpxchg(&acct->ns->bacct, pin, NULL); mutex_unlock(&acct->lock); pin_remove(pin); acct_put(acct); } static void close_work(struct work_struct *work) { struct bsd_acct_struct *acct = container_of(work, struct bsd_acct_struct, work); struct file *file = acct->file; if (file->f_op->flush) file->f_op->flush(file, NULL); __fput_sync(file); complete(&acct->done); } static int acct_on(struct filename *pathname) { struct file *file; struct vfsmount *mnt, *internal; struct pid_namespace *ns = task_active_pid_ns(current); struct bsd_acct_struct *acct; struct fs_pin *old; int err; acct = kzalloc(sizeof(struct bsd_acct_struct), GFP_KERNEL); if (!acct) return -ENOMEM; /* Difference from BSD - they don't do O_APPEND */ file = file_open_name(pathname, O_WRONLY|O_APPEND|O_LARGEFILE, 0); if (IS_ERR(file)) { kfree(acct); return PTR_ERR(file); } if (!S_ISREG(file_inode(file)->i_mode)) { kfree(acct); filp_close(file, NULL); return -EACCES; } if (!(file->f_mode & FMODE_CAN_WRITE)) { kfree(acct); filp_close(file, NULL); return -EIO; } internal = mnt_clone_internal(&file->f_path); if (IS_ERR(internal)) { kfree(acct); filp_close(file, NULL); return PTR_ERR(internal); } err = mnt_get_write_access(internal); if (err) { mntput(internal); kfree(acct); filp_close(file, NULL); return err; } mnt = file->f_path.mnt; file->f_path.mnt = internal; atomic_long_set(&acct->count, 1); init_fs_pin(&acct->pin, acct_pin_kill); acct->file = file; acct->needcheck = jiffies; acct->ns = ns; mutex_init(&acct->lock); INIT_WORK(&acct->work, close_work); init_completion(&acct->done); mutex_lock_nested(&acct->lock, 1); /* nobody has seen it yet */ pin_insert(&acct->pin, mnt); rcu_read_lock(); old = xchg(&ns->bacct, &acct->pin); mutex_unlock(&acct->lock); pin_kill(old); mnt_put_write_access(mnt); mntput(mnt); return 0; } static DEFINE_MUTEX(acct_on_mutex); /** * sys_acct - enable/disable process accounting * @name: file name for accounting records or NULL to shutdown accounting * * sys_acct() is the only system call needed to implement process * accounting. It takes the name of the file where accounting records * should be written. If the filename is NULL, accounting will be * shutdown. * * Returns: 0 for success or negative errno values for failure. */ SYSCALL_DEFINE1(acct, const char __user *, name) { int error = 0; if (!capable(CAP_SYS_PACCT)) return -EPERM; if (name) { struct filename *tmp = getname(name); if (IS_ERR(tmp)) return PTR_ERR(tmp); mutex_lock(&acct_on_mutex); error = acct_on(tmp); mutex_unlock(&acct_on_mutex); putname(tmp); } else { rcu_read_lock(); pin_kill(task_active_pid_ns(current)->bacct); } return error; } void acct_exit_ns(struct pid_namespace *ns) { rcu_read_lock(); pin_kill(ns->bacct); } /* * encode an u64 into a comp_t * * This routine has been adopted from the encode_comp_t() function in * the kern_acct.c file of the FreeBSD operating system. The encoding * is a 13-bit fraction with a 3-bit (base 8) exponent. */ #define MANTSIZE 13 /* 13 bit mantissa. */ #define EXPSIZE 3 /* Base 8 (3 bit) exponent. */ #define MAXFRACT ((1 << MANTSIZE) - 1) /* Maximum fractional value. */ static comp_t encode_comp_t(u64 value) { int exp, rnd; exp = rnd = 0; while (value > MAXFRACT) { rnd = value & (1 << (EXPSIZE - 1)); /* Round up? */ value >>= EXPSIZE; /* Base 8 exponent == 3 bit shift. */ exp++; } /* * If we need to round up, do it (and handle overflow correctly). */ if (rnd && (++value > MAXFRACT)) { value >>= EXPSIZE; exp++; } if (exp > (((comp_t) ~0U) >> MANTSIZE)) return (comp_t) ~0U; /* * Clean it up and polish it off. */ exp <<= MANTSIZE; /* Shift the exponent into place */ exp += value; /* and add on the mantissa. */ return exp; } #if ACCT_VERSION == 1 || ACCT_VERSION == 2 /* * encode an u64 into a comp2_t (24 bits) * * Format: 5 bit base 2 exponent, 20 bits mantissa. * The leading bit of the mantissa is not stored, but implied for * non-zero exponents. * Largest encodable value is 50 bits. */ #define MANTSIZE2 20 /* 20 bit mantissa. */ #define EXPSIZE2 5 /* 5 bit base 2 exponent. */ #define MAXFRACT2 ((1ul << MANTSIZE2) - 1) /* Maximum fractional value. */ #define MAXEXP2 ((1 << EXPSIZE2) - 1) /* Maximum exponent. */ static comp2_t encode_comp2_t(u64 value) { int exp, rnd; exp = (value > (MAXFRACT2>>1)); rnd = 0; while (value > MAXFRACT2) { rnd = value & 1; value >>= 1; exp++; } /* * If we need to round up, do it (and handle overflow correctly). */ if (rnd && (++value > MAXFRACT2)) { value >>= 1; exp++; } if (exp > MAXEXP2) { /* Overflow. Return largest representable number instead. */ return (1ul << (MANTSIZE2+EXPSIZE2-1)) - 1; } else { return (value & (MAXFRACT2>>1)) | (exp << (MANTSIZE2-1)); } } #elif ACCT_VERSION == 3 /* * encode an u64 into a 32 bit IEEE float */ static u32 encode_float(u64 value) { unsigned exp = 190; unsigned u; if (value == 0) return 0; while ((s64)value > 0) { value <<= 1; exp--; } u = (u32)(value >> 40) & 0x7fffffu; return u | (exp << 23); } #endif /* * Write an accounting entry for an exiting process * * The acct_process() call is the workhorse of the process * accounting system. The struct acct is built here and then written * into the accounting file. This function should only be called from * do_exit() or when switching to a different output file. */ static void fill_ac(acct_t *ac) { struct pacct_struct *pacct = ¤t->signal->pacct; u64 elapsed, run_time; time64_t btime; struct tty_struct *tty; /* * Fill the accounting struct with the needed info as recorded * by the different kernel functions. */ memset(ac, 0, sizeof(acct_t)); ac->ac_version = ACCT_VERSION | ACCT_BYTEORDER; strscpy(ac->ac_comm, current->comm, sizeof(ac->ac_comm)); /* calculate run_time in nsec*/ run_time = ktime_get_ns(); run_time -= current->group_leader->start_time; /* convert nsec -> AHZ */ elapsed = nsec_to_AHZ(run_time); #if ACCT_VERSION == 3 ac->ac_etime = encode_float(elapsed); #else ac->ac_etime = encode_comp_t(elapsed < (unsigned long) -1l ? (unsigned long) elapsed : (unsigned long) -1l); #endif #if ACCT_VERSION == 1 || ACCT_VERSION == 2 { /* new enlarged etime field */ comp2_t etime = encode_comp2_t(elapsed); ac->ac_etime_hi = etime >> 16; ac->ac_etime_lo = (u16) etime; } #endif do_div(elapsed, AHZ); btime = ktime_get_real_seconds() - elapsed; ac->ac_btime = clamp_t(time64_t, btime, 0, U32_MAX); #if ACCT_VERSION == 2 ac->ac_ahz = AHZ; #endif spin_lock_irq(¤t->sighand->siglock); tty = current->signal->tty; /* Safe as we hold the siglock */ ac->ac_tty = tty ? old_encode_dev(tty_devnum(tty)) : 0; ac->ac_utime = encode_comp_t(nsec_to_AHZ(pacct->ac_utime)); ac->ac_stime = encode_comp_t(nsec_to_AHZ(pacct->ac_stime)); ac->ac_flag = pacct->ac_flag; ac->ac_mem = encode_comp_t(pacct->ac_mem); ac->ac_minflt = encode_comp_t(pacct->ac_minflt); ac->ac_majflt = encode_comp_t(pacct->ac_majflt); ac->ac_exitcode = pacct->ac_exitcode; spin_unlock_irq(¤t->sighand->siglock); } /* * do_acct_process does all actual work. Caller holds the reference to file. */ static void do_acct_process(struct bsd_acct_struct *acct) { acct_t ac; unsigned long flim; const struct cred *orig_cred; struct file *file = acct->file; /* * Accounting records are not subject to resource limits. */ flim = rlimit(RLIMIT_FSIZE); current->signal->rlim[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY; /* Perform file operations on behalf of whoever enabled accounting */ orig_cred = override_creds(file->f_cred); /* * First check to see if there is enough free_space to continue * the process accounting system. */ if (!check_free_space(acct)) goto out; fill_ac(&ac); /* we really need to bite the bullet and change layout */ ac.ac_uid = from_kuid_munged(file->f_cred->user_ns, orig_cred->uid); ac.ac_gid = from_kgid_munged(file->f_cred->user_ns, orig_cred->gid); #if ACCT_VERSION == 1 || ACCT_VERSION == 2 /* backward-compatible 16 bit fields */ ac.ac_uid16 = ac.ac_uid; ac.ac_gid16 = ac.ac_gid; #elif ACCT_VERSION == 3 { struct pid_namespace *ns = acct->ns; ac.ac_pid = task_tgid_nr_ns(current, ns); rcu_read_lock(); ac.ac_ppid = task_tgid_nr_ns(rcu_dereference(current->real_parent), ns); rcu_read_unlock(); } #endif /* * Get freeze protection. If the fs is frozen, just skip the write * as we could deadlock the system otherwise. */ if (file_start_write_trylock(file)) { /* it's been opened O_APPEND, so position is irrelevant */ loff_t pos = 0; __kernel_write(file, &ac, sizeof(acct_t), &pos); file_end_write(file); } out: current->signal->rlim[RLIMIT_FSIZE].rlim_cur = flim; revert_creds(orig_cred); } /** * acct_collect - collect accounting information into pacct_struct * @exitcode: task exit code * @group_dead: not 0, if this thread is the last one in the process. */ void acct_collect(long exitcode, int group_dead) { struct pacct_struct *pacct = ¤t->signal->pacct; u64 utime, stime; unsigned long vsize = 0; if (group_dead && current->mm) { struct mm_struct *mm = current->mm; VMA_ITERATOR(vmi, mm, 0); struct vm_area_struct *vma; mmap_read_lock(mm); for_each_vma(vmi, vma) vsize += vma->vm_end - vma->vm_start; mmap_read_unlock(mm); } spin_lock_irq(¤t->sighand->siglock); if (group_dead) pacct->ac_mem = vsize / 1024; if (thread_group_leader(current)) { pacct->ac_exitcode = exitcode; if (current->flags & PF_FORKNOEXEC) pacct->ac_flag |= AFORK; } if (current->flags & PF_SUPERPRIV) pacct->ac_flag |= ASU; if (current->flags & PF_DUMPCORE) pacct->ac_flag |= ACORE; if (current->flags & PF_SIGNALED) pacct->ac_flag |= AXSIG; task_cputime(current, &utime, &stime); pacct->ac_utime += utime; pacct->ac_stime += stime; pacct->ac_minflt += current->min_flt; pacct->ac_majflt += current->maj_flt; spin_unlock_irq(¤t->sighand->siglock); } static void slow_acct_process(struct pid_namespace *ns) { for ( ; ns; ns = ns->parent) { struct bsd_acct_struct *acct = acct_get(ns); if (acct) { do_acct_process(acct); mutex_unlock(&acct->lock); acct_put(acct); } } } /** * acct_process - handles process accounting for an exiting task */ void acct_process(void) { struct pid_namespace *ns; /* * This loop is safe lockless, since current is still * alive and holds its namespace, which in turn holds * its parent. */ for (ns = task_active_pid_ns(current); ns != NULL; ns = ns->parent) { if (ns->bacct) break; } if (unlikely(ns)) slow_acct_process(ns); } |
1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 | // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. */ #include <linux/module.h> #include <linux/slab.h> #include <linux/spinlock.h> #include <linux/completion.h> #include <linux/buffer_head.h> #include <linux/gfs2_ondisk.h> #include <linux/crc32.h> #include <linux/crc32c.h> #include <linux/ktime.h> #include "gfs2.h" #include "incore.h" #include "bmap.h" #include "glock.h" #include "glops.h" #include "log.h" #include "lops.h" #include "meta_io.h" #include "recovery.h" #include "super.h" #include "util.h" #include "dir.h" struct workqueue_struct *gfs2_recovery_wq; int gfs2_replay_read_block(struct gfs2_jdesc *jd, unsigned int blk, struct buffer_head **bh) { struct gfs2_inode *ip = GFS2_I(jd->jd_inode); struct gfs2_glock *gl = ip->i_gl; u64 dblock; u32 extlen; int error; extlen = 32; error = gfs2_get_extent(&ip->i_inode, blk, &dblock, &extlen); if (error) return error; if (!dblock) { gfs2_consist_inode(ip); return -EIO; } *bh = gfs2_meta_ra(gl, dblock, extlen); return error; } int gfs2_revoke_add(struct gfs2_jdesc *jd, u64 blkno, unsigned int where) { struct list_head *head = &jd->jd_revoke_list; struct gfs2_revoke_replay *rr = NULL, *iter; list_for_each_entry(iter, head, rr_list) { if (iter->rr_blkno == blkno) { rr = iter; break; } } if (rr) { rr->rr_where = where; return 0; } rr = kmalloc(sizeof(struct gfs2_revoke_replay), GFP_NOFS); if (!rr) return -ENOMEM; rr->rr_blkno = blkno; rr->rr_where = where; list_add(&rr->rr_list, head); return 1; } int gfs2_revoke_check(struct gfs2_jdesc *jd, u64 blkno, unsigned int where) { struct gfs2_revoke_replay *rr = NULL, *iter; int wrap, a, b, revoke; list_for_each_entry(iter, &jd->jd_revoke_list, rr_list) { if (iter->rr_blkno == blkno) { rr = iter; break; } } if (!rr) return 0; wrap = (rr->rr_where < jd->jd_replay_tail); a = (jd->jd_replay_tail < where); b = (where < rr->rr_where); revoke = (wrap) ? (a || b) : (a && b); return revoke; } void gfs2_revoke_clean(struct gfs2_jdesc *jd) { struct list_head *head = &jd->jd_revoke_list; struct gfs2_revoke_replay *rr; while (!list_empty(head)) { rr = list_first_entry(head, struct gfs2_revoke_replay, rr_list); list_del(&rr->rr_list); kfree(rr); } } int __get_log_header(struct gfs2_sbd *sdp, const struct gfs2_log_header *lh, unsigned int blkno, struct gfs2_log_header_host *head) { u32 hash, crc; if (lh->lh_header.mh_magic != cpu_to_be32(GFS2_MAGIC) || lh->lh_header.mh_type != cpu_to_be32(GFS2_METATYPE_LH) || (blkno && be32_to_cpu(lh->lh_blkno) != blkno)) return 1; hash = crc32(~0, lh, LH_V1_SIZE - 4); hash = ~crc32_le_shift(hash, 4); /* assume lh_hash is zero */ if (be32_to_cpu(lh->lh_hash) != hash) return 1; crc = crc32c(~0, (void *)lh + LH_V1_SIZE + 4, sdp->sd_sb.sb_bsize - LH_V1_SIZE - 4); if ((lh->lh_crc != 0 && be32_to_cpu(lh->lh_crc) != crc)) return 1; head->lh_sequence = be64_to_cpu(lh->lh_sequence); head->lh_flags = be32_to_cpu(lh->lh_flags); head->lh_tail = be32_to_cpu(lh->lh_tail); head->lh_blkno = be32_to_cpu(lh->lh_blkno); head->lh_local_total = be64_to_cpu(lh->lh_local_total); head->lh_local_free = be64_to_cpu(lh->lh_local_free); head->lh_local_dinodes = be64_to_cpu(lh->lh_local_dinodes); return 0; } /** * get_log_header - read the log header for a given segment * @jd: the journal * @blk: the block to look at * @head: the log header to return * * Read the log header for a given segement in a given journal. Do a few * sanity checks on it. * * Returns: 0 on success, * 1 if the header was invalid or incomplete, * errno on error */ static int get_log_header(struct gfs2_jdesc *jd, unsigned int blk, struct gfs2_log_header_host *head) { struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode); struct buffer_head *bh; int error; error = gfs2_replay_read_block(jd, blk, &bh); if (error) return error; error = __get_log_header(sdp, (const struct gfs2_log_header *)bh->b_data, blk, head); brelse(bh); return error; } /** * foreach_descriptor - go through the active part of the log * @jd: the journal * @start: the first log header in the active region * @end: the last log header (don't process the contents of this entry)) * @pass: iteration number (foreach_descriptor() is called in a for() loop) * * Call a given function once for every log descriptor in the active * portion of the log. * * Returns: errno */ static int foreach_descriptor(struct gfs2_jdesc *jd, u32 start, unsigned int end, int pass) { struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode); struct buffer_head *bh; struct gfs2_log_descriptor *ld; int error = 0; u32 length; __be64 *ptr; unsigned int offset = sizeof(struct gfs2_log_descriptor); offset += sizeof(__be64) - 1; offset &= ~(sizeof(__be64) - 1); while (start != end) { error = gfs2_replay_read_block(jd, start, &bh); if (error) return error; if (gfs2_meta_check(sdp, bh)) { brelse(bh); return -EIO; } ld = (struct gfs2_log_descriptor *)bh->b_data; length = be32_to_cpu(ld->ld_length); if (be32_to_cpu(ld->ld_header.mh_type) == GFS2_METATYPE_LH) { struct gfs2_log_header_host lh; error = get_log_header(jd, start, &lh); if (!error) { gfs2_replay_incr_blk(jd, &start); brelse(bh); continue; } if (error == 1) { gfs2_consist_inode(GFS2_I(jd->jd_inode)); error = -EIO; } brelse(bh); return error; } else if (gfs2_metatype_check(sdp, bh, GFS2_METATYPE_LD)) { brelse(bh); return -EIO; } ptr = (__be64 *)(bh->b_data + offset); error = lops_scan_elements(jd, start, ld, ptr, pass); if (error) { brelse(bh); return error; } while (length--) gfs2_replay_incr_blk(jd, &start); brelse(bh); } return 0; } /** * clean_journal - mark a dirty journal as being clean * @jd: the journal * @head: the head journal to start from * * Returns: errno */ static void clean_journal(struct gfs2_jdesc *jd, struct gfs2_log_header_host *head) { struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode); u32 lblock = head->lh_blkno; gfs2_replay_incr_blk(jd, &lblock); gfs2_write_log_header(sdp, jd, head->lh_sequence + 1, 0, lblock, GFS2_LOG_HEAD_UNMOUNT | GFS2_LOG_HEAD_RECOVERY, REQ_PREFLUSH | REQ_FUA | REQ_META | REQ_SYNC); if (jd->jd_jid == sdp->sd_lockstruct.ls_jid) { sdp->sd_log_flush_head = lblock; gfs2_log_incr_head(sdp); } } static void gfs2_recovery_done(struct gfs2_sbd *sdp, unsigned int jid, unsigned int message) { char env_jid[20]; char env_status[20]; char *envp[] = { env_jid, env_status, NULL }; struct lm_lockstruct *ls = &sdp->sd_lockstruct; ls->ls_recover_jid_done = jid; ls->ls_recover_jid_status = message; sprintf(env_jid, "JID=%u", jid); sprintf(env_status, "RECOVERY=%s", message == LM_RD_SUCCESS ? "Done" : "Failed"); kobject_uevent_env(&sdp->sd_kobj, KOBJ_CHANGE, envp); if (sdp->sd_lockstruct.ls_ops->lm_recovery_result) sdp->sd_lockstruct.ls_ops->lm_recovery_result(sdp, jid, message); } /** * update_statfs_inode - Update the master statfs inode or zero out the local * statfs inode for a given journal. * @jd: The journal * @head: If NULL, @inode is the local statfs inode and we need to zero it out. * Otherwise, it @head contains the statfs change info that needs to be * synced to the master statfs inode (pointed to by @inode). * @inode: statfs inode to update. */ static int update_statfs_inode(struct gfs2_jdesc *jd, struct gfs2_log_header_host *head, struct inode *inode) { struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode); struct gfs2_inode *ip; struct buffer_head *bh; struct gfs2_statfs_change_host sc; int error = 0; BUG_ON(!inode); ip = GFS2_I(inode); error = gfs2_meta_inode_buffer(ip, &bh); if (error) goto out; spin_lock(&sdp->sd_statfs_spin); if (head) { /* Update the master statfs inode */ gfs2_statfs_change_in(&sc, bh->b_data + sizeof(struct gfs2_dinode)); sc.sc_total += head->lh_local_total; sc.sc_free += head->lh_local_free; sc.sc_dinodes += head->lh_local_dinodes; gfs2_statfs_change_out(&sc, bh->b_data + sizeof(struct gfs2_dinode)); fs_info(sdp, "jid=%u: Updated master statfs Total:%lld, " "Free:%lld, Dinodes:%lld after change " "[%+lld,%+lld,%+lld]\n", jd->jd_jid, sc.sc_total, sc.sc_free, sc.sc_dinodes, head->lh_local_total, head->lh_local_free, head->lh_local_dinodes); } else { /* Zero out the local statfs inode */ memset(bh->b_data + sizeof(struct gfs2_dinode), 0, sizeof(struct gfs2_statfs_change)); /* If it's our own journal, reset any in-memory changes too */ if (jd->jd_jid == sdp->sd_lockstruct.ls_jid) { memset(&sdp->sd_statfs_local, 0, sizeof(struct gfs2_statfs_change_host)); } } spin_unlock(&sdp->sd_statfs_spin); mark_buffer_dirty(bh); brelse(bh); gfs2_inode_metasync(ip->i_gl); out: return error; } /** * recover_local_statfs - Update the master and local statfs changes for this * journal. * * Previously, statfs updates would be read in from the local statfs inode and * synced to the master statfs inode during recovery. * * We now use the statfs updates in the journal head to update the master statfs * inode instead of reading in from the local statfs inode. To preserve backward * compatibility with kernels that can't do this, we still need to keep the * local statfs inode up to date by writing changes to it. At some point in the * future, we can do away with the local statfs inodes altogether and keep the * statfs changes solely in the journal. * * @jd: the journal * @head: the journal head * * Returns: errno */ static void recover_local_statfs(struct gfs2_jdesc *jd, struct gfs2_log_header_host *head) { int error; struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode); if (!head->lh_local_total && !head->lh_local_free && !head->lh_local_dinodes) /* No change */ goto zero_local; /* First update the master statfs inode with the changes we * found in the journal. */ error = update_statfs_inode(jd, head, sdp->sd_statfs_inode); if (error) goto out; zero_local: /* Zero out the local statfs inode so any changes in there * are not re-recovered. */ error = update_statfs_inode(jd, NULL, find_local_statfs_inode(sdp, jd->jd_jid)); out: return; } void gfs2_recover_func(struct work_struct *work) { struct gfs2_jdesc *jd = container_of(work, struct gfs2_jdesc, jd_work); struct gfs2_inode *ip = GFS2_I(jd->jd_inode); struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode); struct gfs2_log_header_host head; struct gfs2_holder j_gh, ji_gh; ktime_t t_start, t_jlck, t_jhd, t_tlck, t_rep; int ro = 0; unsigned int pass; int error = 0; int jlocked = 0; if (gfs2_withdrawing_or_withdrawn(sdp)) { fs_err(sdp, "jid=%u: Recovery not attempted due to withdraw.\n", jd->jd_jid); goto fail; } t_start = ktime_get(); if (sdp->sd_args.ar_spectator) goto fail; if (jd->jd_jid != sdp->sd_lockstruct.ls_jid) { fs_info(sdp, "jid=%u: Trying to acquire journal glock...\n", jd->jd_jid); jlocked = 1; /* Acquire the journal glock so we can do recovery */ error = gfs2_glock_nq_num(sdp, jd->jd_jid, &gfs2_journal_glops, LM_ST_EXCLUSIVE, LM_FLAG_NOEXP | LM_FLAG_TRY | GL_NOCACHE, &j_gh); switch (error) { case 0: break; case GLR_TRYFAILED: fs_info(sdp, "jid=%u: Busy\n", jd->jd_jid); error = 0; goto fail; default: goto fail; } error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_NOEXP | GL_NOCACHE, &ji_gh); if (error) goto fail_gunlock_j; } else { fs_info(sdp, "jid=%u, already locked for use\n", jd->jd_jid); } t_jlck = ktime_get(); fs_info(sdp, "jid=%u: Looking at journal...\n", jd->jd_jid); error = gfs2_jdesc_check(jd); if (error) goto fail_gunlock_ji; error = gfs2_find_jhead(jd, &head, true); if (error) goto fail_gunlock_ji; t_jhd = ktime_get(); fs_info(sdp, "jid=%u: Journal head lookup took %lldms\n", jd->jd_jid, ktime_ms_delta(t_jhd, t_jlck)); if (!(head.lh_flags & GFS2_LOG_HEAD_UNMOUNT)) { mutex_lock(&sdp->sd_freeze_mutex); if (test_bit(SDF_FROZEN, &sdp->sd_flags)) { mutex_unlock(&sdp->sd_freeze_mutex); fs_warn(sdp, "jid=%u: Can't replay: filesystem " "is frozen\n", jd->jd_jid); goto fail_gunlock_ji; } if (test_bit(SDF_RORECOVERY, &sdp->sd_flags)) { ro = 1; } else if (test_bit(SDF_JOURNAL_CHECKED, &sdp->sd_flags)) { if (!test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags)) ro = 1; } else { if (sb_rdonly(sdp->sd_vfs)) { /* check if device itself is read-only */ ro = bdev_read_only(sdp->sd_vfs->s_bdev); if (!ro) { fs_info(sdp, "recovery required on " "read-only filesystem.\n"); fs_info(sdp, "write access will be " "enabled during recovery.\n"); } } } if (ro) { fs_warn(sdp, "jid=%u: Can't replay: read-only block " "device\n", jd->jd_jid); error = -EROFS; goto fail_gunlock_nofreeze; } t_tlck = ktime_get(); fs_info(sdp, "jid=%u: Replaying journal...0x%x to 0x%x\n", jd->jd_jid, head.lh_tail, head.lh_blkno); /* We take the sd_log_flush_lock here primarily to prevent log * flushes and simultaneous journal replays from stomping on * each other wrt jd_log_bio. */ down_read(&sdp->sd_log_flush_lock); for (pass = 0; pass < 2; pass++) { lops_before_scan(jd, &head, pass); error = foreach_descriptor(jd, head.lh_tail, head.lh_blkno, pass); lops_after_scan(jd, error, pass); if (error) { up_read(&sdp->sd_log_flush_lock); goto fail_gunlock_nofreeze; } } recover_local_statfs(jd, &head); clean_journal(jd, &head); up_read(&sdp->sd_log_flush_lock); mutex_unlock(&sdp->sd_freeze_mutex); t_rep = ktime_get(); fs_info(sdp, "jid=%u: Journal replayed in %lldms [jlck:%lldms, " "jhead:%lldms, tlck:%lldms, replay:%lldms]\n", jd->jd_jid, ktime_ms_delta(t_rep, t_start), ktime_ms_delta(t_jlck, t_start), ktime_ms_delta(t_jhd, t_jlck), ktime_ms_delta(t_tlck, t_jhd), ktime_ms_delta(t_rep, t_tlck)); } gfs2_recovery_done(sdp, jd->jd_jid, LM_RD_SUCCESS); if (jlocked) { gfs2_glock_dq_uninit(&ji_gh); gfs2_glock_dq_uninit(&j_gh); } fs_info(sdp, "jid=%u: Done\n", jd->jd_jid); goto done; fail_gunlock_nofreeze: mutex_unlock(&sdp->sd_freeze_mutex); fail_gunlock_ji: if (jlocked) { gfs2_glock_dq_uninit(&ji_gh); fail_gunlock_j: gfs2_glock_dq_uninit(&j_gh); } fs_info(sdp, "jid=%u: %s\n", jd->jd_jid, (error) ? "Failed" : "Done"); fail: jd->jd_recover_error = error; gfs2_recovery_done(sdp, jd->jd_jid, LM_RD_GAVEUP); done: clear_bit(JDF_RECOVERY, &jd->jd_flags); smp_mb__after_atomic(); wake_up_bit(&jd->jd_flags, JDF_RECOVERY); } int gfs2_recover_journal(struct gfs2_jdesc *jd, bool wait) { int rv; if (test_and_set_bit(JDF_RECOVERY, &jd->jd_flags)) return -EBUSY; /* we have JDF_RECOVERY, queue should always succeed */ rv = queue_work(gfs2_recovery_wq, &jd->jd_work); BUG_ON(!rv); if (wait) wait_on_bit(&jd->jd_flags, JDF_RECOVERY, TASK_UNINTERRUPTIBLE); return wait ? jd->jd_recover_error : 0; } |
33 33 33 1 7 33 25 5 4 48 6 40 16 4 11 26 15 40 12 28 11 16 22 5 6 32 4 4 2 2 4 13 13 25 3 4 19 23 6 2 6 14 2 2 12 2 1 2 7 9 3 32 3 29 8 6 17 22 1 49 2 1 7 40 1 40 12 28 36 1 3 40 40 40 21 5 10 2 4 16 16 52 2 12 39 21 5 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 | // SPDX-License-Identifier: GPL-2.0 /* * fs/timerfd.c * * Copyright (C) 2007 Davide Libenzi <davidel@xmailserver.org> * * * Thanks to Thomas Gleixner for code reviews and useful comments. * */ #include <linux/alarmtimer.h> #include <linux/file.h> #include <linux/poll.h> #include <linux/init.h> #include <linux/fs.h> #include <linux/sched.h> #include <linux/kernel.h> #include <linux/slab.h> #include <linux/list.h> #include <linux/spinlock.h> #include <linux/time.h> #include <linux/hrtimer.h> #include <linux/anon_inodes.h> #include <linux/timerfd.h> #include <linux/syscalls.h> #include <linux/compat.h> #include <linux/rcupdate.h> #include <linux/time_namespace.h> struct timerfd_ctx { union { struct hrtimer tmr; struct alarm alarm; } t; ktime_t tintv; ktime_t moffs; wait_queue_head_t wqh; u64 ticks; int clockid; short unsigned expired; short unsigned settime_flags; /* to show in fdinfo */ struct rcu_head rcu; struct list_head clist; spinlock_t cancel_lock; bool might_cancel; }; static LIST_HEAD(cancel_list); static DEFINE_SPINLOCK(cancel_lock); static inline bool isalarm(struct timerfd_ctx *ctx) { return ctx->clockid == CLOCK_REALTIME_ALARM || ctx->clockid == CLOCK_BOOTTIME_ALARM; } /* * This gets called when the timer event triggers. We set the "expired" * flag, but we do not re-arm the timer (in case it's necessary, * tintv != 0) until the timer is accessed. */ static void timerfd_triggered(struct timerfd_ctx *ctx) { unsigned long flags; spin_lock_irqsave(&ctx->wqh.lock, flags); ctx->expired = 1; ctx->ticks++; wake_up_locked_poll(&ctx->wqh, EPOLLIN); spin_unlock_irqrestore(&ctx->wqh.lock, flags); } static enum hrtimer_restart timerfd_tmrproc(struct hrtimer *htmr) { struct timerfd_ctx *ctx = container_of(htmr, struct timerfd_ctx, t.tmr); timerfd_triggered(ctx); return HRTIMER_NORESTART; } static enum alarmtimer_restart timerfd_alarmproc(struct alarm *alarm, ktime_t now) { struct timerfd_ctx *ctx = container_of(alarm, struct timerfd_ctx, t.alarm); timerfd_triggered(ctx); return ALARMTIMER_NORESTART; } /* * Called when the clock was set to cancel the timers in the cancel * list. This will wake up processes waiting on these timers. The * wake-up requires ctx->ticks to be non zero, therefore we increment * it before calling wake_up_locked(). */ void timerfd_clock_was_set(void) { ktime_t moffs = ktime_mono_to_real(0); struct timerfd_ctx *ctx; unsigned long flags; rcu_read_lock(); list_for_each_entry_rcu(ctx, &cancel_list, clist) { if (!ctx->might_cancel) continue; spin_lock_irqsave(&ctx->wqh.lock, flags); if (ctx->moffs != moffs) { ctx->moffs = KTIME_MAX; ctx->ticks++; wake_up_locked_poll(&ctx->wqh, EPOLLIN); } spin_unlock_irqrestore(&ctx->wqh.lock, flags); } rcu_read_unlock(); } static void timerfd_resume_work(struct work_struct *work) { timerfd_clock_was_set(); } static DECLARE_WORK(timerfd_work, timerfd_resume_work); /* * Invoked from timekeeping_resume(). Defer the actual update to work so * timerfd_clock_was_set() runs in task context. */ void timerfd_resume(void) { schedule_work(&timerfd_work); } static void __timerfd_remove_cancel(struct timerfd_ctx *ctx) { if (ctx->might_cancel) { ctx->might_cancel = false; spin_lock(&cancel_lock); list_del_rcu(&ctx->clist); spin_unlock(&cancel_lock); } } static void timerfd_remove_cancel(struct timerfd_ctx *ctx) { spin_lock(&ctx->cancel_lock); __timerfd_remove_cancel(ctx); spin_unlock(&ctx->cancel_lock); } static bool timerfd_canceled(struct timerfd_ctx *ctx) { if (!ctx->might_cancel || ctx->moffs != KTIME_MAX) return false; ctx->moffs = ktime_mono_to_real(0); return true; } static void timerfd_setup_cancel(struct timerfd_ctx *ctx, int flags) { spin_lock(&ctx->cancel_lock); if ((ctx->clockid == CLOCK_REALTIME || ctx->clockid == CLOCK_REALTIME_ALARM) && (flags & TFD_TIMER_ABSTIME) && (flags & TFD_TIMER_CANCEL_ON_SET)) { if (!ctx->might_cancel) { ctx->might_cancel = true; spin_lock(&cancel_lock); list_add_rcu(&ctx->clist, &cancel_list); spin_unlock(&cancel_lock); } } else { __timerfd_remove_cancel(ctx); } spin_unlock(&ctx->cancel_lock); } static ktime_t timerfd_get_remaining(struct timerfd_ctx *ctx) { ktime_t remaining; if (isalarm(ctx)) remaining = alarm_expires_remaining(&ctx->t.alarm); else remaining = hrtimer_expires_remaining_adjusted(&ctx->t.tmr); return remaining < 0 ? 0: remaining; } static int timerfd_setup(struct timerfd_ctx *ctx, int flags, const struct itimerspec64 *ktmr) { enum hrtimer_mode htmode; ktime_t texp; int clockid = ctx->clockid; htmode = (flags & TFD_TIMER_ABSTIME) ? HRTIMER_MODE_ABS: HRTIMER_MODE_REL; texp = timespec64_to_ktime(ktmr->it_value); ctx->expired = 0; ctx->ticks = 0; ctx->tintv = timespec64_to_ktime(ktmr->it_interval); if (isalarm(ctx)) { alarm_init(&ctx->t.alarm, ctx->clockid == CLOCK_REALTIME_ALARM ? ALARM_REALTIME : ALARM_BOOTTIME, timerfd_alarmproc); } else { hrtimer_init(&ctx->t.tmr, clockid, htmode); hrtimer_set_expires(&ctx->t.tmr, texp); ctx->t.tmr.function = timerfd_tmrproc; } if (texp != 0) { if (flags & TFD_TIMER_ABSTIME) texp = timens_ktime_to_host(clockid, texp); if (isalarm(ctx)) { if (flags & TFD_TIMER_ABSTIME) alarm_start(&ctx->t.alarm, texp); else alarm_start_relative(&ctx->t.alarm, texp); } else { hrtimer_start(&ctx->t.tmr, texp, htmode); } if (timerfd_canceled(ctx)) return -ECANCELED; } ctx->settime_flags = flags & TFD_SETTIME_FLAGS; return 0; } static int timerfd_release(struct inode *inode, struct file *file) { struct timerfd_ctx *ctx = file->private_data; timerfd_remove_cancel(ctx); if (isalarm(ctx)) alarm_cancel(&ctx->t.alarm); else hrtimer_cancel(&ctx->t.tmr); kfree_rcu(ctx, rcu); return 0; } static __poll_t timerfd_poll(struct file *file, poll_table *wait) { struct timerfd_ctx *ctx = file->private_data; __poll_t events = 0; unsigned long flags; poll_wait(file, &ctx->wqh, wait); spin_lock_irqsave(&ctx->wqh.lock, flags); if (ctx->ticks) events |= EPOLLIN; spin_unlock_irqrestore(&ctx->wqh.lock, flags); return events; } static ssize_t timerfd_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) { struct timerfd_ctx *ctx = file->private_data; ssize_t res; u64 ticks = 0; if (count < sizeof(ticks)) return -EINVAL; spin_lock_irq(&ctx->wqh.lock); if (file->f_flags & O_NONBLOCK) res = -EAGAIN; else res = wait_event_interruptible_locked_irq(ctx->wqh, ctx->ticks); /* * If clock has changed, we do not care about the * ticks and we do not rearm the timer. Userspace must * reevaluate anyway. */ if (timerfd_canceled(ctx)) { ctx->ticks = 0; ctx->expired = 0; res = -ECANCELED; } if (ctx->ticks) { ticks = ctx->ticks; if (ctx->expired && ctx->tintv) { /* * If tintv != 0, this is a periodic timer that * needs to be re-armed. We avoid doing it in the timer * callback to avoid DoS attacks specifying a very * short timer period. */ if (isalarm(ctx)) { ticks += alarm_forward_now( &ctx->t.alarm, ctx->tintv) - 1; alarm_restart(&ctx->t.alarm); } else { ticks += hrtimer_forward_now(&ctx->t.tmr, ctx->tintv) - 1; hrtimer_restart(&ctx->t.tmr); } } ctx->expired = 0; ctx->ticks = 0; } spin_unlock_irq(&ctx->wqh.lock); if (ticks) res = put_user(ticks, (u64 __user *) buf) ? -EFAULT: sizeof(ticks); return res; } #ifdef CONFIG_PROC_FS static void timerfd_show(struct seq_file *m, struct file *file) { struct timerfd_ctx *ctx = file->private_data; struct timespec64 value, interval; spin_lock_irq(&ctx->wqh.lock); value = ktime_to_timespec64(timerfd_get_remaining(ctx)); interval = ktime_to_timespec64(ctx->tintv); spin_unlock_irq(&ctx->wqh.lock); seq_printf(m, "clockid: %d\n" "ticks: %llu\n" "settime flags: 0%o\n" "it_value: (%llu, %llu)\n" "it_interval: (%llu, %llu)\n", ctx->clockid, (unsigned long long)ctx->ticks, ctx->settime_flags, (unsigned long long)value.tv_sec, (unsigned long long)value.tv_nsec, (unsigned long long)interval.tv_sec, (unsigned long long)interval.tv_nsec); } #else #define timerfd_show NULL #endif #ifdef CONFIG_CHECKPOINT_RESTORE static long timerfd_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { struct timerfd_ctx *ctx = file->private_data; int ret = 0; switch (cmd) { case TFD_IOC_SET_TICKS: { u64 ticks; if (copy_from_user(&ticks, (u64 __user *)arg, sizeof(ticks))) return -EFAULT; if (!ticks) return -EINVAL; spin_lock_irq(&ctx->wqh.lock); if (!timerfd_canceled(ctx)) { ctx->ticks = ticks; wake_up_locked_poll(&ctx->wqh, EPOLLIN); } else ret = -ECANCELED; spin_unlock_irq(&ctx->wqh.lock); break; } default: ret = -ENOTTY; break; } return ret; } #else #define timerfd_ioctl NULL #endif static const struct file_operations timerfd_fops = { .release = timerfd_release, .poll = timerfd_poll, .read = timerfd_read, .llseek = noop_llseek, .show_fdinfo = timerfd_show, .unlocked_ioctl = timerfd_ioctl, }; static int timerfd_fget(int fd, struct fd *p) { struct fd f = fdget(fd); if (!f.file) return -EBADF; if (f.file->f_op != &timerfd_fops) { fdput(f); return -EINVAL; } *p = f; return 0; } SYSCALL_DEFINE2(timerfd_create, int, clockid, int, flags) { int ufd; struct timerfd_ctx *ctx; /* Check the TFD_* constants for consistency. */ BUILD_BUG_ON(TFD_CLOEXEC != O_CLOEXEC); BUILD_BUG_ON(TFD_NONBLOCK != O_NONBLOCK); if ((flags & ~TFD_CREATE_FLAGS) || (clockid != CLOCK_MONOTONIC && clockid != CLOCK_REALTIME && clockid != CLOCK_REALTIME_ALARM && clockid != CLOCK_BOOTTIME && clockid != CLOCK_BOOTTIME_ALARM)) return -EINVAL; if ((clockid == CLOCK_REALTIME_ALARM || clockid == CLOCK_BOOTTIME_ALARM) && !capable(CAP_WAKE_ALARM)) return -EPERM; ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); if (!ctx) return -ENOMEM; init_waitqueue_head(&ctx->wqh); spin_lock_init(&ctx->cancel_lock); ctx->clockid = clockid; if (isalarm(ctx)) alarm_init(&ctx->t.alarm, ctx->clockid == CLOCK_REALTIME_ALARM ? ALARM_REALTIME : ALARM_BOOTTIME, timerfd_alarmproc); else hrtimer_init(&ctx->t.tmr, clockid, HRTIMER_MODE_ABS); ctx->moffs = ktime_mono_to_real(0); ufd = anon_inode_getfd("[timerfd]", &timerfd_fops, ctx, O_RDWR | (flags & TFD_SHARED_FCNTL_FLAGS)); if (ufd < 0) kfree(ctx); return ufd; } static int do_timerfd_settime(int ufd, int flags, const struct itimerspec64 *new, struct itimerspec64 *old) { struct fd f; struct timerfd_ctx *ctx; int ret; if ((flags & ~TFD_SETTIME_FLAGS) || !itimerspec64_valid(new)) return -EINVAL; ret = timerfd_fget(ufd, &f); if (ret) return ret; ctx = f.file->private_data; if (isalarm(ctx) && !capable(CAP_WAKE_ALARM)) { fdput(f); return -EPERM; } timerfd_setup_cancel(ctx, flags); /* * We need to stop the existing timer before reprogramming * it to the new values. */ for (;;) { spin_lock_irq(&ctx->wqh.lock); if (isalarm(ctx)) { if (alarm_try_to_cancel(&ctx->t.alarm) >= 0) break; } else { if (hrtimer_try_to_cancel(&ctx->t.tmr) >= 0) break; } spin_unlock_irq(&ctx->wqh.lock); if (isalarm(ctx)) hrtimer_cancel_wait_running(&ctx->t.alarm.timer); else hrtimer_cancel_wait_running(&ctx->t.tmr); } /* * If the timer is expired and it's periodic, we need to advance it * because the caller may want to know the previous expiration time. * We do not update "ticks" and "expired" since the timer will be * re-programmed again in the following timerfd_setup() call. */ if (ctx->expired && ctx->tintv) { if (isalarm(ctx)) alarm_forward_now(&ctx->t.alarm, ctx->tintv); else hrtimer_forward_now(&ctx->t.tmr, ctx->tintv); } old->it_value = ktime_to_timespec64(timerfd_get_remaining(ctx)); old->it_interval = ktime_to_timespec64(ctx->tintv); /* * Re-program the timer to the new value ... */ ret = timerfd_setup(ctx, flags, new); spin_unlock_irq(&ctx->wqh.lock); fdput(f); return ret; } static int do_timerfd_gettime(int ufd, struct itimerspec64 *t) { struct fd f; struct timerfd_ctx *ctx; int ret = timerfd_fget(ufd, &f); if (ret) return ret; ctx = f.file->private_data; spin_lock_irq(&ctx->wqh.lock); if (ctx->expired && ctx->tintv) { ctx->expired = 0; if (isalarm(ctx)) { ctx->ticks += alarm_forward_now( &ctx->t.alarm, ctx->tintv) - 1; alarm_restart(&ctx->t.alarm); } else { ctx->ticks += hrtimer_forward_now(&ctx->t.tmr, ctx->tintv) - 1; hrtimer_restart(&ctx->t.tmr); } } t->it_value = ktime_to_timespec64(timerfd_get_remaining(ctx)); t->it_interval = ktime_to_timespec64(ctx->tintv); spin_unlock_irq(&ctx->wqh.lock); fdput(f); return 0; } SYSCALL_DEFINE4(timerfd_settime, int, ufd, int, flags, const struct __kernel_itimerspec __user *, utmr, struct __kernel_itimerspec __user *, otmr) { struct itimerspec64 new, old; int ret; if (get_itimerspec64(&new, utmr)) return -EFAULT; ret = do_timerfd_settime(ufd, flags, &new, &old); if (ret) return ret; if (otmr && put_itimerspec64(&old, otmr)) return -EFAULT; return ret; } SYSCALL_DEFINE2(timerfd_gettime, int, ufd, struct __kernel_itimerspec __user *, otmr) { struct itimerspec64 kotmr; int ret = do_timerfd_gettime(ufd, &kotmr); if (ret) return ret; return put_itimerspec64(&kotmr, otmr) ? -EFAULT : 0; } #ifdef CONFIG_COMPAT_32BIT_TIME SYSCALL_DEFINE4(timerfd_settime32, int, ufd, int, flags, const struct old_itimerspec32 __user *, utmr, struct old_itimerspec32 __user *, otmr) { struct itimerspec64 new, old; int ret; if (get_old_itimerspec32(&new, utmr)) return -EFAULT; ret = do_timerfd_settime(ufd, flags, &new, &old); if (ret) return ret; if (otmr && put_old_itimerspec32(&old, otmr)) return -EFAULT; return ret; } SYSCALL_DEFINE2(timerfd_gettime32, int, ufd, struct old_itimerspec32 __user *, otmr) { struct itimerspec64 kotmr; int ret = do_timerfd_gettime(ufd, &kotmr); if (ret) return ret; return put_old_itimerspec32(&kotmr, otmr) ? -EFAULT : 0; } #endif |
1 1 1 1 1 1 1 1 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 | // SPDX-License-Identifier: GPL-2.0+ /* * Apple Cinema Display driver * * Copyright (C) 2006 Michael Hanselmann (linux-kernel@hansmi.ch) * * Thanks to Caskey L. Dickson for his work with acdctl. */ #include <linux/kernel.h> #include <linux/errno.h> #include <linux/init.h> #include <linux/module.h> #include <linux/slab.h> #include <linux/usb.h> #include <linux/backlight.h> #include <linux/timer.h> #include <linux/workqueue.h> #include <linux/atomic.h> #define APPLE_VENDOR_ID 0x05AC #define USB_REQ_GET_REPORT 0x01 #define USB_REQ_SET_REPORT 0x09 #define ACD_USB_TIMEOUT 250 #define ACD_USB_EDID 0x0302 #define ACD_USB_BRIGHTNESS 0x0310 #define ACD_BTN_NONE 0 #define ACD_BTN_BRIGHT_UP 3 #define ACD_BTN_BRIGHT_DOWN 4 #define ACD_URB_BUFFER_LEN 2 #define ACD_MSG_BUFFER_LEN 2 #define APPLEDISPLAY_DEVICE(prod) \ .match_flags = USB_DEVICE_ID_MATCH_DEVICE | \ USB_DEVICE_ID_MATCH_INT_CLASS | \ USB_DEVICE_ID_MATCH_INT_PROTOCOL, \ .idVendor = APPLE_VENDOR_ID, \ .idProduct = (prod), \ .bInterfaceClass = USB_CLASS_HID, \ .bInterfaceProtocol = 0x00 /* table of devices that work with this driver */ static const struct usb_device_id appledisplay_table[] = { { APPLEDISPLAY_DEVICE(0x9218) }, { APPLEDISPLAY_DEVICE(0x9219) }, { APPLEDISPLAY_DEVICE(0x921c) }, { APPLEDISPLAY_DEVICE(0x921d) }, { APPLEDISPLAY_DEVICE(0x9222) }, { APPLEDISPLAY_DEVICE(0x9226) }, { APPLEDISPLAY_DEVICE(0x9236) }, /* Terminating entry */ { } }; MODULE_DEVICE_TABLE(usb, appledisplay_table); /* Structure to hold all of our device specific stuff */ struct appledisplay { struct usb_device *udev; /* usb device */ struct urb *urb; /* usb request block */ struct backlight_device *bd; /* backlight device */ u8 *urbdata; /* interrupt URB data buffer */ u8 *msgdata; /* control message data buffer */ struct delayed_work work; int button_pressed; struct mutex sysfslock; /* concurrent read and write */ }; static atomic_t count_displays = ATOMIC_INIT(0); static void appledisplay_complete(struct urb *urb) { struct appledisplay *pdata = urb->context; struct device *dev = &pdata->udev->dev; int status = urb->status; int retval; switch (status) { case 0: /* success */ break; case -EOVERFLOW: dev_err(dev, "OVERFLOW with data length %d, actual length is %d\n", ACD_URB_BUFFER_LEN, pdata->urb->actual_length); fallthrough; case -ECONNRESET: case -ENOENT: case -ESHUTDOWN: /* This urb is terminated, clean up */ dev_dbg(dev, "%s - urb shuttingdown with status: %d\n", __func__, status); return; default: dev_dbg(dev, "%s - nonzero urb status received: %d\n", __func__, status); goto exit; } switch(pdata->urbdata[1]) { case ACD_BTN_BRIGHT_UP: case ACD_BTN_BRIGHT_DOWN: pdata->button_pressed = 1; schedule_delayed_work(&pdata->work, 0); break; case ACD_BTN_NONE: default: pdata->button_pressed = 0; break; } exit: retval = usb_submit_urb(pdata->urb, GFP_ATOMIC); if (retval) { dev_err(dev, "%s - usb_submit_urb failed with result %d\n", __func__, retval); } } static int appledisplay_bl_update_status(struct backlight_device *bd) { struct appledisplay *pdata = bl_get_data(bd); int retval; mutex_lock(&pdata->sysfslock); pdata->msgdata[0] = 0x10; pdata->msgdata[1] = bd->props.brightness; retval = usb_control_msg( pdata->udev, usb_sndctrlpipe(pdata->udev, 0), USB_REQ_SET_REPORT, USB_DIR_OUT | USB_TYPE_CLASS | USB_RECIP_INTERFACE, ACD_USB_BRIGHTNESS, 0, pdata->msgdata, 2, ACD_USB_TIMEOUT); mutex_unlock(&pdata->sysfslock); if (retval < 0) return retval; else return 0; } static int appledisplay_bl_get_brightness(struct backlight_device *bd) { struct appledisplay *pdata = bl_get_data(bd); int retval, brightness; mutex_lock(&pdata->sysfslock); retval = usb_control_msg( pdata->udev, usb_rcvctrlpipe(pdata->udev, 0), USB_REQ_GET_REPORT, USB_DIR_IN | USB_TYPE_CLASS | USB_RECIP_INTERFACE, ACD_USB_BRIGHTNESS, 0, pdata->msgdata, 2, ACD_USB_TIMEOUT); if (retval < 2) { if (retval >= 0) retval = -EMSGSIZE; } else { brightness = pdata->msgdata[1]; } mutex_unlock(&pdata->sysfslock); if (retval < 0) return retval; else return brightness; } static const struct backlight_ops appledisplay_bl_data = { .get_brightness = appledisplay_bl_get_brightness, .update_status = appledisplay_bl_update_status, }; static void appledisplay_work(struct work_struct *work) { struct appledisplay *pdata = container_of(work, struct appledisplay, work.work); int retval; retval = appledisplay_bl_get_brightness(pdata->bd); if (retval >= 0) pdata->bd->props.brightness = retval; /* Poll again in about 125ms if there's still a button pressed */ if (pdata->button_pressed) schedule_delayed_work(&pdata->work, HZ / 8); } static int appledisplay_probe(struct usb_interface *iface, const struct usb_device_id *id) { struct backlight_properties props; struct appledisplay *pdata; struct usb_device *udev = interface_to_usbdev(iface); struct usb_endpoint_descriptor *endpoint; int int_in_endpointAddr = 0; int retval, brightness; char bl_name[20]; /* set up the endpoint information */ /* use only the first interrupt-in endpoint */ retval = usb_find_int_in_endpoint(iface->cur_altsetting, &endpoint); if (retval) { dev_err(&iface->dev, "Could not find int-in endpoint\n"); return retval; } int_in_endpointAddr = endpoint->bEndpointAddress; /* allocate memory for our device state and initialize it */ pdata = kzalloc(sizeof(struct appledisplay), GFP_KERNEL); if (!pdata) { retval = -ENOMEM; goto error; } pdata->udev = udev; INIT_DELAYED_WORK(&pdata->work, appledisplay_work); mutex_init(&pdata->sysfslock); /* Allocate buffer for control messages */ pdata->msgdata = kmalloc(ACD_MSG_BUFFER_LEN, GFP_KERNEL); if (!pdata->msgdata) { retval = -ENOMEM; goto error; } /* Allocate interrupt URB */ pdata->urb = usb_alloc_urb(0, GFP_KERNEL); if (!pdata->urb) { retval = -ENOMEM; goto error; } /* Allocate buffer for interrupt data */ pdata->urbdata = usb_alloc_coherent(pdata->udev, ACD_URB_BUFFER_LEN, GFP_KERNEL, &pdata->urb->transfer_dma); if (!pdata->urbdata) { retval = -ENOMEM; dev_err(&iface->dev, "Allocating URB buffer failed\n"); goto error; } /* Configure interrupt URB */ usb_fill_int_urb(pdata->urb, udev, usb_rcvintpipe(udev, int_in_endpointAddr), pdata->urbdata, ACD_URB_BUFFER_LEN, appledisplay_complete, pdata, 1); pdata->urb->transfer_flags = URB_NO_TRANSFER_DMA_MAP; if (usb_submit_urb(pdata->urb, GFP_KERNEL)) { retval = -EIO; dev_err(&iface->dev, "Submitting URB failed\n"); goto error; } /* Register backlight device */ snprintf(bl_name, sizeof(bl_name), "appledisplay%d", atomic_inc_return(&count_displays) - 1); memset(&props, 0, sizeof(struct backlight_properties)); props.type = BACKLIGHT_RAW; props.max_brightness = 0xff; pdata->bd = backlight_device_register(bl_name, NULL, pdata, &appledisplay_bl_data, &props); if (IS_ERR(pdata->bd)) { dev_err(&iface->dev, "Backlight registration failed\n"); retval = PTR_ERR(pdata->bd); goto error; } /* Try to get brightness */ brightness = appledisplay_bl_get_brightness(pdata->bd); if (brightness < 0) { retval = brightness; dev_err(&iface->dev, "Error while getting initial brightness: %d\n", retval); goto error; } /* Set brightness in backlight device */ pdata->bd->props.brightness = brightness; /* save our data pointer in the interface device */ usb_set_intfdata(iface, pdata); printk(KERN_INFO "appledisplay: Apple Cinema Display connected\n"); return 0; error: if (pdata) { if (pdata->urb) { usb_kill_urb(pdata->urb); cancel_delayed_work_sync(&pdata->work); usb_free_coherent(pdata->udev, ACD_URB_BUFFER_LEN, pdata->urbdata, pdata->urb->transfer_dma); usb_free_urb(pdata->urb); } if (!IS_ERR(pdata->bd)) backlight_device_unregister(pdata->bd); kfree(pdata->msgdata); } usb_set_intfdata(iface, NULL); kfree(pdata); return retval; } static void appledisplay_disconnect(struct usb_interface *iface) { struct appledisplay *pdata = usb_get_intfdata(iface); if (pdata) { usb_kill_urb(pdata->urb); cancel_delayed_work_sync(&pdata->work); backlight_device_unregister(pdata->bd); usb_free_coherent(pdata->udev, ACD_URB_BUFFER_LEN, pdata->urbdata, pdata->urb->transfer_dma); usb_free_urb(pdata->urb); kfree(pdata->msgdata); kfree(pdata); } printk(KERN_INFO "appledisplay: Apple Cinema Display disconnected\n"); } static struct usb_driver appledisplay_driver = { .name = "appledisplay", .probe = appledisplay_probe, .disconnect = appledisplay_disconnect, .id_table = appledisplay_table, }; module_usb_driver(appledisplay_driver); MODULE_AUTHOR("Michael Hanselmann"); MODULE_DESCRIPTION("Apple Cinema Display driver"); MODULE_LICENSE("GPL"); |
3 25 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 | /* SPDX-License-Identifier: GPL-2.0-or-later */ #ifndef __SOUND_CONTROL_H #define __SOUND_CONTROL_H /* * Header file for control interface * Copyright (c) by Jaroslav Kysela <perex@perex.cz> */ #include <linux/wait.h> #include <linux/nospec.h> #include <sound/asound.h> #define snd_kcontrol_chip(kcontrol) ((kcontrol)->private_data) struct snd_kcontrol; typedef int (snd_kcontrol_info_t) (struct snd_kcontrol * kcontrol, struct snd_ctl_elem_info * uinfo); typedef int (snd_kcontrol_get_t) (struct snd_kcontrol * kcontrol, struct snd_ctl_elem_value * ucontrol); typedef int (snd_kcontrol_put_t) (struct snd_kcontrol * kcontrol, struct snd_ctl_elem_value * ucontrol); typedef int (snd_kcontrol_tlv_rw_t)(struct snd_kcontrol *kcontrol, int op_flag, /* SNDRV_CTL_TLV_OP_XXX */ unsigned int size, unsigned int __user *tlv); /* internal flag for skipping validations */ #ifdef CONFIG_SND_CTL_DEBUG #define SNDRV_CTL_ELEM_ACCESS_SKIP_CHECK (1 << 24) #define snd_ctl_skip_validation(info) \ ((info)->access & SNDRV_CTL_ELEM_ACCESS_SKIP_CHECK) #else #define SNDRV_CTL_ELEM_ACCESS_SKIP_CHECK 0 #define snd_ctl_skip_validation(info) true #endif /* kernel only - LED bits */ #define SNDRV_CTL_ELEM_ACCESS_LED_SHIFT 25 #define SNDRV_CTL_ELEM_ACCESS_LED_MASK (7<<25) /* kernel three bits - LED group */ #define SNDRV_CTL_ELEM_ACCESS_SPK_LED (1<<25) /* kernel speaker (output) LED flag */ #define SNDRV_CTL_ELEM_ACCESS_MIC_LED (2<<25) /* kernel microphone (input) LED flag */ enum { SNDRV_CTL_TLV_OP_READ = 0, SNDRV_CTL_TLV_OP_WRITE = 1, SNDRV_CTL_TLV_OP_CMD = -1, }; struct snd_kcontrol_new { snd_ctl_elem_iface_t iface; /* interface identifier */ unsigned int device; /* device/client number */ unsigned int subdevice; /* subdevice (substream) number */ const char *name; /* ASCII name of item */ unsigned int index; /* index of item */ unsigned int access; /* access rights */ unsigned int count; /* count of same elements */ snd_kcontrol_info_t *info; snd_kcontrol_get_t *get; snd_kcontrol_put_t *put; union { snd_kcontrol_tlv_rw_t *c; const unsigned int *p; } tlv; unsigned long private_value; }; struct snd_kcontrol_volatile { struct snd_ctl_file *owner; /* locked */ unsigned int access; /* access rights */ }; struct snd_kcontrol { struct list_head list; /* list of controls */ struct snd_ctl_elem_id id; unsigned int count; /* count of same elements */ snd_kcontrol_info_t *info; snd_kcontrol_get_t *get; snd_kcontrol_put_t *put; union { snd_kcontrol_tlv_rw_t *c; const unsigned int *p; } tlv; unsigned long private_value; void *private_data; void (*private_free)(struct snd_kcontrol *kcontrol); struct snd_kcontrol_volatile vd[]; /* volatile data */ }; #define snd_kcontrol(n) list_entry(n, struct snd_kcontrol, list) struct snd_kctl_event { struct list_head list; /* list of events */ struct snd_ctl_elem_id id; unsigned int mask; }; #define snd_kctl_event(n) list_entry(n, struct snd_kctl_event, list) struct pid; enum { SND_CTL_SUBDEV_PCM, SND_CTL_SUBDEV_RAWMIDI, SND_CTL_SUBDEV_ITEMS, }; struct snd_ctl_file { struct list_head list; /* list of all control files */ struct snd_card *card; struct pid *pid; int preferred_subdevice[SND_CTL_SUBDEV_ITEMS]; wait_queue_head_t change_sleep; spinlock_t read_lock; struct snd_fasync *fasync; int subscribed; /* read interface is activated */ struct list_head events; /* waiting events for read */ }; struct snd_ctl_layer_ops { struct snd_ctl_layer_ops *next; const char *module_name; void (*lregister)(struct snd_card *card); void (*ldisconnect)(struct snd_card *card); void (*lnotify)(struct snd_card *card, unsigned int mask, struct snd_kcontrol *kctl, unsigned int ioff); }; #define snd_ctl_file(n) list_entry(n, struct snd_ctl_file, list) typedef int (*snd_kctl_ioctl_func_t) (struct snd_card * card, struct snd_ctl_file * control, unsigned int cmd, unsigned long arg); void snd_ctl_notify(struct snd_card * card, unsigned int mask, struct snd_ctl_elem_id * id); void snd_ctl_notify_one(struct snd_card * card, unsigned int mask, struct snd_kcontrol * kctl, unsigned int ioff); struct snd_kcontrol *snd_ctl_new1(const struct snd_kcontrol_new * kcontrolnew, void * private_data); void snd_ctl_free_one(struct snd_kcontrol * kcontrol); int snd_ctl_add(struct snd_card * card, struct snd_kcontrol * kcontrol); int snd_ctl_remove(struct snd_card * card, struct snd_kcontrol * kcontrol); int snd_ctl_replace(struct snd_card *card, struct snd_kcontrol *kcontrol, bool add_on_replace); int snd_ctl_remove_id(struct snd_card * card, struct snd_ctl_elem_id *id); int snd_ctl_rename_id(struct snd_card * card, struct snd_ctl_elem_id *src_id, struct snd_ctl_elem_id *dst_id); void snd_ctl_rename(struct snd_card *card, struct snd_kcontrol *kctl, const char *name); int snd_ctl_activate_id(struct snd_card *card, struct snd_ctl_elem_id *id, int active); struct snd_kcontrol *snd_ctl_find_numid_locked(struct snd_card *card, unsigned int numid); struct snd_kcontrol *snd_ctl_find_numid(struct snd_card *card, unsigned int numid); struct snd_kcontrol *snd_ctl_find_id_locked(struct snd_card *card, const struct snd_ctl_elem_id *id); struct snd_kcontrol *snd_ctl_find_id(struct snd_card *card, const struct snd_ctl_elem_id *id); /** * snd_ctl_find_id_mixer - find the control instance with the given name string * @card: the card instance * @name: the name string * * Finds the control instance with the given name and * @SNDRV_CTL_ELEM_IFACE_MIXER. Other fields are set to zero. * * This is merely a wrapper to snd_ctl_find_id(). * * Return: The pointer of the instance if found, or %NULL if not. */ static inline struct snd_kcontrol * snd_ctl_find_id_mixer(struct snd_card *card, const char *name) { struct snd_ctl_elem_id id = {}; id.iface = SNDRV_CTL_ELEM_IFACE_MIXER; strscpy(id.name, name, sizeof(id.name)); return snd_ctl_find_id(card, &id); } int snd_ctl_create(struct snd_card *card); int snd_ctl_register_ioctl(snd_kctl_ioctl_func_t fcn); int snd_ctl_unregister_ioctl(snd_kctl_ioctl_func_t fcn); #ifdef CONFIG_COMPAT int snd_ctl_register_ioctl_compat(snd_kctl_ioctl_func_t fcn); int snd_ctl_unregister_ioctl_compat(snd_kctl_ioctl_func_t fcn); #else #define snd_ctl_register_ioctl_compat(fcn) #define snd_ctl_unregister_ioctl_compat(fcn) #endif int snd_ctl_request_layer(const char *module_name); void snd_ctl_register_layer(struct snd_ctl_layer_ops *lops); void snd_ctl_disconnect_layer(struct snd_ctl_layer_ops *lops); int snd_ctl_get_preferred_subdevice(struct snd_card *card, int type); static inline unsigned int snd_ctl_get_ioffnum(struct snd_kcontrol *kctl, struct snd_ctl_elem_id *id) { unsigned int ioff = id->numid - kctl->id.numid; return array_index_nospec(ioff, kctl->count); } static inline unsigned int snd_ctl_get_ioffidx(struct snd_kcontrol *kctl, struct snd_ctl_elem_id *id) { unsigned int ioff = id->index - kctl->id.index; return array_index_nospec(ioff, kctl->count); } static inline unsigned int snd_ctl_get_ioff(struct snd_kcontrol *kctl, struct snd_ctl_elem_id *id) { if (id->numid) { return snd_ctl_get_ioffnum(kctl, id); } else { return snd_ctl_get_ioffidx(kctl, id); } } static inline struct snd_ctl_elem_id *snd_ctl_build_ioff(struct snd_ctl_elem_id *dst_id, struct snd_kcontrol *src_kctl, unsigned int offset) { *dst_id = src_kctl->id; dst_id->index += offset; dst_id->numid += offset; return dst_id; } /* * Frequently used control callbacks/helpers */ int snd_ctl_boolean_mono_info(struct snd_kcontrol *kcontrol, struct snd_ctl_elem_info *uinfo); int snd_ctl_boolean_stereo_info(struct snd_kcontrol *kcontrol, struct snd_ctl_elem_info *uinfo); int snd_ctl_enum_info(struct snd_ctl_elem_info *info, unsigned int channels, unsigned int items, const char *const names[]); /* * virtual master control */ struct snd_kcontrol *snd_ctl_make_virtual_master(char *name, const unsigned int *tlv); int _snd_ctl_add_follower(struct snd_kcontrol *master, struct snd_kcontrol *follower, unsigned int flags); /* optional flags for follower */ #define SND_CTL_FOLLOWER_NEED_UPDATE (1 << 0) /** * snd_ctl_add_follower - Add a virtual follower control * @master: vmaster element * @follower: follower element to add * * Add a virtual follower control to the given master element created via * snd_ctl_create_virtual_master() beforehand. * * All followers must be the same type (returning the same information * via info callback). The function doesn't check it, so it's your * responsibility. * * Also, some additional limitations: * at most two channels, * logarithmic volume control (dB level) thus no linear volume, * master can only attenuate the volume without gain * * Return: Zero if successful or a negative error code. */ static inline int snd_ctl_add_follower(struct snd_kcontrol *master, struct snd_kcontrol *follower) { return _snd_ctl_add_follower(master, follower, 0); } int snd_ctl_add_followers(struct snd_card *card, struct snd_kcontrol *master, const char * const *list); /** * snd_ctl_add_follower_uncached - Add a virtual follower control * @master: vmaster element * @follower: follower element to add * * Add a virtual follower control to the given master. * Unlike snd_ctl_add_follower(), the element added via this function * is supposed to have volatile values, and get callback is called * at each time queried from the master. * * When the control peeks the hardware values directly and the value * can be changed by other means than the put callback of the element, * this function should be used to keep the value always up-to-date. * * Return: Zero if successful or a negative error code. */ static inline int snd_ctl_add_follower_uncached(struct snd_kcontrol *master, struct snd_kcontrol *follower) { return _snd_ctl_add_follower(master, follower, SND_CTL_FOLLOWER_NEED_UPDATE); } int snd_ctl_add_vmaster_hook(struct snd_kcontrol *kctl, void (*hook)(void *private_data, int), void *private_data); void snd_ctl_sync_vmaster(struct snd_kcontrol *kctl, bool hook_only); #define snd_ctl_sync_vmaster_hook(kctl) snd_ctl_sync_vmaster(kctl, true) int snd_ctl_apply_vmaster_followers(struct snd_kcontrol *kctl, int (*func)(struct snd_kcontrol *vfollower, struct snd_kcontrol *follower, void *arg), void *arg); /* * Control LED trigger layer */ #define SND_CTL_LAYER_MODULE_LED "snd-ctl-led" #if IS_MODULE(CONFIG_SND_CTL_LED) static inline int snd_ctl_led_request(void) { return snd_ctl_request_layer(SND_CTL_LAYER_MODULE_LED); } #else static inline int snd_ctl_led_request(void) { return 0; } #endif /* * Helper functions for jack-detection controls */ struct snd_kcontrol * snd_kctl_jack_new(const char *name, struct snd_card *card); void snd_kctl_jack_report(struct snd_card *card, struct snd_kcontrol *kctl, bool status); #endif /* __SOUND_CONTROL_H */ |
35 35 912 912 911 912 912 2 3 3 3 2 2 2 6 1 5 8 3 5 6 1 75 27 1 2 2 1 29 29 2 2 2 2 1 1 1 42 21 21 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 | // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) 1995 Linus Torvalds * * Pentium III FXSR, SSE support * Gareth Hughes <gareth@valinux.com>, May 2000 * * X86-64 port * Andi Kleen. * * CPU hotplug support - ashok.raj@intel.com */ /* * This file handles the architecture-dependent parts of process handling.. */ #include <linux/cpu.h> #include <linux/errno.h> #include <linux/sched.h> #include <linux/sched/task.h> #include <linux/sched/task_stack.h> #include <linux/fs.h> #include <linux/kernel.h> #include <linux/mm.h> #include <linux/elfcore.h> #include <linux/smp.h> #include <linux/slab.h> #include <linux/user.h> #include <linux/interrupt.h> #include <linux/delay.h> #include <linux/export.h> #include <linux/ptrace.h> #include <linux/notifier.h> #include <linux/kprobes.h> #include <linux/kdebug.h> #include <linux/prctl.h> #include <linux/uaccess.h> #include <linux/io.h> #include <linux/ftrace.h> #include <linux/syscalls.h> #include <linux/iommu.h> #include <asm/processor.h> #include <asm/pkru.h> #include <asm/fpu/sched.h> #include <asm/mmu_context.h> #include <asm/prctl.h> #include <asm/desc.h> #include <asm/proto.h> #include <asm/ia32.h> #include <asm/debugreg.h> #include <asm/switch_to.h> #include <asm/xen/hypervisor.h> #include <asm/vdso.h> #include <asm/resctrl.h> #include <asm/unistd.h> #include <asm/fsgsbase.h> #ifdef CONFIG_IA32_EMULATION /* Not included via unistd.h */ #include <asm/unistd_32_ia32.h> #endif #include "process.h" /* Prints also some state that isn't saved in the pt_regs */ void __show_regs(struct pt_regs *regs, enum show_regs_mode mode, const char *log_lvl) { unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs; unsigned long d0, d1, d2, d3, d6, d7; unsigned int fsindex, gsindex; unsigned int ds, es; show_iret_regs(regs, log_lvl); if (regs->orig_ax != -1) pr_cont(" ORIG_RAX: %016lx\n", regs->orig_ax); else pr_cont("\n"); printk("%sRAX: %016lx RBX: %016lx RCX: %016lx\n", log_lvl, regs->ax, regs->bx, regs->cx); printk("%sRDX: %016lx RSI: %016lx RDI: %016lx\n", log_lvl, regs->dx, regs->si, regs->di); printk("%sRBP: %016lx R08: %016lx R09: %016lx\n", log_lvl, regs->bp, regs->r8, regs->r9); printk("%sR10: %016lx R11: %016lx R12: %016lx\n", log_lvl, regs->r10, regs->r11, regs->r12); printk("%sR13: %016lx R14: %016lx R15: %016lx\n", log_lvl, regs->r13, regs->r14, regs->r15); if (mode == SHOW_REGS_SHORT) return; if (mode == SHOW_REGS_USER) { rdmsrl(MSR_FS_BASE, fs); rdmsrl(MSR_KERNEL_GS_BASE, shadowgs); printk("%sFS: %016lx GS: %016lx\n", log_lvl, fs, shadowgs); return; } asm("movl %%ds,%0" : "=r" (ds)); asm("movl %%es,%0" : "=r" (es)); asm("movl %%fs,%0" : "=r" (fsindex)); asm("movl %%gs,%0" : "=r" (gsindex)); rdmsrl(MSR_FS_BASE, fs); rdmsrl(MSR_GS_BASE, gs); rdmsrl(MSR_KERNEL_GS_BASE, shadowgs); cr0 = read_cr0(); cr2 = read_cr2(); cr3 = __read_cr3(); cr4 = __read_cr4(); printk("%sFS: %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n", log_lvl, fs, fsindex, gs, gsindex, shadowgs); printk("%sCS: %04lx DS: %04x ES: %04x CR0: %016lx\n", log_lvl, regs->cs, ds, es, cr0); printk("%sCR2: %016lx CR3: %016lx CR4: %016lx\n", log_lvl, cr2, cr3, cr4); get_debugreg(d0, 0); get_debugreg(d1, 1); get_debugreg(d2, 2); get_debugreg(d3, 3); get_debugreg(d6, 6); get_debugreg(d7, 7); /* Only print out debug registers if they are in their non-default state. */ if (!((d0 == 0) && (d1 == 0) && (d2 == 0) && (d3 == 0) && (d6 == DR6_RESERVED) && (d7 == 0x400))) { printk("%sDR0: %016lx DR1: %016lx DR2: %016lx\n", log_lvl, d0, d1, d2); printk("%sDR3: %016lx DR6: %016lx DR7: %016lx\n", log_lvl, d3, d6, d7); } if (cpu_feature_enabled(X86_FEATURE_OSPKE)) printk("%sPKRU: %08x\n", log_lvl, read_pkru()); } void release_thread(struct task_struct *dead_task) { WARN_ON(dead_task->mm); } enum which_selector { FS, GS }; /* * Out of line to be protected from kprobes and tracing. If this would be * traced or probed than any access to a per CPU variable happens with * the wrong GS. * * It is not used on Xen paravirt. When paravirt support is needed, it * needs to be renamed with native_ prefix. */ static noinstr unsigned long __rdgsbase_inactive(void) { unsigned long gsbase; lockdep_assert_irqs_disabled(); if (!cpu_feature_enabled(X86_FEATURE_XENPV)) { native_swapgs(); gsbase = rdgsbase(); native_swapgs(); } else { instrumentation_begin(); rdmsrl(MSR_KERNEL_GS_BASE, gsbase); instrumentation_end(); } return gsbase; } /* * Out of line to be protected from kprobes and tracing. If this would be * traced or probed than any access to a per CPU variable happens with * the wrong GS. * * It is not used on Xen paravirt. When paravirt support is needed, it * needs to be renamed with native_ prefix. */ static noinstr void __wrgsbase_inactive(unsigned long gsbase) { lockdep_assert_irqs_disabled(); if (!cpu_feature_enabled(X86_FEATURE_XENPV)) { native_swapgs(); wrgsbase(gsbase); native_swapgs(); } else { instrumentation_begin(); wrmsrl(MSR_KERNEL_GS_BASE, gsbase); instrumentation_end(); } } /* * Saves the FS or GS base for an outgoing thread if FSGSBASE extensions are * not available. The goal is to be reasonably fast on non-FSGSBASE systems. * It's forcibly inlined because it'll generate better code and this function * is hot. */ static __always_inline void save_base_legacy(struct task_struct *prev_p, unsigned short selector, enum which_selector which) { if (likely(selector == 0)) { /* * On Intel (without X86_BUG_NULL_SEG), the segment base could * be the pre-existing saved base or it could be zero. On AMD * (with X86_BUG_NULL_SEG), the segment base could be almost * anything. * * This branch is very hot (it's hit twice on almost every * context switch between 64-bit programs), and avoiding * the RDMSR helps a lot, so we just assume that whatever * value is already saved is correct. This matches historical * Linux behavior, so it won't break existing applications. * * To avoid leaking state, on non-X86_BUG_NULL_SEG CPUs, if we * report that the base is zero, it needs to actually be zero: * see the corresponding logic in load_seg_legacy. */ } else { /* * If the selector is 1, 2, or 3, then the base is zero on * !X86_BUG_NULL_SEG CPUs and could be anything on * X86_BUG_NULL_SEG CPUs. In the latter case, Linux * has never attempted to preserve the base across context * switches. * * If selector > 3, then it refers to a real segment, and * saving the base isn't necessary. */ if (which == FS) prev_p->thread.fsbase = 0; else prev_p->thread.gsbase = 0; } } static __always_inline void save_fsgs(struct task_struct *task) { savesegment(fs, task->thread.fsindex); savesegment(gs, task->thread.gsindex); if (static_cpu_has(X86_FEATURE_FSGSBASE)) { /* * If FSGSBASE is enabled, we can't make any useful guesses * about the base, and user code expects us to save the current * value. Fortunately, reading the base directly is efficient. */ task->thread.fsbase = rdfsbase(); task->thread.gsbase = __rdgsbase_inactive(); } else { save_base_legacy(task, task->thread.fsindex, FS); save_base_legacy(task, task->thread.gsindex, GS); } } /* * While a process is running,current->thread.fsbase and current->thread.gsbase * may not match the corresponding CPU registers (see save_base_legacy()). */ void current_save_fsgs(void) { unsigned long flags; /* Interrupts need to be off for FSGSBASE */ local_irq_save(flags); save_fsgs(current); local_irq_restore(flags); } #if IS_ENABLED(CONFIG_KVM) EXPORT_SYMBOL_GPL(current_save_fsgs); #endif static __always_inline void loadseg(enum which_selector which, unsigned short sel) { if (which == FS) loadsegment(fs, sel); else load_gs_index(sel); } static __always_inline void load_seg_legacy(unsigned short prev_index, unsigned long prev_base, unsigned short next_index, unsigned long next_base, enum which_selector which) { if (likely(next_index <= 3)) { /* * The next task is using 64-bit TLS, is not using this * segment at all, or is having fun with arcane CPU features. */ if (next_base == 0) { /* * Nasty case: on AMD CPUs, we need to forcibly zero * the base. */ if (static_cpu_has_bug(X86_BUG_NULL_SEG)) { loadseg(which, __USER_DS); loadseg(which, next_index); } else { /* * We could try to exhaustively detect cases * under which we can skip the segment load, * but there's really only one case that matters * for performance: if both the previous and * next states are fully zeroed, we can skip * the load. * * (This assumes that prev_base == 0 has no * false positives. This is the case on * Intel-style CPUs.) */ if (likely(prev_index | next_index | prev_base)) loadseg(which, next_index); } } else { if (prev_index != next_index) loadseg(which, next_index); wrmsrl(which == FS ? MSR_FS_BASE : MSR_KERNEL_GS_BASE, next_base); } } else { /* * The next task is using a real segment. Loading the selector * is sufficient. */ loadseg(which, next_index); } } /* * Store prev's PKRU value and load next's PKRU value if they differ. PKRU * is not XSTATE managed on context switch because that would require a * lookup in the task's FPU xsave buffer and require to keep that updated * in various places. */ static __always_inline void x86_pkru_load(struct thread_struct *prev, struct thread_struct *next) { if (!cpu_feature_enabled(X86_FEATURE_OSPKE)) return; /* Stash the prev task's value: */ prev->pkru = rdpkru(); /* * PKRU writes are slightly expensive. Avoid them when not * strictly necessary: */ if (prev->pkru != next->pkru) wrpkru(next->pkru); } static __always_inline void x86_fsgsbase_load(struct thread_struct *prev, struct thread_struct *next) { if (static_cpu_has(X86_FEATURE_FSGSBASE)) { /* Update the FS and GS selectors if they could have changed. */ if (unlikely(prev->fsindex || next->fsindex)) loadseg(FS, next->fsindex); if (unlikely(prev->gsindex || next->gsindex)) loadseg(GS, next->gsindex); /* Update the bases. */ wrfsbase(next->fsbase); __wrgsbase_inactive(next->gsbase); } else { load_seg_legacy(prev->fsindex, prev->fsbase, next->fsindex, next->fsbase, FS); load_seg_legacy(prev->gsindex, prev->gsbase, next->gsindex, next->gsbase, GS); } } unsigned long x86_fsgsbase_read_task(struct task_struct *task, unsigned short selector) { unsigned short idx = selector >> 3; unsigned long base; if (likely((selector & SEGMENT_TI_MASK) == 0)) { if (unlikely(idx >= GDT_ENTRIES)) return 0; /* * There are no user segments in the GDT with nonzero bases * other than the TLS segments. */ if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX) return 0; idx -= GDT_ENTRY_TLS_MIN; base = get_desc_base(&task->thread.tls_array[idx]); } else { #ifdef CONFIG_MODIFY_LDT_SYSCALL struct ldt_struct *ldt; /* * If performance here mattered, we could protect the LDT * with RCU. This is a slow path, though, so we can just * take the mutex. */ mutex_lock(&task->mm->context.lock); ldt = task->mm->context.ldt; if (unlikely(!ldt || idx >= ldt->nr_entries)) base = 0; else base = get_desc_base(ldt->entries + idx); mutex_unlock(&task->mm->context.lock); #else base = 0; #endif } return base; } unsigned long x86_gsbase_read_cpu_inactive(void) { unsigned long gsbase; if (boot_cpu_has(X86_FEATURE_FSGSBASE)) { unsigned long flags; local_irq_save(flags); gsbase = __rdgsbase_inactive(); local_irq_restore(flags); } else { rdmsrl(MSR_KERNEL_GS_BASE, gsbase); } return gsbase; } void x86_gsbase_write_cpu_inactive(unsigned long gsbase) { if (boot_cpu_has(X86_FEATURE_FSGSBASE)) { unsigned long flags; local_irq_save(flags); __wrgsbase_inactive(gsbase); local_irq_restore(flags); } else { wrmsrl(MSR_KERNEL_GS_BASE, gsbase); } } unsigned long x86_fsbase_read_task(struct task_struct *task) { unsigned long fsbase; if (task == current) fsbase = x86_fsbase_read_cpu(); else if (boot_cpu_has(X86_FEATURE_FSGSBASE) || (task->thread.fsindex == 0)) fsbase = task->thread.fsbase; else fsbase = x86_fsgsbase_read_task(task, task->thread.fsindex); return fsbase; } unsigned long x86_gsbase_read_task(struct task_struct *task) { unsigned long gsbase; if (task == current) gsbase = x86_gsbase_read_cpu_inactive(); else if (boot_cpu_has(X86_FEATURE_FSGSBASE) || (task->thread.gsindex == 0)) gsbase = task->thread.gsbase; else gsbase = x86_fsgsbase_read_task(task, task->thread.gsindex); return gsbase; } void x86_fsbase_write_task(struct task_struct *task, unsigned long fsbase) { WARN_ON_ONCE(task == current); task->thread.fsbase = fsbase; } void x86_gsbase_write_task(struct task_struct *task, unsigned long gsbase) { WARN_ON_ONCE(task == current); task->thread.gsbase = gsbase; } static void start_thread_common(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp, unsigned int _cs, unsigned int _ss, unsigned int _ds) { WARN_ON_ONCE(regs != current_pt_regs()); if (static_cpu_has(X86_BUG_NULL_SEG)) { /* Loading zero below won't clear the base. */ loadsegment(fs, __USER_DS); load_gs_index(__USER_DS); } reset_thread_features(); loadsegment(fs, 0); loadsegment(es, _ds); loadsegment(ds, _ds); load_gs_index(0); regs->ip = new_ip; regs->sp = new_sp; regs->cs = _cs; regs->ss = _ss; regs->flags = X86_EFLAGS_IF; } void start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp) { start_thread_common(regs, new_ip, new_sp, __USER_CS, __USER_DS, 0); } EXPORT_SYMBOL_GPL(start_thread); #ifdef CONFIG_COMPAT void compat_start_thread(struct pt_regs *regs, u32 new_ip, u32 new_sp, bool x32) { start_thread_common(regs, new_ip, new_sp, x32 ? __USER_CS : __USER32_CS, __USER_DS, __USER_DS); } #endif /* * switch_to(x,y) should switch tasks from x to y. * * This could still be optimized: * - fold all the options into a flag word and test it with a single test. * - could test fs/gs bitsliced * * Kprobes not supported here. Set the probe on schedule instead. * Function graph tracer not supported too. */ __no_kmsan_checks __visible __notrace_funcgraph struct task_struct * __switch_to(struct task_struct *prev_p, struct task_struct *next_p) { struct thread_struct *prev = &prev_p->thread; struct thread_struct *next = &next_p->thread; struct fpu *prev_fpu = &prev->fpu; int cpu = smp_processor_id(); WARN_ON_ONCE(IS_ENABLED(CONFIG_DEBUG_ENTRY) && this_cpu_read(pcpu_hot.hardirq_stack_inuse)); if (!test_thread_flag(TIF_NEED_FPU_LOAD)) switch_fpu_prepare(prev_fpu, cpu); /* We must save %fs and %gs before load_TLS() because * %fs and %gs may be cleared by load_TLS(). * * (e.g. xen_load_tls()) */ save_fsgs(prev_p); /* * Load TLS before restoring any segments so that segment loads * reference the correct GDT entries. */ load_TLS(next, cpu); /* * Leave lazy mode, flushing any hypercalls made here. This * must be done after loading TLS entries in the GDT but before * loading segments that might reference them. */ arch_end_context_switch(next_p); /* Switch DS and ES. * * Reading them only returns the selectors, but writing them (if * nonzero) loads the full descriptor from the GDT or LDT. The * LDT for next is loaded in switch_mm, and the GDT is loaded * above. * * We therefore need to write new values to the segment * registers on every context switch unless both the new and old * values are zero. * * Note that we don't need to do anything for CS and SS, as * those are saved and restored as part of pt_regs. */ savesegment(es, prev->es); if (unlikely(next->es | prev->es)) loadsegment(es, next->es); savesegment(ds, prev->ds); if (unlikely(next->ds | prev->ds)) loadsegment(ds, next->ds); x86_fsgsbase_load(prev, next); x86_pkru_load(prev, next); /* * Switch the PDA and FPU contexts. */ raw_cpu_write(pcpu_hot.current_task, next_p); raw_cpu_write(pcpu_hot.top_of_stack, task_top_of_stack(next_p)); switch_fpu_finish(); /* Reload sp0. */ update_task_stack(next_p); switch_to_extra(prev_p, next_p); if (static_cpu_has_bug(X86_BUG_SYSRET_SS_ATTRS)) { /* * AMD CPUs have a misfeature: SYSRET sets the SS selector but * does not update the cached descriptor. As a result, if we * do SYSRET while SS is NULL, we'll end up in user mode with * SS apparently equal to __USER_DS but actually unusable. * * The straightforward workaround would be to fix it up just * before SYSRET, but that would slow down the system call * fast paths. Instead, we ensure that SS is never NULL in * system call context. We do this by replacing NULL SS * selectors at every context switch. SYSCALL sets up a valid * SS, so the only way to get NULL is to re-enter the kernel * from CPL 3 through an interrupt. Since that can't happen * in the same task as a running syscall, we are guaranteed to * context switch between every interrupt vector entry and a * subsequent SYSRET. * * We read SS first because SS reads are much faster than * writes. Out of caution, we force SS to __KERNEL_DS even if * it previously had a different non-NULL value. */ unsigned short ss_sel; savesegment(ss, ss_sel); if (ss_sel != __KERNEL_DS) loadsegment(ss, __KERNEL_DS); } /* Load the Intel cache allocation PQR MSR. */ resctrl_sched_in(next_p); return prev_p; } void set_personality_64bit(void) { /* inherit personality from parent */ /* Make sure to be in 64bit mode */ clear_thread_flag(TIF_ADDR32); /* Pretend that this comes from a 64bit execve */ task_pt_regs(current)->orig_ax = __NR_execve; current_thread_info()->status &= ~TS_COMPAT; if (current->mm) __set_bit(MM_CONTEXT_HAS_VSYSCALL, ¤t->mm->context.flags); /* TBD: overwrites user setup. Should have two bits. But 64bit processes have always behaved this way, so it's not too bad. The main problem is just that 32bit children are affected again. */ current->personality &= ~READ_IMPLIES_EXEC; } static void __set_personality_x32(void) { #ifdef CONFIG_X86_X32_ABI if (current->mm) current->mm->context.flags = 0; current->personality &= ~READ_IMPLIES_EXEC; /* * in_32bit_syscall() uses the presence of the x32 syscall bit * flag to determine compat status. The x86 mmap() code relies on * the syscall bitness so set x32 syscall bit right here to make * in_32bit_syscall() work during exec(). * * Pretend to come from a x32 execve. */ task_pt_regs(current)->orig_ax = __NR_x32_execve | __X32_SYSCALL_BIT; current_thread_info()->status &= ~TS_COMPAT; #endif } static void __set_personality_ia32(void) { #ifdef CONFIG_IA32_EMULATION if (current->mm) { /* * uprobes applied to this MM need to know this and * cannot use user_64bit_mode() at that time. */ __set_bit(MM_CONTEXT_UPROBE_IA32, ¤t->mm->context.flags); } current->personality |= force_personality32; /* Prepare the first "return" to user space */ task_pt_regs(current)->orig_ax = __NR_ia32_execve; current_thread_info()->status |= TS_COMPAT; #endif } void set_personality_ia32(bool x32) { /* Make sure to be in 32bit mode */ set_thread_flag(TIF_ADDR32); if (x32) __set_personality_x32(); else __set_personality_ia32(); } EXPORT_SYMBOL_GPL(set_personality_ia32); #ifdef CONFIG_CHECKPOINT_RESTORE static long prctl_map_vdso(const struct vdso_image *image, unsigned long addr) { int ret; ret = map_vdso_once(image, addr); if (ret) return ret; return (long)image->size; } #endif #ifdef CONFIG_ADDRESS_MASKING #define LAM_U57_BITS 6 static int prctl_enable_tagged_addr(struct mm_struct *mm, unsigned long nr_bits) { if (!cpu_feature_enabled(X86_FEATURE_LAM)) return -ENODEV; /* PTRACE_ARCH_PRCTL */ if (current->mm != mm) return -EINVAL; if (mm_valid_pasid(mm) && !test_bit(MM_CONTEXT_FORCE_TAGGED_SVA, &mm->context.flags)) return -EINVAL; if (mmap_write_lock_killable(mm)) return -EINTR; if (test_bit(MM_CONTEXT_LOCK_LAM, &mm->context.flags)) { mmap_write_unlock(mm); return -EBUSY; } if (!nr_bits) { mmap_write_unlock(mm); return -EINVAL; } else if (nr_bits <= LAM_U57_BITS) { mm->context.lam_cr3_mask = X86_CR3_LAM_U57; mm->context.untag_mask = ~GENMASK(62, 57); } else { mmap_write_unlock(mm); return -EINVAL; } write_cr3(__read_cr3() | mm->context.lam_cr3_mask); set_tlbstate_lam_mode(mm); set_bit(MM_CONTEXT_LOCK_LAM, &mm->context.flags); mmap_write_unlock(mm); return 0; } #endif long do_arch_prctl_64(struct task_struct *task, int option, unsigned long arg2) { int ret = 0; switch (option) { case ARCH_SET_GS: { if (unlikely(arg2 >= TASK_SIZE_MAX)) return -EPERM; preempt_disable(); /* * ARCH_SET_GS has always overwritten the index * and the base. Zero is the most sensible value * to put in the index, and is the only value that * makes any sense if FSGSBASE is unavailable. */ if (task == current) { loadseg(GS, 0); x86_gsbase_write_cpu_inactive(arg2); /* * On non-FSGSBASE systems, save_base_legacy() expects * that we also fill in thread.gsbase. */ task->thread.gsbase = arg2; } else { task->thread.gsindex = 0; x86_gsbase_write_task(task, arg2); } preempt_enable(); break; } case ARCH_SET_FS: { /* * Not strictly needed for %fs, but do it for symmetry * with %gs */ if (unlikely(arg2 >= TASK_SIZE_MAX)) return -EPERM; preempt_disable(); /* * Set the selector to 0 for the same reason * as %gs above. */ if (task == current) { loadseg(FS, 0); x86_fsbase_write_cpu(arg2); /* * On non-FSGSBASE systems, save_base_legacy() expects * that we also fill in thread.fsbase. */ task->thread.fsbase = arg2; } else { task->thread.fsindex = 0; x86_fsbase_write_task(task, arg2); } preempt_enable(); break; } case ARCH_GET_FS: { unsigned long base = x86_fsbase_read_task(task); ret = put_user(base, (unsigned long __user *)arg2); break; } case ARCH_GET_GS: { unsigned long base = x86_gsbase_read_task(task); ret = put_user(base, (unsigned long __user *)arg2); break; } #ifdef CONFIG_CHECKPOINT_RESTORE # ifdef CONFIG_X86_X32_ABI case ARCH_MAP_VDSO_X32: return prctl_map_vdso(&vdso_image_x32, arg2); # endif # if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION case ARCH_MAP_VDSO_32: return prctl_map_vdso(&vdso_image_32, arg2); # endif case ARCH_MAP_VDSO_64: return prctl_map_vdso(&vdso_image_64, arg2); #endif #ifdef CONFIG_ADDRESS_MASKING case ARCH_GET_UNTAG_MASK: return put_user(task->mm->context.untag_mask, (unsigned long __user *)arg2); case ARCH_ENABLE_TAGGED_ADDR: return prctl_enable_tagged_addr(task->mm, arg2); case ARCH_FORCE_TAGGED_SVA: if (current != task) return -EINVAL; set_bit(MM_CONTEXT_FORCE_TAGGED_SVA, &task->mm->context.flags); return 0; case ARCH_GET_MAX_TAG_BITS: if (!cpu_feature_enabled(X86_FEATURE_LAM)) return put_user(0, (unsigned long __user *)arg2); else return put_user(LAM_U57_BITS, (unsigned long __user *)arg2); #endif case ARCH_SHSTK_ENABLE: case ARCH_SHSTK_DISABLE: case ARCH_SHSTK_LOCK: case ARCH_SHSTK_UNLOCK: case ARCH_SHSTK_STATUS: return shstk_prctl(task, option, arg2); default: ret = -EINVAL; break; } return ret; } SYSCALL_DEFINE2(arch_prctl, int, option, unsigned long, arg2) { long ret; ret = do_arch_prctl_64(current, option, arg2); if (ret == -EINVAL) ret = do_arch_prctl_common(option, arg2); return ret; } #ifdef CONFIG_IA32_EMULATION COMPAT_SYSCALL_DEFINE2(arch_prctl, int, option, unsigned long, arg2) { return do_arch_prctl_common(option, arg2); } #endif unsigned long KSTK_ESP(struct task_struct *task) { return task_pt_regs(task)->sp; } |
42 13 301 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 | // SPDX-License-Identifier: GPL-2.0-only /* * Unified UUID/GUID definition * * Copyright (C) 2009, 2016 Intel Corp. * Huang Ying <ying.huang@intel.com> */ #include <linux/kernel.h> #include <linux/ctype.h> #include <linux/errno.h> #include <linux/export.h> #include <linux/uuid.h> #include <linux/random.h> const guid_t guid_null; EXPORT_SYMBOL(guid_null); const uuid_t uuid_null; EXPORT_SYMBOL(uuid_null); const u8 guid_index[16] = {3,2,1,0,5,4,7,6,8,9,10,11,12,13,14,15}; const u8 uuid_index[16] = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15}; /** * generate_random_uuid - generate a random UUID * @uuid: where to put the generated UUID * * Random UUID interface * * Used to create a Boot ID or a filesystem UUID/GUID, but can be * useful for other kernel drivers. */ void generate_random_uuid(unsigned char uuid[16]) { get_random_bytes(uuid, 16); /* Set UUID version to 4 --- truly random generation */ uuid[6] = (uuid[6] & 0x0F) | 0x40; /* Set the UUID variant to DCE */ uuid[8] = (uuid[8] & 0x3F) | 0x80; } EXPORT_SYMBOL(generate_random_uuid); void generate_random_guid(unsigned char guid[16]) { get_random_bytes(guid, 16); /* Set GUID version to 4 --- truly random generation */ guid[7] = (guid[7] & 0x0F) | 0x40; /* Set the GUID variant to DCE */ guid[8] = (guid[8] & 0x3F) | 0x80; } EXPORT_SYMBOL(generate_random_guid); static void __uuid_gen_common(__u8 b[16]) { get_random_bytes(b, 16); /* reversion 0b10 */ b[8] = (b[8] & 0x3F) | 0x80; } void guid_gen(guid_t *lu) { __uuid_gen_common(lu->b); /* version 4 : random generation */ lu->b[7] = (lu->b[7] & 0x0F) | 0x40; } EXPORT_SYMBOL_GPL(guid_gen); void uuid_gen(uuid_t *bu) { __uuid_gen_common(bu->b); /* version 4 : random generation */ bu->b[6] = (bu->b[6] & 0x0F) | 0x40; } EXPORT_SYMBOL_GPL(uuid_gen); /** * uuid_is_valid - checks if a UUID string is valid * @uuid: UUID string to check * * Description: * It checks if the UUID string is following the format: * xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx * * where x is a hex digit. * * Return: true if input is valid UUID string. */ bool uuid_is_valid(const char *uuid) { unsigned int i; for (i = 0; i < UUID_STRING_LEN; i++) { if (i == 8 || i == 13 || i == 18 || i == 23) { if (uuid[i] != '-') return false; } else if (!isxdigit(uuid[i])) { return false; } } return true; } EXPORT_SYMBOL(uuid_is_valid); static int __uuid_parse(const char *uuid, __u8 b[16], const u8 ei[16]) { static const u8 si[16] = {0,2,4,6,9,11,14,16,19,21,24,26,28,30,32,34}; unsigned int i; if (!uuid_is_valid(uuid)) return -EINVAL; for (i = 0; i < 16; i++) { int hi = hex_to_bin(uuid[si[i] + 0]); int lo = hex_to_bin(uuid[si[i] + 1]); b[ei[i]] = (hi << 4) | lo; } return 0; } int guid_parse(const char *uuid, guid_t *u) { return __uuid_parse(uuid, u->b, guid_index); } EXPORT_SYMBOL(guid_parse); int uuid_parse(const char *uuid, uuid_t *u) { return __uuid_parse(uuid, u->b, uuid_index); } EXPORT_SYMBOL(uuid_parse); |
1 1 191 78 78 40 191 191 12 191 191 155 191 191 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 | // SPDX-License-Identifier: GPL-2.0 /* * Manage cache of swap slots to be used for and returned from * swap. * * Copyright(c) 2016 Intel Corporation. * * Author: Tim Chen <tim.c.chen@linux.intel.com> * * We allocate the swap slots from the global pool and put * it into local per cpu caches. This has the advantage * of no needing to acquire the swap_info lock every time * we need a new slot. * * There is also opportunity to simply return the slot * to local caches without needing to acquire swap_info * lock. We do not reuse the returned slots directly but * move them back to the global pool in a batch. This * allows the slots to coalesce and reduce fragmentation. * * The swap entry allocated is marked with SWAP_HAS_CACHE * flag in map_count that prevents it from being allocated * again from the global pool. * * The swap slots cache is protected by a mutex instead of * a spin lock as when we search for slots with scan_swap_map, * we can possibly sleep. */ #include <linux/swap_slots.h> #include <linux/cpu.h> #include <linux/cpumask.h> #include <linux/slab.h> #include <linux/vmalloc.h> #include <linux/mutex.h> #include <linux/mm.h> static DEFINE_PER_CPU(struct swap_slots_cache, swp_slots); static bool swap_slot_cache_active; bool swap_slot_cache_enabled; static bool swap_slot_cache_initialized; static DEFINE_MUTEX(swap_slots_cache_mutex); /* Serialize swap slots cache enable/disable operations */ static DEFINE_MUTEX(swap_slots_cache_enable_mutex); static void __drain_swap_slots_cache(unsigned int type); #define use_swap_slot_cache (swap_slot_cache_active && swap_slot_cache_enabled) #define SLOTS_CACHE 0x1 #define SLOTS_CACHE_RET 0x2 static void deactivate_swap_slots_cache(void) { mutex_lock(&swap_slots_cache_mutex); swap_slot_cache_active = false; __drain_swap_slots_cache(SLOTS_CACHE|SLOTS_CACHE_RET); mutex_unlock(&swap_slots_cache_mutex); } static void reactivate_swap_slots_cache(void) { mutex_lock(&swap_slots_cache_mutex); swap_slot_cache_active = true; mutex_unlock(&swap_slots_cache_mutex); } /* Must not be called with cpu hot plug lock */ void disable_swap_slots_cache_lock(void) { mutex_lock(&swap_slots_cache_enable_mutex); swap_slot_cache_enabled = false; if (swap_slot_cache_initialized) { /* serialize with cpu hotplug operations */ cpus_read_lock(); __drain_swap_slots_cache(SLOTS_CACHE|SLOTS_CACHE_RET); cpus_read_unlock(); } } static void __reenable_swap_slots_cache(void) { swap_slot_cache_enabled = has_usable_swap(); } void reenable_swap_slots_cache_unlock(void) { __reenable_swap_slots_cache(); mutex_unlock(&swap_slots_cache_enable_mutex); } static bool check_cache_active(void) { long pages; if (!swap_slot_cache_enabled) return false; pages = get_nr_swap_pages(); if (!swap_slot_cache_active) { if (pages > num_online_cpus() * THRESHOLD_ACTIVATE_SWAP_SLOTS_CACHE) reactivate_swap_slots_cache(); goto out; } /* if global pool of slot caches too low, deactivate cache */ if (pages < num_online_cpus() * THRESHOLD_DEACTIVATE_SWAP_SLOTS_CACHE) deactivate_swap_slots_cache(); out: return swap_slot_cache_active; } static int alloc_swap_slot_cache(unsigned int cpu) { struct swap_slots_cache *cache; swp_entry_t *slots, *slots_ret; /* * Do allocation outside swap_slots_cache_mutex * as kvzalloc could trigger reclaim and folio_alloc_swap, * which can lock swap_slots_cache_mutex. */ slots = kvcalloc(SWAP_SLOTS_CACHE_SIZE, sizeof(swp_entry_t), GFP_KERNEL); if (!slots) return -ENOMEM; slots_ret = kvcalloc(SWAP_SLOTS_CACHE_SIZE, sizeof(swp_entry_t), GFP_KERNEL); if (!slots_ret) { kvfree(slots); return -ENOMEM; } mutex_lock(&swap_slots_cache_mutex); cache = &per_cpu(swp_slots, cpu); if (cache->slots || cache->slots_ret) { /* cache already allocated */ mutex_unlock(&swap_slots_cache_mutex); kvfree(slots); kvfree(slots_ret); return 0; } if (!cache->lock_initialized) { mutex_init(&cache->alloc_lock); spin_lock_init(&cache->free_lock); cache->lock_initialized = true; } cache->nr = 0; cache->cur = 0; cache->n_ret = 0; /* * We initialized alloc_lock and free_lock earlier. We use * !cache->slots or !cache->slots_ret to know if it is safe to acquire * the corresponding lock and use the cache. Memory barrier below * ensures the assumption. */ mb(); cache->slots = slots; cache->slots_ret = slots_ret; mutex_unlock(&swap_slots_cache_mutex); return 0; } static void drain_slots_cache_cpu(unsigned int cpu, unsigned int type, bool free_slots) { struct swap_slots_cache *cache; swp_entry_t *slots = NULL; cache = &per_cpu(swp_slots, cpu); if ((type & SLOTS_CACHE) && cache->slots) { mutex_lock(&cache->alloc_lock); swapcache_free_entries(cache->slots + cache->cur, cache->nr); cache->cur = 0; cache->nr = 0; if (free_slots && cache->slots) { kvfree(cache->slots); cache->slots = NULL; } mutex_unlock(&cache->alloc_lock); } if ((type & SLOTS_CACHE_RET) && cache->slots_ret) { spin_lock_irq(&cache->free_lock); swapcache_free_entries(cache->slots_ret, cache->n_ret); cache->n_ret = 0; if (free_slots && cache->slots_ret) { slots = cache->slots_ret; cache->slots_ret = NULL; } spin_unlock_irq(&cache->free_lock); kvfree(slots); } } static void __drain_swap_slots_cache(unsigned int type) { unsigned int cpu; /* * This function is called during * 1) swapoff, when we have to make sure no * left over slots are in cache when we remove * a swap device; * 2) disabling of swap slot cache, when we run low * on swap slots when allocating memory and need * to return swap slots to global pool. * * We cannot acquire cpu hot plug lock here as * this function can be invoked in the cpu * hot plug path: * cpu_up -> lock cpu_hotplug -> cpu hotplug state callback * -> memory allocation -> direct reclaim -> folio_alloc_swap * -> drain_swap_slots_cache * * Hence the loop over current online cpu below could miss cpu that * is being brought online but not yet marked as online. * That is okay as we do not schedule and run anything on a * cpu before it has been marked online. Hence, we will not * fill any swap slots in slots cache of such cpu. * There are no slots on such cpu that need to be drained. */ for_each_online_cpu(cpu) drain_slots_cache_cpu(cpu, type, false); } static int free_slot_cache(unsigned int cpu) { mutex_lock(&swap_slots_cache_mutex); drain_slots_cache_cpu(cpu, SLOTS_CACHE | SLOTS_CACHE_RET, true); mutex_unlock(&swap_slots_cache_mutex); return 0; } void enable_swap_slots_cache(void) { mutex_lock(&swap_slots_cache_enable_mutex); if (!swap_slot_cache_initialized) { int ret; ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "swap_slots_cache", alloc_swap_slot_cache, free_slot_cache); if (WARN_ONCE(ret < 0, "Cache allocation failed (%s), operating " "without swap slots cache.\n", __func__)) goto out_unlock; swap_slot_cache_initialized = true; } __reenable_swap_slots_cache(); out_unlock: mutex_unlock(&swap_slots_cache_enable_mutex); } /* called with swap slot cache's alloc lock held */ static int refill_swap_slots_cache(struct swap_slots_cache *cache) { if (!use_swap_slot_cache) return 0; cache->cur = 0; if (swap_slot_cache_active) cache->nr = get_swap_pages(SWAP_SLOTS_CACHE_SIZE, cache->slots, 1); return cache->nr; } void free_swap_slot(swp_entry_t entry) { struct swap_slots_cache *cache; cache = raw_cpu_ptr(&swp_slots); if (likely(use_swap_slot_cache && cache->slots_ret)) { spin_lock_irq(&cache->free_lock); /* Swap slots cache may be deactivated before acquiring lock */ if (!use_swap_slot_cache || !cache->slots_ret) { spin_unlock_irq(&cache->free_lock); goto direct_free; } if (cache->n_ret >= SWAP_SLOTS_CACHE_SIZE) { /* * Return slots to global pool. * The current swap_map value is SWAP_HAS_CACHE. * Set it to 0 to indicate it is available for * allocation in global pool */ swapcache_free_entries(cache->slots_ret, cache->n_ret); cache->n_ret = 0; } cache->slots_ret[cache->n_ret++] = entry; spin_unlock_irq(&cache->free_lock); } else { direct_free: swapcache_free_entries(&entry, 1); } } swp_entry_t folio_alloc_swap(struct folio *folio) { swp_entry_t entry; struct swap_slots_cache *cache; entry.val = 0; if (folio_test_large(folio)) { if (IS_ENABLED(CONFIG_THP_SWAP) && arch_thp_swp_supported()) get_swap_pages(1, &entry, folio_nr_pages(folio)); goto out; } /* * Preemption is allowed here, because we may sleep * in refill_swap_slots_cache(). But it is safe, because * accesses to the per-CPU data structure are protected by the * mutex cache->alloc_lock. * * The alloc path here does not touch cache->slots_ret * so cache->free_lock is not taken. */ cache = raw_cpu_ptr(&swp_slots); if (likely(check_cache_active() && cache->slots)) { mutex_lock(&cache->alloc_lock); if (cache->slots) { repeat: if (cache->nr) { entry = cache->slots[cache->cur]; cache->slots[cache->cur++].val = 0; cache->nr--; } else if (refill_swap_slots_cache(cache)) { goto repeat; } } mutex_unlock(&cache->alloc_lock); if (entry.val) goto out; } get_swap_pages(1, &entry, 1); out: if (mem_cgroup_try_charge_swap(folio, entry)) { put_swap_folio(folio, entry); entry.val = 0; } return entry; } |
384 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 | // SPDX-License-Identifier: GPL-2.0-only /* * Common code for control of lockd and nfsv4 grace periods. * * Transplanted from lockd code */ #include <linux/module.h> #include <net/net_namespace.h> #include <net/netns/generic.h> #include <linux/fs.h> #include <linux/filelock.h> static unsigned int grace_net_id; static DEFINE_SPINLOCK(grace_lock); /** * locks_start_grace * @net: net namespace that this lock manager belongs to * @lm: who this grace period is for * * A grace period is a period during which locks should not be given * out. Currently grace periods are only enforced by the two lock * managers (lockd and nfsd), using the locks_in_grace() function to * check when they are in a grace period. * * This function is called to start a grace period. */ void locks_start_grace(struct net *net, struct lock_manager *lm) { struct list_head *grace_list = net_generic(net, grace_net_id); spin_lock(&grace_lock); if (list_empty(&lm->list)) list_add(&lm->list, grace_list); else WARN(1, "double list_add attempt detected in net %x %s\n", net->ns.inum, (net == &init_net) ? "(init_net)" : ""); spin_unlock(&grace_lock); } EXPORT_SYMBOL_GPL(locks_start_grace); /** * locks_end_grace * @lm: who this grace period is for * * Call this function to state that the given lock manager is ready to * resume regular locking. The grace period will not end until all lock * managers that called locks_start_grace() also call locks_end_grace(). * Note that callers count on it being safe to call this more than once, * and the second call should be a no-op. */ void locks_end_grace(struct lock_manager *lm) { spin_lock(&grace_lock); list_del_init(&lm->list); spin_unlock(&grace_lock); } EXPORT_SYMBOL_GPL(locks_end_grace); static bool __state_in_grace(struct net *net, bool open) { struct list_head *grace_list = net_generic(net, grace_net_id); struct lock_manager *lm; if (!open) return !list_empty(grace_list); spin_lock(&grace_lock); list_for_each_entry(lm, grace_list, list) { if (lm->block_opens) { spin_unlock(&grace_lock); return true; } } spin_unlock(&grace_lock); return false; } /** * locks_in_grace * @net: network namespace * * Lock managers call this function to determine when it is OK for them * to answer ordinary lock requests, and when they should accept only * lock reclaims. */ bool locks_in_grace(struct net *net) { return __state_in_grace(net, false); } EXPORT_SYMBOL_GPL(locks_in_grace); bool opens_in_grace(struct net *net) { return __state_in_grace(net, true); } EXPORT_SYMBOL_GPL(opens_in_grace); static int __net_init grace_init_net(struct net *net) { struct list_head *grace_list = net_generic(net, grace_net_id); INIT_LIST_HEAD(grace_list); return 0; } static void __net_exit grace_exit_net(struct net *net) { struct list_head *grace_list = net_generic(net, grace_net_id); WARN_ONCE(!list_empty(grace_list), "net %x %s: grace_list is not empty\n", net->ns.inum, __func__); } static struct pernet_operations grace_net_ops = { .init = grace_init_net, .exit = grace_exit_net, .id = &grace_net_id, .size = sizeof(struct list_head), }; static int __init init_grace(void) { return register_pernet_subsys(&grace_net_ops); } static void __exit exit_grace(void) { unregister_pernet_subsys(&grace_net_ops); } MODULE_AUTHOR("Jeff Layton <jlayton@primarydata.com>"); MODULE_LICENSE("GPL"); module_init(init_grace) module_exit(exit_grace) |
2 2 2 2 2 2 2 154 154 154 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 | // SPDX-License-Identifier: GPL-2.0-or-later /* * IPVS: Locality-Based Least-Connection scheduling module * * Authors: Wensong Zhang <wensong@gnuchina.org> * * Changes: * Martin Hamilton : fixed the terrible locking bugs * *lock(tbl->lock) ==> *lock(&tbl->lock) * Wensong Zhang : fixed the uninitialized tbl->lock bug * Wensong Zhang : added doing full expiration check to * collect stale entries of 24+ hours when * no partial expire check in a half hour * Julian Anastasov : replaced del_timer call with del_timer_sync * to avoid the possible race between timer * handler and del_timer thread in SMP */ /* * The lblc algorithm is as follows (pseudo code): * * if cachenode[dest_ip] is null then * n, cachenode[dest_ip] <- {weighted least-conn node}; * else * n <- cachenode[dest_ip]; * if (n is dead) OR * (n.conns>n.weight AND * there is a node m with m.conns<m.weight/2) then * n, cachenode[dest_ip] <- {weighted least-conn node}; * * return n; * * Thanks must go to Wenzhuo Zhang for talking WCCP to me and pushing * me to write this module. */ #define KMSG_COMPONENT "IPVS" #define pr_fmt(fmt) KMSG_COMPONENT ": " fmt #include <linux/ip.h> #include <linux/slab.h> #include <linux/module.h> #include <linux/kernel.h> #include <linux/skbuff.h> #include <linux/jiffies.h> #include <linux/hash.h> /* for sysctl */ #include <linux/fs.h> #include <linux/sysctl.h> #include <net/ip_vs.h> /* * It is for garbage collection of stale IPVS lblc entries, * when the table is full. */ #define CHECK_EXPIRE_INTERVAL (60*HZ) #define ENTRY_TIMEOUT (6*60*HZ) #define DEFAULT_EXPIRATION (24*60*60*HZ) /* * It is for full expiration check. * When there is no partial expiration check (garbage collection) * in a half hour, do a full expiration check to collect stale * entries that haven't been touched for a day. */ #define COUNT_FOR_FULL_EXPIRATION 30 /* * for IPVS lblc entry hash table */ #ifndef CONFIG_IP_VS_LBLC_TAB_BITS #define CONFIG_IP_VS_LBLC_TAB_BITS 10 #endif #define IP_VS_LBLC_TAB_BITS CONFIG_IP_VS_LBLC_TAB_BITS #define IP_VS_LBLC_TAB_SIZE (1 << IP_VS_LBLC_TAB_BITS) #define IP_VS_LBLC_TAB_MASK (IP_VS_LBLC_TAB_SIZE - 1) /* * IPVS lblc entry represents an association between destination * IP address and its destination server */ struct ip_vs_lblc_entry { struct hlist_node list; int af; /* address family */ union nf_inet_addr addr; /* destination IP address */ struct ip_vs_dest *dest; /* real server (cache) */ unsigned long lastuse; /* last used time */ struct rcu_head rcu_head; }; /* * IPVS lblc hash table */ struct ip_vs_lblc_table { struct rcu_head rcu_head; struct hlist_head bucket[IP_VS_LBLC_TAB_SIZE]; /* hash bucket */ struct timer_list periodic_timer; /* collect stale entries */ struct ip_vs_service *svc; /* pointer back to service */ atomic_t entries; /* number of entries */ int max_size; /* maximum size of entries */ int rover; /* rover for expire check */ int counter; /* counter for no expire */ bool dead; }; /* * IPVS LBLC sysctl table */ #ifdef CONFIG_SYSCTL static struct ctl_table vs_vars_table[] = { { .procname = "lblc_expiration", .data = NULL, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, { } }; #endif static void ip_vs_lblc_rcu_free(struct rcu_head *head) { struct ip_vs_lblc_entry *en = container_of(head, struct ip_vs_lblc_entry, rcu_head); ip_vs_dest_put_and_free(en->dest); kfree(en); } static inline void ip_vs_lblc_del(struct ip_vs_lblc_entry *en) { hlist_del_rcu(&en->list); call_rcu(&en->rcu_head, ip_vs_lblc_rcu_free); } /* * Returns hash value for IPVS LBLC entry */ static inline unsigned int ip_vs_lblc_hashkey(int af, const union nf_inet_addr *addr) { __be32 addr_fold = addr->ip; #ifdef CONFIG_IP_VS_IPV6 if (af == AF_INET6) addr_fold = addr->ip6[0]^addr->ip6[1]^ addr->ip6[2]^addr->ip6[3]; #endif return hash_32(ntohl(addr_fold), IP_VS_LBLC_TAB_BITS); } /* * Hash an entry in the ip_vs_lblc_table. * returns bool success. */ static void ip_vs_lblc_hash(struct ip_vs_lblc_table *tbl, struct ip_vs_lblc_entry *en) { unsigned int hash = ip_vs_lblc_hashkey(en->af, &en->addr); hlist_add_head_rcu(&en->list, &tbl->bucket[hash]); atomic_inc(&tbl->entries); } /* Get ip_vs_lblc_entry associated with supplied parameters. */ static inline struct ip_vs_lblc_entry * ip_vs_lblc_get(int af, struct ip_vs_lblc_table *tbl, const union nf_inet_addr *addr) { unsigned int hash = ip_vs_lblc_hashkey(af, addr); struct ip_vs_lblc_entry *en; hlist_for_each_entry_rcu(en, &tbl->bucket[hash], list) if (ip_vs_addr_equal(af, &en->addr, addr)) return en; return NULL; } /* * Create or update an ip_vs_lblc_entry, which is a mapping of a destination IP * address to a server. Called under spin lock. */ static inline struct ip_vs_lblc_entry * ip_vs_lblc_new(struct ip_vs_lblc_table *tbl, const union nf_inet_addr *daddr, u16 af, struct ip_vs_dest *dest) { struct ip_vs_lblc_entry *en; en = ip_vs_lblc_get(af, tbl, daddr); if (en) { if (en->dest == dest) return en; ip_vs_lblc_del(en); } en = kmalloc(sizeof(*en), GFP_ATOMIC); if (!en) return NULL; en->af = af; ip_vs_addr_copy(af, &en->addr, daddr); en->lastuse = jiffies; ip_vs_dest_hold(dest); en->dest = dest; ip_vs_lblc_hash(tbl, en); return en; } /* * Flush all the entries of the specified table. */ static void ip_vs_lblc_flush(struct ip_vs_service *svc) { struct ip_vs_lblc_table *tbl = svc->sched_data; struct ip_vs_lblc_entry *en; struct hlist_node *next; int i; spin_lock_bh(&svc->sched_lock); tbl->dead = true; for (i = 0; i < IP_VS_LBLC_TAB_SIZE; i++) { hlist_for_each_entry_safe(en, next, &tbl->bucket[i], list) { ip_vs_lblc_del(en); atomic_dec(&tbl->entries); } } spin_unlock_bh(&svc->sched_lock); } static int sysctl_lblc_expiration(struct ip_vs_service *svc) { #ifdef CONFIG_SYSCTL return svc->ipvs->sysctl_lblc_expiration; #else return DEFAULT_EXPIRATION; #endif } static inline void ip_vs_lblc_full_check(struct ip_vs_service *svc) { struct ip_vs_lblc_table *tbl = svc->sched_data; struct ip_vs_lblc_entry *en; struct hlist_node *next; unsigned long now = jiffies; int i, j; for (i = 0, j = tbl->rover; i < IP_VS_LBLC_TAB_SIZE; i++) { j = (j + 1) & IP_VS_LBLC_TAB_MASK; spin_lock(&svc->sched_lock); hlist_for_each_entry_safe(en, next, &tbl->bucket[j], list) { if (time_before(now, en->lastuse + sysctl_lblc_expiration(svc))) continue; ip_vs_lblc_del(en); atomic_dec(&tbl->entries); } spin_unlock(&svc->sched_lock); } tbl->rover = j; } /* * Periodical timer handler for IPVS lblc table * It is used to collect stale entries when the number of entries * exceeds the maximum size of the table. * * Fixme: we probably need more complicated algorithm to collect * entries that have not been used for a long time even * if the number of entries doesn't exceed the maximum size * of the table. * The full expiration check is for this purpose now. */ static void ip_vs_lblc_check_expire(struct timer_list *t) { struct ip_vs_lblc_table *tbl = from_timer(tbl, t, periodic_timer); struct ip_vs_service *svc = tbl->svc; unsigned long now = jiffies; int goal; int i, j; struct ip_vs_lblc_entry *en; struct hlist_node *next; if ((tbl->counter % COUNT_FOR_FULL_EXPIRATION) == 0) { /* do full expiration check */ ip_vs_lblc_full_check(svc); tbl->counter = 1; goto out; } if (atomic_read(&tbl->entries) <= tbl->max_size) { tbl->counter++; goto out; } goal = (atomic_read(&tbl->entries) - tbl->max_size)*4/3; if (goal > tbl->max_size/2) goal = tbl->max_size/2; for (i = 0, j = tbl->rover; i < IP_VS_LBLC_TAB_SIZE; i++) { j = (j + 1) & IP_VS_LBLC_TAB_MASK; spin_lock(&svc->sched_lock); hlist_for_each_entry_safe(en, next, &tbl->bucket[j], list) { if (time_before(now, en->lastuse + ENTRY_TIMEOUT)) continue; ip_vs_lblc_del(en); atomic_dec(&tbl->entries); goal--; } spin_unlock(&svc->sched_lock); if (goal <= 0) break; } tbl->rover = j; out: mod_timer(&tbl->periodic_timer, jiffies + CHECK_EXPIRE_INTERVAL); } static int ip_vs_lblc_init_svc(struct ip_vs_service *svc) { int i; struct ip_vs_lblc_table *tbl; /* * Allocate the ip_vs_lblc_table for this service */ tbl = kmalloc(sizeof(*tbl), GFP_KERNEL); if (tbl == NULL) return -ENOMEM; svc->sched_data = tbl; IP_VS_DBG(6, "LBLC hash table (memory=%zdbytes) allocated for " "current service\n", sizeof(*tbl)); /* * Initialize the hash buckets */ for (i = 0; i < IP_VS_LBLC_TAB_SIZE; i++) { INIT_HLIST_HEAD(&tbl->bucket[i]); } tbl->max_size = IP_VS_LBLC_TAB_SIZE*16; tbl->rover = 0; tbl->counter = 1; tbl->dead = false; tbl->svc = svc; atomic_set(&tbl->entries, 0); /* * Hook periodic timer for garbage collection */ timer_setup(&tbl->periodic_timer, ip_vs_lblc_check_expire, 0); mod_timer(&tbl->periodic_timer, jiffies + CHECK_EXPIRE_INTERVAL); return 0; } static void ip_vs_lblc_done_svc(struct ip_vs_service *svc) { struct ip_vs_lblc_table *tbl = svc->sched_data; /* remove periodic timer */ timer_shutdown_sync(&tbl->periodic_timer); /* got to clean up table entries here */ ip_vs_lblc_flush(svc); /* release the table itself */ kfree_rcu(tbl, rcu_head); IP_VS_DBG(6, "LBLC hash table (memory=%zdbytes) released\n", sizeof(*tbl)); } static inline struct ip_vs_dest * __ip_vs_lblc_schedule(struct ip_vs_service *svc) { struct ip_vs_dest *dest, *least; int loh, doh; /* * We use the following formula to estimate the load: * (dest overhead) / dest->weight * * Remember -- no floats in kernel mode!!! * The comparison of h1*w2 > h2*w1 is equivalent to that of * h1/w1 > h2/w2 * if every weight is larger than zero. * * The server with weight=0 is quiesced and will not receive any * new connection. */ list_for_each_entry_rcu(dest, &svc->destinations, n_list) { if (dest->flags & IP_VS_DEST_F_OVERLOAD) continue; if (atomic_read(&dest->weight) > 0) { least = dest; loh = ip_vs_dest_conn_overhead(least); goto nextstage; } } return NULL; /* * Find the destination with the least load. */ nextstage: list_for_each_entry_continue_rcu(dest, &svc->destinations, n_list) { if (dest->flags & IP_VS_DEST_F_OVERLOAD) continue; doh = ip_vs_dest_conn_overhead(dest); if ((__s64)loh * atomic_read(&dest->weight) > (__s64)doh * atomic_read(&least->weight)) { least = dest; loh = doh; } } IP_VS_DBG_BUF(6, "LBLC: server %s:%d " "activeconns %d refcnt %d weight %d overhead %d\n", IP_VS_DBG_ADDR(least->af, &least->addr), ntohs(least->port), atomic_read(&least->activeconns), refcount_read(&least->refcnt), atomic_read(&least->weight), loh); return least; } /* * If this destination server is overloaded and there is a less loaded * server, then return true. */ static inline int is_overloaded(struct ip_vs_dest *dest, struct ip_vs_service *svc) { if (atomic_read(&dest->activeconns) > atomic_read(&dest->weight)) { struct ip_vs_dest *d; list_for_each_entry_rcu(d, &svc->destinations, n_list) { if (atomic_read(&d->activeconns)*2 < atomic_read(&d->weight)) { return 1; } } } return 0; } /* * Locality-Based (weighted) Least-Connection scheduling */ static struct ip_vs_dest * ip_vs_lblc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb, struct ip_vs_iphdr *iph) { struct ip_vs_lblc_table *tbl = svc->sched_data; struct ip_vs_dest *dest = NULL; struct ip_vs_lblc_entry *en; IP_VS_DBG(6, "%s(): Scheduling...\n", __func__); /* First look in our cache */ en = ip_vs_lblc_get(svc->af, tbl, &iph->daddr); if (en) { /* We only hold a read lock, but this is atomic */ en->lastuse = jiffies; /* * If the destination is not available, i.e. it's in the trash, * we must ignore it, as it may be removed from under our feet, * if someone drops our reference count. Our caller only makes * sure that destinations, that are not in the trash, are not * moved to the trash, while we are scheduling. But anyone can * free up entries from the trash at any time. */ dest = en->dest; if ((dest->flags & IP_VS_DEST_F_AVAILABLE) && atomic_read(&dest->weight) > 0 && !is_overloaded(dest, svc)) goto out; } /* No cache entry or it is invalid, time to schedule */ dest = __ip_vs_lblc_schedule(svc); if (!dest) { ip_vs_scheduler_err(svc, "no destination available"); return NULL; } /* If we fail to create a cache entry, we'll just use the valid dest */ spin_lock_bh(&svc->sched_lock); if (!tbl->dead) ip_vs_lblc_new(tbl, &iph->daddr, svc->af, dest); spin_unlock_bh(&svc->sched_lock); out: IP_VS_DBG_BUF(6, "LBLC: destination IP address %s --> server %s:%d\n", IP_VS_DBG_ADDR(svc->af, &iph->daddr), IP_VS_DBG_ADDR(dest->af, &dest->addr), ntohs(dest->port)); return dest; } /* * IPVS LBLC Scheduler structure */ static struct ip_vs_scheduler ip_vs_lblc_scheduler = { .name = "lblc", .refcnt = ATOMIC_INIT(0), .module = THIS_MODULE, .n_list = LIST_HEAD_INIT(ip_vs_lblc_scheduler.n_list), .init_service = ip_vs_lblc_init_svc, .done_service = ip_vs_lblc_done_svc, .schedule = ip_vs_lblc_schedule, }; /* * per netns init. */ #ifdef CONFIG_SYSCTL static int __net_init __ip_vs_lblc_init(struct net *net) { struct netns_ipvs *ipvs = net_ipvs(net); size_t vars_table_size = ARRAY_SIZE(vs_vars_table); if (!ipvs) return -ENOENT; if (!net_eq(net, &init_net)) { ipvs->lblc_ctl_table = kmemdup(vs_vars_table, sizeof(vs_vars_table), GFP_KERNEL); if (ipvs->lblc_ctl_table == NULL) return -ENOMEM; /* Don't export sysctls to unprivileged users */ if (net->user_ns != &init_user_ns) { ipvs->lblc_ctl_table[0].procname = NULL; vars_table_size = 0; } } else ipvs->lblc_ctl_table = vs_vars_table; ipvs->sysctl_lblc_expiration = DEFAULT_EXPIRATION; ipvs->lblc_ctl_table[0].data = &ipvs->sysctl_lblc_expiration; ipvs->lblc_ctl_header = register_net_sysctl_sz(net, "net/ipv4/vs", ipvs->lblc_ctl_table, vars_table_size); if (!ipvs->lblc_ctl_header) { if (!net_eq(net, &init_net)) kfree(ipvs->lblc_ctl_table); return -ENOMEM; } return 0; } static void __net_exit __ip_vs_lblc_exit(struct net *net) { struct netns_ipvs *ipvs = net_ipvs(net); unregister_net_sysctl_table(ipvs->lblc_ctl_header); if (!net_eq(net, &init_net)) kfree(ipvs->lblc_ctl_table); } #else static int __net_init __ip_vs_lblc_init(struct net *net) { return 0; } static void __net_exit __ip_vs_lblc_exit(struct net *net) { } #endif static struct pernet_operations ip_vs_lblc_ops = { .init = __ip_vs_lblc_init, .exit = __ip_vs_lblc_exit, }; static int __init ip_vs_lblc_init(void) { int ret; ret = register_pernet_subsys(&ip_vs_lblc_ops); if (ret) return ret; ret = register_ip_vs_scheduler(&ip_vs_lblc_scheduler); if (ret) unregister_pernet_subsys(&ip_vs_lblc_ops); return ret; } static void __exit ip_vs_lblc_cleanup(void) { unregister_ip_vs_scheduler(&ip_vs_lblc_scheduler); unregister_pernet_subsys(&ip_vs_lblc_ops); rcu_barrier(); } module_init(ip_vs_lblc_init); module_exit(ip_vs_lblc_cleanup); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("ipvs locality-based least-connection scheduler"); |
25 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __LINUX_BITMAP_H #define __LINUX_BITMAP_H #ifndef __ASSEMBLY__ #include <linux/align.h> #include <linux/bitops.h> #include <linux/errno.h> #include <linux/find.h> #include <linux/limits.h> #include <linux/string.h> #include <linux/types.h> #include <linux/bitmap-str.h> struct device; /* * bitmaps provide bit arrays that consume one or more unsigned * longs. The bitmap interface and available operations are listed * here, in bitmap.h * * Function implementations generic to all architectures are in * lib/bitmap.c. Functions implementations that are architecture * specific are in various include/asm-<arch>/bitops.h headers * and other arch/<arch> specific files. * * See lib/bitmap.c for more details. */ /** * DOC: bitmap overview * * The available bitmap operations and their rough meaning in the * case that the bitmap is a single unsigned long are thus: * * The generated code is more efficient when nbits is known at * compile-time and at most BITS_PER_LONG. * * :: * * bitmap_zero(dst, nbits) *dst = 0UL * bitmap_fill(dst, nbits) *dst = ~0UL * bitmap_copy(dst, src, nbits) *dst = *src * bitmap_and(dst, src1, src2, nbits) *dst = *src1 & *src2 * bitmap_or(dst, src1, src2, nbits) *dst = *src1 | *src2 * bitmap_xor(dst, src1, src2, nbits) *dst = *src1 ^ *src2 * bitmap_andnot(dst, src1, src2, nbits) *dst = *src1 & ~(*src2) * bitmap_complement(dst, src, nbits) *dst = ~(*src) * bitmap_equal(src1, src2, nbits) Are *src1 and *src2 equal? * bitmap_intersects(src1, src2, nbits) Do *src1 and *src2 overlap? * bitmap_subset(src1, src2, nbits) Is *src1 a subset of *src2? * bitmap_empty(src, nbits) Are all bits zero in *src? * bitmap_full(src, nbits) Are all bits set in *src? * bitmap_weight(src, nbits) Hamming Weight: number set bits * bitmap_weight_and(src1, src2, nbits) Hamming Weight of and'ed bitmap * bitmap_set(dst, pos, nbits) Set specified bit area * bitmap_clear(dst, pos, nbits) Clear specified bit area * bitmap_find_next_zero_area(buf, len, pos, n, mask) Find bit free area * bitmap_find_next_zero_area_off(buf, len, pos, n, mask, mask_off) as above * bitmap_shift_right(dst, src, n, nbits) *dst = *src >> n * bitmap_shift_left(dst, src, n, nbits) *dst = *src << n * bitmap_cut(dst, src, first, n, nbits) Cut n bits from first, copy rest * bitmap_replace(dst, old, new, mask, nbits) *dst = (*old & ~(*mask)) | (*new & *mask) * bitmap_remap(dst, src, old, new, nbits) *dst = map(old, new)(src) * bitmap_bitremap(oldbit, old, new, nbits) newbit = map(old, new)(oldbit) * bitmap_onto(dst, orig, relmap, nbits) *dst = orig relative to relmap * bitmap_fold(dst, orig, sz, nbits) dst bits = orig bits mod sz * bitmap_parse(buf, buflen, dst, nbits) Parse bitmap dst from kernel buf * bitmap_parse_user(ubuf, ulen, dst, nbits) Parse bitmap dst from user buf * bitmap_parselist(buf, dst, nbits) Parse bitmap dst from kernel buf * bitmap_parselist_user(buf, dst, nbits) Parse bitmap dst from user buf * bitmap_find_free_region(bitmap, bits, order) Find and allocate bit region * bitmap_release_region(bitmap, pos, order) Free specified bit region * bitmap_allocate_region(bitmap, pos, order) Allocate specified bit region * bitmap_from_arr32(dst, buf, nbits) Copy nbits from u32[] buf to dst * bitmap_from_arr64(dst, buf, nbits) Copy nbits from u64[] buf to dst * bitmap_to_arr32(buf, src, nbits) Copy nbits from buf to u32[] dst * bitmap_to_arr64(buf, src, nbits) Copy nbits from buf to u64[] dst * bitmap_get_value8(map, start) Get 8bit value from map at start * bitmap_set_value8(map, value, start) Set 8bit value to map at start * * Note, bitmap_zero() and bitmap_fill() operate over the region of * unsigned longs, that is, bits behind bitmap till the unsigned long * boundary will be zeroed or filled as well. Consider to use * bitmap_clear() or bitmap_set() to make explicit zeroing or filling * respectively. */ /** * DOC: bitmap bitops * * Also the following operations in asm/bitops.h apply to bitmaps.:: * * set_bit(bit, addr) *addr |= bit * clear_bit(bit, addr) *addr &= ~bit * change_bit(bit, addr) *addr ^= bit * test_bit(bit, addr) Is bit set in *addr? * test_and_set_bit(bit, addr) Set bit and return old value * test_and_clear_bit(bit, addr) Clear bit and return old value * test_and_change_bit(bit, addr) Change bit and return old value * find_first_zero_bit(addr, nbits) Position first zero bit in *addr * find_first_bit(addr, nbits) Position first set bit in *addr * find_next_zero_bit(addr, nbits, bit) * Position next zero bit in *addr >= bit * find_next_bit(addr, nbits, bit) Position next set bit in *addr >= bit * find_next_and_bit(addr1, addr2, nbits, bit) * Same as find_next_bit, but in * (*addr1 & *addr2) * */ /** * DOC: declare bitmap * The DECLARE_BITMAP(name,bits) macro, in linux/types.h, can be used * to declare an array named 'name' of just enough unsigned longs to * contain all bit positions from 0 to 'bits' - 1. */ /* * Allocation and deallocation of bitmap. * Provided in lib/bitmap.c to avoid circular dependency. */ unsigned long *bitmap_alloc(unsigned int nbits, gfp_t flags); unsigned long *bitmap_zalloc(unsigned int nbits, gfp_t flags); unsigned long *bitmap_alloc_node(unsigned int nbits, gfp_t flags, int node); unsigned long *bitmap_zalloc_node(unsigned int nbits, gfp_t flags, int node); void bitmap_free(const unsigned long *bitmap); /* Managed variants of the above. */ unsigned long *devm_bitmap_alloc(struct device *dev, unsigned int nbits, gfp_t flags); unsigned long *devm_bitmap_zalloc(struct device *dev, unsigned int nbits, gfp_t flags); /* * lib/bitmap.c provides these functions: */ bool __bitmap_equal(const unsigned long *bitmap1, const unsigned long *bitmap2, unsigned int nbits); bool __pure __bitmap_or_equal(const unsigned long *src1, const unsigned long *src2, const unsigned long *src3, unsigned int nbits); void __bitmap_complement(unsigned long *dst, const unsigned long *src, unsigned int nbits); void __bitmap_shift_right(unsigned long *dst, const unsigned long *src, unsigned int shift, unsigned int nbits); void __bitmap_shift_left(unsigned long *dst, const unsigned long *src, unsigned int shift, unsigned int nbits); void bitmap_cut(unsigned long *dst, const unsigned long *src, unsigned int first, unsigned int cut, unsigned int nbits); bool __bitmap_and(unsigned long *dst, const unsigned long *bitmap1, const unsigned long *bitmap2, unsigned int nbits); void __bitmap_or(unsigned long *dst, const unsigned long *bitmap1, const unsigned long *bitmap2, unsigned int nbits); void __bitmap_xor(unsigned long *dst, const unsigned long *bitmap1, const unsigned long *bitmap2, unsigned int nbits); bool __bitmap_andnot(unsigned long *dst, const unsigned long *bitmap1, const unsigned long *bitmap2, unsigned int nbits); void __bitmap_replace(unsigned long *dst, const unsigned long *old, const unsigned long *new, const unsigned long *mask, unsigned int nbits); bool __bitmap_intersects(const unsigned long *bitmap1, const unsigned long *bitmap2, unsigned int nbits); bool __bitmap_subset(const unsigned long *bitmap1, const unsigned long *bitmap2, unsigned int nbits); unsigned int __bitmap_weight(const unsigned long *bitmap, unsigned int nbits); unsigned int __bitmap_weight_and(const unsigned long *bitmap1, const unsigned long *bitmap2, unsigned int nbits); void __bitmap_set(unsigned long *map, unsigned int start, int len); void __bitmap_clear(unsigned long *map, unsigned int start, int len); unsigned long bitmap_find_next_zero_area_off(unsigned long *map, unsigned long size, unsigned long start, unsigned int nr, unsigned long align_mask, unsigned long align_offset); /** * bitmap_find_next_zero_area - find a contiguous aligned zero area * @map: The address to base the search on * @size: The bitmap size in bits * @start: The bitnumber to start searching at * @nr: The number of zeroed bits we're looking for * @align_mask: Alignment mask for zero area * * The @align_mask should be one less than a power of 2; the effect is that * the bit offset of all zero areas this function finds is multiples of that * power of 2. A @align_mask of 0 means no alignment is required. */ static inline unsigned long bitmap_find_next_zero_area(unsigned long *map, unsigned long size, unsigned long start, unsigned int nr, unsigned long align_mask) { return bitmap_find_next_zero_area_off(map, size, start, nr, align_mask, 0); } void bitmap_remap(unsigned long *dst, const unsigned long *src, const unsigned long *old, const unsigned long *new, unsigned int nbits); int bitmap_bitremap(int oldbit, const unsigned long *old, const unsigned long *new, int bits); void bitmap_onto(unsigned long *dst, const unsigned long *orig, const unsigned long *relmap, unsigned int bits); void bitmap_fold(unsigned long *dst, const unsigned long *orig, unsigned int sz, unsigned int nbits); #define BITMAP_FIRST_WORD_MASK(start) (~0UL << ((start) & (BITS_PER_LONG - 1))) #define BITMAP_LAST_WORD_MASK(nbits) (~0UL >> (-(nbits) & (BITS_PER_LONG - 1))) static inline void bitmap_zero(unsigned long *dst, unsigned int nbits) { unsigned int len = BITS_TO_LONGS(nbits) * sizeof(unsigned long); if (small_const_nbits(nbits)) *dst = 0; else memset(dst, 0, len); } static inline void bitmap_fill(unsigned long *dst, unsigned int nbits) { unsigned int len = BITS_TO_LONGS(nbits) * sizeof(unsigned long); if (small_const_nbits(nbits)) *dst = ~0UL; else memset(dst, 0xff, len); } static inline void bitmap_copy(unsigned long *dst, const unsigned long *src, unsigned int nbits) { unsigned int len = BITS_TO_LONGS(nbits) * sizeof(unsigned long); if (small_const_nbits(nbits)) *dst = *src; else memcpy(dst, src, len); } /* * Copy bitmap and clear tail bits in last word. */ static inline void bitmap_copy_clear_tail(unsigned long *dst, const unsigned long *src, unsigned int nbits) { bitmap_copy(dst, src, nbits); if (nbits % BITS_PER_LONG) dst[nbits / BITS_PER_LONG] &= BITMAP_LAST_WORD_MASK(nbits); } /* * On 32-bit systems bitmaps are represented as u32 arrays internally. On LE64 * machines the order of hi and lo parts of numbers match the bitmap structure. * In both cases conversion is not needed when copying data from/to arrays of * u32. But in LE64 case, typecast in bitmap_copy_clear_tail() may lead * to out-of-bound access. To avoid that, both LE and BE variants of 64-bit * architectures are not using bitmap_copy_clear_tail(). */ #if BITS_PER_LONG == 64 void bitmap_from_arr32(unsigned long *bitmap, const u32 *buf, unsigned int nbits); void bitmap_to_arr32(u32 *buf, const unsigned long *bitmap, unsigned int nbits); #else #define bitmap_from_arr32(bitmap, buf, nbits) \ bitmap_copy_clear_tail((unsigned long *) (bitmap), \ (const unsigned long *) (buf), (nbits)) #define bitmap_to_arr32(buf, bitmap, nbits) \ bitmap_copy_clear_tail((unsigned long *) (buf), \ (const unsigned long *) (bitmap), (nbits)) #endif /* * On 64-bit systems bitmaps are represented as u64 arrays internally. So, * the conversion is not needed when copying data from/to arrays of u64. */ #if BITS_PER_LONG == 32 void bitmap_from_arr64(unsigned long *bitmap, const u64 *buf, unsigned int nbits); void bitmap_to_arr64(u64 *buf, const unsigned long *bitmap, unsigned int nbits); #else #define bitmap_from_arr64(bitmap, buf, nbits) \ bitmap_copy_clear_tail((unsigned long *)(bitmap), (const unsigned long *)(buf), (nbits)) #define bitmap_to_arr64(buf, bitmap, nbits) \ bitmap_copy_clear_tail((unsigned long *)(buf), (const unsigned long *)(bitmap), (nbits)) #endif static inline bool bitmap_and(unsigned long *dst, const unsigned long *src1, const unsigned long *src2, unsigned int nbits) { if (small_const_nbits(nbits)) return (*dst = *src1 & *src2 & BITMAP_LAST_WORD_MASK(nbits)) != 0; return __bitmap_and(dst, src1, src2, nbits); } static inline void bitmap_or(unsigned long *dst, const unsigned long *src1, const unsigned long *src2, unsigned int nbits) { if (small_const_nbits(nbits)) *dst = *src1 | *src2; else __bitmap_or(dst, src1, src2, nbits); } static inline void bitmap_xor(unsigned long *dst, const unsigned long *src1, const unsigned long *src2, unsigned int nbits) { if (small_const_nbits(nbits)) *dst = *src1 ^ *src2; else __bitmap_xor(dst, src1, src2, nbits); } static inline bool bitmap_andnot(unsigned long *dst, const unsigned long *src1, const unsigned long *src2, unsigned int nbits) { if (small_const_nbits(nbits)) return (*dst = *src1 & ~(*src2) & BITMAP_LAST_WORD_MASK(nbits)) != 0; return __bitmap_andnot(dst, src1, src2, nbits); } static inline void bitmap_complement(unsigned long *dst, const unsigned long *src, unsigned int nbits) { if (small_const_nbits(nbits)) *dst = ~(*src); else __bitmap_complement(dst, src, nbits); } #ifdef __LITTLE_ENDIAN #define BITMAP_MEM_ALIGNMENT 8 #else #define BITMAP_MEM_ALIGNMENT (8 * sizeof(unsigned long)) #endif #define BITMAP_MEM_MASK (BITMAP_MEM_ALIGNMENT - 1) static inline bool bitmap_equal(const unsigned long *src1, const unsigned long *src2, unsigned int nbits) { if (small_const_nbits(nbits)) return !((*src1 ^ *src2) & BITMAP_LAST_WORD_MASK(nbits)); if (__builtin_constant_p(nbits & BITMAP_MEM_MASK) && IS_ALIGNED(nbits, BITMAP_MEM_ALIGNMENT)) return !memcmp(src1, src2, nbits / 8); return __bitmap_equal(src1, src2, nbits); } /** * bitmap_or_equal - Check whether the or of two bitmaps is equal to a third * @src1: Pointer to bitmap 1 * @src2: Pointer to bitmap 2 will be or'ed with bitmap 1 * @src3: Pointer to bitmap 3. Compare to the result of *@src1 | *@src2 * @nbits: number of bits in each of these bitmaps * * Returns: True if (*@src1 | *@src2) == *@src3, false otherwise */ static inline bool bitmap_or_equal(const unsigned long *src1, const unsigned long *src2, const unsigned long *src3, unsigned int nbits) { if (!small_const_nbits(nbits)) return __bitmap_or_equal(src1, src2, src3, nbits); return !(((*src1 | *src2) ^ *src3) & BITMAP_LAST_WORD_MASK(nbits)); } static inline bool bitmap_intersects(const unsigned long *src1, const unsigned long *src2, unsigned int nbits) { if (small_const_nbits(nbits)) return ((*src1 & *src2) & BITMAP_LAST_WORD_MASK(nbits)) != 0; else return __bitmap_intersects(src1, src2, nbits); } static inline bool bitmap_subset(const unsigned long *src1, const unsigned long *src2, unsigned int nbits) { if (small_const_nbits(nbits)) return ! ((*src1 & ~(*src2)) & BITMAP_LAST_WORD_MASK(nbits)); else return __bitmap_subset(src1, src2, nbits); } static inline bool bitmap_empty(const unsigned long *src, unsigned nbits) { if (small_const_nbits(nbits)) return ! (*src & BITMAP_LAST_WORD_MASK(nbits)); return find_first_bit(src, nbits) == nbits; } static inline bool bitmap_full(const unsigned long *src, unsigned int nbits) { if (small_const_nbits(nbits)) return ! (~(*src) & BITMAP_LAST_WORD_MASK(nbits)); return find_first_zero_bit(src, nbits) == nbits; } static __always_inline unsigned int bitmap_weight(const unsigned long *src, unsigned int nbits) { if (small_const_nbits(nbits)) return hweight_long(*src & BITMAP_LAST_WORD_MASK(nbits)); return __bitmap_weight(src, nbits); } static __always_inline unsigned long bitmap_weight_and(const unsigned long *src1, const unsigned long *src2, unsigned int nbits) { if (small_const_nbits(nbits)) return hweight_long(*src1 & *src2 & BITMAP_LAST_WORD_MASK(nbits)); return __bitmap_weight_and(src1, src2, nbits); } static __always_inline void bitmap_set(unsigned long *map, unsigned int start, unsigned int nbits) { if (__builtin_constant_p(nbits) && nbits == 1) __set_bit(start, map); else if (small_const_nbits(start + nbits)) *map |= GENMASK(start + nbits - 1, start); else if (__builtin_constant_p(start & BITMAP_MEM_MASK) && IS_ALIGNED(start, BITMAP_MEM_ALIGNMENT) && __builtin_constant_p(nbits & BITMAP_MEM_MASK) && IS_ALIGNED(nbits, BITMAP_MEM_ALIGNMENT)) memset((char *)map + start / 8, 0xff, nbits / 8); else __bitmap_set(map, start, nbits); } static __always_inline void bitmap_clear(unsigned long *map, unsigned int start, unsigned int nbits) { if (__builtin_constant_p(nbits) && nbits == 1) __clear_bit(start, map); else if (small_const_nbits(start + nbits)) *map &= ~GENMASK(start + nbits - 1, start); else if (__builtin_constant_p(start & BITMAP_MEM_MASK) && IS_ALIGNED(start, BITMAP_MEM_ALIGNMENT) && __builtin_constant_p(nbits & BITMAP_MEM_MASK) && IS_ALIGNED(nbits, BITMAP_MEM_ALIGNMENT)) memset((char *)map + start / 8, 0, nbits / 8); else __bitmap_clear(map, start, nbits); } static inline void bitmap_shift_right(unsigned long *dst, const unsigned long *src, unsigned int shift, unsigned int nbits) { if (small_const_nbits(nbits)) *dst = (*src & BITMAP_LAST_WORD_MASK(nbits)) >> shift; else __bitmap_shift_right(dst, src, shift, nbits); } static inline void bitmap_shift_left(unsigned long *dst, const unsigned long *src, unsigned int shift, unsigned int nbits) { if (small_const_nbits(nbits)) *dst = (*src << shift) & BITMAP_LAST_WORD_MASK(nbits); else __bitmap_shift_left(dst, src, shift, nbits); } static inline void bitmap_replace(unsigned long *dst, const unsigned long *old, const unsigned long *new, const unsigned long *mask, unsigned int nbits) { if (small_const_nbits(nbits)) *dst = (*old & ~(*mask)) | (*new & *mask); else __bitmap_replace(dst, old, new, mask, nbits); } static inline void bitmap_next_set_region(unsigned long *bitmap, unsigned int *rs, unsigned int *re, unsigned int end) { *rs = find_next_bit(bitmap, end, *rs); *re = find_next_zero_bit(bitmap, end, *rs + 1); } /** * bitmap_release_region - release allocated bitmap region * @bitmap: array of unsigned longs corresponding to the bitmap * @pos: beginning of bit region to release * @order: region size (log base 2 of number of bits) to release * * This is the complement to __bitmap_find_free_region() and releases * the found region (by clearing it in the bitmap). */ static inline void bitmap_release_region(unsigned long *bitmap, unsigned int pos, int order) { bitmap_clear(bitmap, pos, BIT(order)); } /** * bitmap_allocate_region - allocate bitmap region * @bitmap: array of unsigned longs corresponding to the bitmap * @pos: beginning of bit region to allocate * @order: region size (log base 2 of number of bits) to allocate * * Allocate (set bits in) a specified region of a bitmap. * * Returns: 0 on success, or %-EBUSY if specified region wasn't * free (not all bits were zero). */ static inline int bitmap_allocate_region(unsigned long *bitmap, unsigned int pos, int order) { unsigned int len = BIT(order); if (find_next_bit(bitmap, pos + len, pos) < pos + len) return -EBUSY; bitmap_set(bitmap, pos, len); return 0; } /** * bitmap_find_free_region - find a contiguous aligned mem region * @bitmap: array of unsigned longs corresponding to the bitmap * @bits: number of bits in the bitmap * @order: region size (log base 2 of number of bits) to find * * Find a region of free (zero) bits in a @bitmap of @bits bits and * allocate them (set them to one). Only consider regions of length * a power (@order) of two, aligned to that power of two, which * makes the search algorithm much faster. * * Returns: the bit offset in bitmap of the allocated region, * or -errno on failure. */ static inline int bitmap_find_free_region(unsigned long *bitmap, unsigned int bits, int order) { unsigned int pos, end; /* scans bitmap by regions of size order */ for (pos = 0; (end = pos + BIT(order)) <= bits; pos = end) { if (!bitmap_allocate_region(bitmap, pos, order)) return pos; } return -ENOMEM; } /** * BITMAP_FROM_U64() - Represent u64 value in the format suitable for bitmap. * @n: u64 value * * Linux bitmaps are internally arrays of unsigned longs, i.e. 32-bit * integers in 32-bit environment, and 64-bit integers in 64-bit one. * * There are four combinations of endianness and length of the word in linux * ABIs: LE64, BE64, LE32 and BE32. * * On 64-bit kernels 64-bit LE and BE numbers are naturally ordered in * bitmaps and therefore don't require any special handling. * * On 32-bit kernels 32-bit LE ABI orders lo word of 64-bit number in memory * prior to hi, and 32-bit BE orders hi word prior to lo. The bitmap on the * other hand is represented as an array of 32-bit words and the position of * bit N may therefore be calculated as: word #(N/32) and bit #(N%32) in that * word. For example, bit #42 is located at 10th position of 2nd word. * It matches 32-bit LE ABI, and we can simply let the compiler store 64-bit * values in memory as it usually does. But for BE we need to swap hi and lo * words manually. * * With all that, the macro BITMAP_FROM_U64() does explicit reordering of hi and * lo parts of u64. For LE32 it does nothing, and for BE environment it swaps * hi and lo words, as is expected by bitmap. */ #if __BITS_PER_LONG == 64 #define BITMAP_FROM_U64(n) (n) #else #define BITMAP_FROM_U64(n) ((unsigned long) ((u64)(n) & ULONG_MAX)), \ ((unsigned long) ((u64)(n) >> 32)) #endif /** * bitmap_from_u64 - Check and swap words within u64. * @mask: source bitmap * @dst: destination bitmap * * In 32-bit Big Endian kernel, when using ``(u32 *)(&val)[*]`` * to read u64 mask, we will get the wrong word. * That is ``(u32 *)(&val)[0]`` gets the upper 32 bits, * but we expect the lower 32-bits of u64. */ static inline void bitmap_from_u64(unsigned long *dst, u64 mask) { bitmap_from_arr64(dst, &mask, 64); } /** * bitmap_get_value8 - get an 8-bit value within a memory region * @map: address to the bitmap memory region * @start: bit offset of the 8-bit value; must be a multiple of 8 * * Returns the 8-bit value located at the @start bit offset within the @src * memory region. */ static inline unsigned long bitmap_get_value8(const unsigned long *map, unsigned long start) { const size_t index = BIT_WORD(start); const unsigned long offset = start % BITS_PER_LONG; return (map[index] >> offset) & 0xFF; } /** * bitmap_set_value8 - set an 8-bit value within a memory region * @map: address to the bitmap memory region * @value: the 8-bit value; values wider than 8 bits may clobber bitmap * @start: bit offset of the 8-bit value; must be a multiple of 8 */ static inline void bitmap_set_value8(unsigned long *map, unsigned long value, unsigned long start) { const size_t index = BIT_WORD(start); const unsigned long offset = start % BITS_PER_LONG; map[index] &= ~(0xFFUL << offset); map[index] |= value << offset; } #endif /* __ASSEMBLY__ */ #endif /* __LINUX_BITMAP_H */ |
3968 3968 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 | // SPDX-License-Identifier: GPL-2.0-or-later /* * Derived from arch/ppc/mm/extable.c and arch/i386/mm/extable.c. * * Copyright (C) 2004 Paul Mackerras, IBM Corp. */ #include <linux/bsearch.h> #include <linux/module.h> #include <linux/init.h> #include <linux/sort.h> #include <linux/uaccess.h> #include <linux/extable.h> #ifndef ARCH_HAS_RELATIVE_EXTABLE #define ex_to_insn(x) ((x)->insn) #else static inline unsigned long ex_to_insn(const struct exception_table_entry *x) { return (unsigned long)&x->insn + x->insn; } #endif #ifndef ARCH_HAS_RELATIVE_EXTABLE #define swap_ex NULL #else static void swap_ex(void *a, void *b, int size) { struct exception_table_entry *x = a, *y = b, tmp; int delta = b - a; tmp = *x; x->insn = y->insn + delta; y->insn = tmp.insn - delta; #ifdef swap_ex_entry_fixup swap_ex_entry_fixup(x, y, tmp, delta); #else x->fixup = y->fixup + delta; y->fixup = tmp.fixup - delta; #endif } #endif /* ARCH_HAS_RELATIVE_EXTABLE */ /* * The exception table needs to be sorted so that the binary * search that we use to find entries in it works properly. * This is used both for the kernel exception table and for * the exception tables of modules that get loaded. */ static int cmp_ex_sort(const void *a, const void *b) { const struct exception_table_entry *x = a, *y = b; /* avoid overflow */ if (ex_to_insn(x) > ex_to_insn(y)) return 1; if (ex_to_insn(x) < ex_to_insn(y)) return -1; return 0; } void sort_extable(struct exception_table_entry *start, struct exception_table_entry *finish) { sort(start, finish - start, sizeof(struct exception_table_entry), cmp_ex_sort, swap_ex); } #ifdef CONFIG_MODULES /* * If the exception table is sorted, any referring to the module init * will be at the beginning or the end. */ void trim_init_extable(struct module *m) { /*trim the beginning*/ while (m->num_exentries && within_module_init(ex_to_insn(&m->extable[0]), m)) { m->extable++; m->num_exentries--; } /*trim the end*/ while (m->num_exentries && within_module_init(ex_to_insn(&m->extable[m->num_exentries - 1]), m)) m->num_exentries--; } #endif /* CONFIG_MODULES */ static int cmp_ex_search(const void *key, const void *elt) { const struct exception_table_entry *_elt = elt; unsigned long _key = *(unsigned long *)key; /* avoid overflow */ if (_key > ex_to_insn(_elt)) return 1; if (_key < ex_to_insn(_elt)) return -1; return 0; } /* * Search one exception table for an entry corresponding to the * given instruction address, and return the address of the entry, * or NULL if none is found. * We use a binary search, and thus we assume that the table is * already sorted. */ const struct exception_table_entry * search_extable(const struct exception_table_entry *base, const size_t num, unsigned long value) { return bsearch(&value, base, num, sizeof(struct exception_table_entry), cmp_ex_search); } |
56 10 46 8 8 8 8 1 2 1 4 4 1 2 2 8 8 1 7 2 2 2 2 1 1 9 1 1 3 1 1 2 3 2 4 2 10 10 4 5 1 7 7 4 3 4 1 4 1 4 1 4 8 8 7 2 5 65 8 57 1 4 8 2 2 9 10 7 7 4 9 24 2 19 18 9 9 51 2 3 8 27 6 5 38 21 4 8 32 6 56 3 1 9 44 12 40 12 41 9 42 54 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 | // SPDX-License-Identifier: GPL-2.0 /* * Quota code necessary even when VFS quota support is not compiled * into the kernel. The interesting stuff is over in dquot.c, here * we have symbols for initial quotactl(2) handling, the sysctl(2) * variables, etc - things needed even when quota support disabled. */ #include <linux/fs.h> #include <linux/namei.h> #include <linux/slab.h> #include <asm/current.h> #include <linux/blkdev.h> #include <linux/uaccess.h> #include <linux/kernel.h> #include <linux/security.h> #include <linux/syscalls.h> #include <linux/capability.h> #include <linux/quotaops.h> #include <linux/types.h> #include <linux/mount.h> #include <linux/writeback.h> #include <linux/nospec.h> #include "compat.h" #include "../internal.h" static int check_quotactl_permission(struct super_block *sb, int type, int cmd, qid_t id) { switch (cmd) { /* these commands do not require any special privilegues */ case Q_GETFMT: case Q_SYNC: case Q_GETINFO: case Q_XGETQSTAT: case Q_XGETQSTATV: case Q_XQUOTASYNC: break; /* allow to query information for dquots we "own" */ case Q_GETQUOTA: case Q_XGETQUOTA: if ((type == USRQUOTA && uid_eq(current_euid(), make_kuid(current_user_ns(), id))) || (type == GRPQUOTA && in_egroup_p(make_kgid(current_user_ns(), id)))) break; fallthrough; default: if (!capable(CAP_SYS_ADMIN)) return -EPERM; } return security_quotactl(cmd, type, id, sb); } static void quota_sync_one(struct super_block *sb, void *arg) { int type = *(int *)arg; if (sb->s_qcop && sb->s_qcop->quota_sync && (sb->s_quota_types & (1 << type))) sb->s_qcop->quota_sync(sb, type); } static int quota_sync_all(int type) { int ret; ret = security_quotactl(Q_SYNC, type, 0, NULL); if (!ret) iterate_supers(quota_sync_one, &type); return ret; } unsigned int qtype_enforce_flag(int type) { switch (type) { case USRQUOTA: return FS_QUOTA_UDQ_ENFD; case GRPQUOTA: return FS_QUOTA_GDQ_ENFD; case PRJQUOTA: return FS_QUOTA_PDQ_ENFD; } return 0; } static int quota_quotaon(struct super_block *sb, int type, qid_t id, const struct path *path) { if (!sb->s_qcop->quota_on && !sb->s_qcop->quota_enable) return -ENOSYS; if (sb->s_qcop->quota_enable) return sb->s_qcop->quota_enable(sb, qtype_enforce_flag(type)); if (IS_ERR(path)) return PTR_ERR(path); return sb->s_qcop->quota_on(sb, type, id, path); } static int quota_quotaoff(struct super_block *sb, int type) { if (!sb->s_qcop->quota_off && !sb->s_qcop->quota_disable) return -ENOSYS; if (sb->s_qcop->quota_disable) return sb->s_qcop->quota_disable(sb, qtype_enforce_flag(type)); return sb->s_qcop->quota_off(sb, type); } static int quota_getfmt(struct super_block *sb, int type, void __user *addr) { __u32 fmt; if (!sb_has_quota_active(sb, type)) return -ESRCH; fmt = sb_dqopt(sb)->info[type].dqi_format->qf_fmt_id; if (copy_to_user(addr, &fmt, sizeof(fmt))) return -EFAULT; return 0; } static int quota_getinfo(struct super_block *sb, int type, void __user *addr) { struct qc_state state; struct qc_type_state *tstate; struct if_dqinfo uinfo; int ret; if (!sb->s_qcop->get_state) return -ENOSYS; ret = sb->s_qcop->get_state(sb, &state); if (ret) return ret; tstate = state.s_state + type; if (!(tstate->flags & QCI_ACCT_ENABLED)) return -ESRCH; memset(&uinfo, 0, sizeof(uinfo)); uinfo.dqi_bgrace = tstate->spc_timelimit; uinfo.dqi_igrace = tstate->ino_timelimit; if (tstate->flags & QCI_SYSFILE) uinfo.dqi_flags |= DQF_SYS_FILE; if (tstate->flags & QCI_ROOT_SQUASH) uinfo.dqi_flags |= DQF_ROOT_SQUASH; uinfo.dqi_valid = IIF_ALL; if (copy_to_user(addr, &uinfo, sizeof(uinfo))) return -EFAULT; return 0; } static int quota_setinfo(struct super_block *sb, int type, void __user *addr) { struct if_dqinfo info; struct qc_info qinfo; if (copy_from_user(&info, addr, sizeof(info))) return -EFAULT; if (!sb->s_qcop->set_info) return -ENOSYS; if (info.dqi_valid & ~(IIF_FLAGS | IIF_BGRACE | IIF_IGRACE)) return -EINVAL; memset(&qinfo, 0, sizeof(qinfo)); if (info.dqi_valid & IIF_FLAGS) { if (info.dqi_flags & ~DQF_SETINFO_MASK) return -EINVAL; if (info.dqi_flags & DQF_ROOT_SQUASH) qinfo.i_flags |= QCI_ROOT_SQUASH; qinfo.i_fieldmask |= QC_FLAGS; } if (info.dqi_valid & IIF_BGRACE) { qinfo.i_spc_timelimit = info.dqi_bgrace; qinfo.i_fieldmask |= QC_SPC_TIMER; } if (info.dqi_valid & IIF_IGRACE) { qinfo.i_ino_timelimit = info.dqi_igrace; qinfo.i_fieldmask |= QC_INO_TIMER; } return sb->s_qcop->set_info(sb, type, &qinfo); } static inline qsize_t qbtos(qsize_t blocks) { return blocks << QIF_DQBLKSIZE_BITS; } static inline qsize_t stoqb(qsize_t space) { return (space + QIF_DQBLKSIZE - 1) >> QIF_DQBLKSIZE_BITS; } static void copy_to_if_dqblk(struct if_dqblk *dst, struct qc_dqblk *src) { memset(dst, 0, sizeof(*dst)); dst->dqb_bhardlimit = stoqb(src->d_spc_hardlimit); dst->dqb_bsoftlimit = stoqb(src->d_spc_softlimit); dst->dqb_curspace = src->d_space; dst->dqb_ihardlimit = src->d_ino_hardlimit; dst->dqb_isoftlimit = src->d_ino_softlimit; dst->dqb_curinodes = src->d_ino_count; dst->dqb_btime = src->d_spc_timer; dst->dqb_itime = src->d_ino_timer; dst->dqb_valid = QIF_ALL; } static int quota_getquota(struct super_block *sb, int type, qid_t id, void __user *addr) { struct kqid qid; struct qc_dqblk fdq; struct if_dqblk idq; int ret; if (!sb->s_qcop->get_dqblk) return -ENOSYS; qid = make_kqid(current_user_ns(), type, id); if (!qid_has_mapping(sb->s_user_ns, qid)) return -EINVAL; ret = sb->s_qcop->get_dqblk(sb, qid, &fdq); if (ret) return ret; copy_to_if_dqblk(&idq, &fdq); if (compat_need_64bit_alignment_fixup()) { struct compat_if_dqblk __user *compat_dqblk = addr; if (copy_to_user(compat_dqblk, &idq, sizeof(*compat_dqblk))) return -EFAULT; if (put_user(idq.dqb_valid, &compat_dqblk->dqb_valid)) return -EFAULT; } else { if (copy_to_user(addr, &idq, sizeof(idq))) return -EFAULT; } return 0; } /* * Return quota for next active quota >= this id, if any exists, * otherwise return -ENOENT via ->get_nextdqblk */ static int quota_getnextquota(struct super_block *sb, int type, qid_t id, void __user *addr) { struct kqid qid; struct qc_dqblk fdq; struct if_nextdqblk idq; int ret; if (!sb->s_qcop->get_nextdqblk) return -ENOSYS; qid = make_kqid(current_user_ns(), type, id); if (!qid_has_mapping(sb->s_user_ns, qid)) return -EINVAL; ret = sb->s_qcop->get_nextdqblk(sb, &qid, &fdq); if (ret) return ret; /* struct if_nextdqblk is a superset of struct if_dqblk */ copy_to_if_dqblk((struct if_dqblk *)&idq, &fdq); idq.dqb_id = from_kqid(current_user_ns(), qid); if (copy_to_user(addr, &idq, sizeof(idq))) return -EFAULT; return 0; } static void copy_from_if_dqblk(struct qc_dqblk *dst, struct if_dqblk *src) { dst->d_spc_hardlimit = qbtos(src->dqb_bhardlimit); dst->d_spc_softlimit = qbtos(src->dqb_bsoftlimit); dst->d_space = src->dqb_curspace; dst->d_ino_hardlimit = src->dqb_ihardlimit; dst->d_ino_softlimit = src->dqb_isoftlimit; dst->d_ino_count = src->dqb_curinodes; dst->d_spc_timer = src->dqb_btime; dst->d_ino_timer = src->dqb_itime; dst->d_fieldmask = 0; if (src->dqb_valid & QIF_BLIMITS) dst->d_fieldmask |= QC_SPC_SOFT | QC_SPC_HARD; if (src->dqb_valid & QIF_SPACE) dst->d_fieldmask |= QC_SPACE; if (src->dqb_valid & QIF_ILIMITS) dst->d_fieldmask |= QC_INO_SOFT | QC_INO_HARD; if (src->dqb_valid & QIF_INODES) dst->d_fieldmask |= QC_INO_COUNT; if (src->dqb_valid & QIF_BTIME) dst->d_fieldmask |= QC_SPC_TIMER; if (src->dqb_valid & QIF_ITIME) dst->d_fieldmask |= QC_INO_TIMER; } static int quota_setquota(struct super_block *sb, int type, qid_t id, void __user *addr) { struct qc_dqblk fdq; struct if_dqblk idq; struct kqid qid; if (compat_need_64bit_alignment_fixup()) { struct compat_if_dqblk __user *compat_dqblk = addr; if (copy_from_user(&idq, compat_dqblk, sizeof(*compat_dqblk)) || get_user(idq.dqb_valid, &compat_dqblk->dqb_valid)) return -EFAULT; } else { if (copy_from_user(&idq, addr, sizeof(idq))) return -EFAULT; } if (!sb->s_qcop->set_dqblk) return -ENOSYS; qid = make_kqid(current_user_ns(), type, id); if (!qid_has_mapping(sb->s_user_ns, qid)) return -EINVAL; copy_from_if_dqblk(&fdq, &idq); return sb->s_qcop->set_dqblk(sb, qid, &fdq); } static int quota_enable(struct super_block *sb, void __user *addr) { __u32 flags; if (copy_from_user(&flags, addr, sizeof(flags))) return -EFAULT; if (!sb->s_qcop->quota_enable) return -ENOSYS; return sb->s_qcop->quota_enable(sb, flags); } static int quota_disable(struct super_block *sb, void __user *addr) { __u32 flags; if (copy_from_user(&flags, addr, sizeof(flags))) return -EFAULT; if (!sb->s_qcop->quota_disable) return -ENOSYS; return sb->s_qcop->quota_disable(sb, flags); } static int quota_state_to_flags(struct qc_state *state) { int flags = 0; if (state->s_state[USRQUOTA].flags & QCI_ACCT_ENABLED) flags |= FS_QUOTA_UDQ_ACCT; if (state->s_state[USRQUOTA].flags & QCI_LIMITS_ENFORCED) flags |= FS_QUOTA_UDQ_ENFD; if (state->s_state[GRPQUOTA].flags & QCI_ACCT_ENABLED) flags |= FS_QUOTA_GDQ_ACCT; if (state->s_state[GRPQUOTA].flags & QCI_LIMITS_ENFORCED) flags |= FS_QUOTA_GDQ_ENFD; if (state->s_state[PRJQUOTA].flags & QCI_ACCT_ENABLED) flags |= FS_QUOTA_PDQ_ACCT; if (state->s_state[PRJQUOTA].flags & QCI_LIMITS_ENFORCED) flags |= FS_QUOTA_PDQ_ENFD; return flags; } static int quota_getstate(struct super_block *sb, int type, struct fs_quota_stat *fqs) { struct qc_state state; int ret; memset(&state, 0, sizeof (struct qc_state)); ret = sb->s_qcop->get_state(sb, &state); if (ret < 0) return ret; memset(fqs, 0, sizeof(*fqs)); fqs->qs_version = FS_QSTAT_VERSION; fqs->qs_flags = quota_state_to_flags(&state); /* No quota enabled? */ if (!fqs->qs_flags) return -ENOSYS; fqs->qs_incoredqs = state.s_incoredqs; fqs->qs_btimelimit = state.s_state[type].spc_timelimit; fqs->qs_itimelimit = state.s_state[type].ino_timelimit; fqs->qs_rtbtimelimit = state.s_state[type].rt_spc_timelimit; fqs->qs_bwarnlimit = state.s_state[type].spc_warnlimit; fqs->qs_iwarnlimit = state.s_state[type].ino_warnlimit; /* Inodes may be allocated even if inactive; copy out if present */ if (state.s_state[USRQUOTA].ino) { fqs->qs_uquota.qfs_ino = state.s_state[USRQUOTA].ino; fqs->qs_uquota.qfs_nblks = state.s_state[USRQUOTA].blocks; fqs->qs_uquota.qfs_nextents = state.s_state[USRQUOTA].nextents; } if (state.s_state[GRPQUOTA].ino) { fqs->qs_gquota.qfs_ino = state.s_state[GRPQUOTA].ino; fqs->qs_gquota.qfs_nblks = state.s_state[GRPQUOTA].blocks; fqs->qs_gquota.qfs_nextents = state.s_state[GRPQUOTA].nextents; } if (state.s_state[PRJQUOTA].ino) { /* * Q_XGETQSTAT doesn't have room for both group and project * quotas. So, allow the project quota values to be copied out * only if there is no group quota information available. */ if (!(state.s_state[GRPQUOTA].flags & QCI_ACCT_ENABLED)) { fqs->qs_gquota.qfs_ino = state.s_state[PRJQUOTA].ino; fqs->qs_gquota.qfs_nblks = state.s_state[PRJQUOTA].blocks; fqs->qs_gquota.qfs_nextents = state.s_state[PRJQUOTA].nextents; } } return 0; } static int compat_copy_fs_qfilestat(struct compat_fs_qfilestat __user *to, struct fs_qfilestat *from) { if (copy_to_user(to, from, sizeof(*to)) || put_user(from->qfs_nextents, &to->qfs_nextents)) return -EFAULT; return 0; } static int compat_copy_fs_quota_stat(struct compat_fs_quota_stat __user *to, struct fs_quota_stat *from) { if (put_user(from->qs_version, &to->qs_version) || put_user(from->qs_flags, &to->qs_flags) || put_user(from->qs_pad, &to->qs_pad) || compat_copy_fs_qfilestat(&to->qs_uquota, &from->qs_uquota) || compat_copy_fs_qfilestat(&to->qs_gquota, &from->qs_gquota) || put_user(from->qs_incoredqs, &to->qs_incoredqs) || put_user(from->qs_btimelimit, &to->qs_btimelimit) || put_user(from->qs_itimelimit, &to->qs_itimelimit) || put_user(from->qs_rtbtimelimit, &to->qs_rtbtimelimit) || put_user(from->qs_bwarnlimit, &to->qs_bwarnlimit) || put_user(from->qs_iwarnlimit, &to->qs_iwarnlimit)) return -EFAULT; return 0; } static int quota_getxstate(struct super_block *sb, int type, void __user *addr) { struct fs_quota_stat fqs; int ret; if (!sb->s_qcop->get_state) return -ENOSYS; ret = quota_getstate(sb, type, &fqs); if (ret) return ret; if (compat_need_64bit_alignment_fixup()) return compat_copy_fs_quota_stat(addr, &fqs); if (copy_to_user(addr, &fqs, sizeof(fqs))) return -EFAULT; return 0; } static int quota_getstatev(struct super_block *sb, int type, struct fs_quota_statv *fqs) { struct qc_state state; int ret; memset(&state, 0, sizeof (struct qc_state)); ret = sb->s_qcop->get_state(sb, &state); if (ret < 0) return ret; memset(fqs, 0, sizeof(*fqs)); fqs->qs_version = FS_QSTAT_VERSION; fqs->qs_flags = quota_state_to_flags(&state); /* No quota enabled? */ if (!fqs->qs_flags) return -ENOSYS; fqs->qs_incoredqs = state.s_incoredqs; fqs->qs_btimelimit = state.s_state[type].spc_timelimit; fqs->qs_itimelimit = state.s_state[type].ino_timelimit; fqs->qs_rtbtimelimit = state.s_state[type].rt_spc_timelimit; fqs->qs_bwarnlimit = state.s_state[type].spc_warnlimit; fqs->qs_iwarnlimit = state.s_state[type].ino_warnlimit; fqs->qs_rtbwarnlimit = state.s_state[type].rt_spc_warnlimit; /* Inodes may be allocated even if inactive; copy out if present */ if (state.s_state[USRQUOTA].ino) { fqs->qs_uquota.qfs_ino = state.s_state[USRQUOTA].ino; fqs->qs_uquota.qfs_nblks = state.s_state[USRQUOTA].blocks; fqs->qs_uquota.qfs_nextents = state.s_state[USRQUOTA].nextents; } if (state.s_state[GRPQUOTA].ino) { fqs->qs_gquota.qfs_ino = state.s_state[GRPQUOTA].ino; fqs->qs_gquota.qfs_nblks = state.s_state[GRPQUOTA].blocks; fqs->qs_gquota.qfs_nextents = state.s_state[GRPQUOTA].nextents; } if (state.s_state[PRJQUOTA].ino) { fqs->qs_pquota.qfs_ino = state.s_state[PRJQUOTA].ino; fqs->qs_pquota.qfs_nblks = state.s_state[PRJQUOTA].blocks; fqs->qs_pquota.qfs_nextents = state.s_state[PRJQUOTA].nextents; } return 0; } static int quota_getxstatev(struct super_block *sb, int type, void __user *addr) { struct fs_quota_statv fqs; int ret; if (!sb->s_qcop->get_state) return -ENOSYS; memset(&fqs, 0, sizeof(fqs)); if (copy_from_user(&fqs, addr, 1)) /* Just read qs_version */ return -EFAULT; /* If this kernel doesn't support user specified version, fail */ switch (fqs.qs_version) { case FS_QSTATV_VERSION1: break; default: return -EINVAL; } ret = quota_getstatev(sb, type, &fqs); if (!ret && copy_to_user(addr, &fqs, sizeof(fqs))) return -EFAULT; return ret; } /* * XFS defines BBTOB and BTOBB macros inside fs/xfs/ and we cannot move them * out of there as xfsprogs rely on definitions being in that header file. So * just define same functions here for quota purposes. */ #define XFS_BB_SHIFT 9 static inline u64 quota_bbtob(u64 blocks) { return blocks << XFS_BB_SHIFT; } static inline u64 quota_btobb(u64 bytes) { return (bytes + (1 << XFS_BB_SHIFT) - 1) >> XFS_BB_SHIFT; } static inline s64 copy_from_xfs_dqblk_ts(const struct fs_disk_quota *d, __s32 timer, __s8 timer_hi) { if (d->d_fieldmask & FS_DQ_BIGTIME) return (u32)timer | (s64)timer_hi << 32; return timer; } static void copy_from_xfs_dqblk(struct qc_dqblk *dst, struct fs_disk_quota *src) { dst->d_spc_hardlimit = quota_bbtob(src->d_blk_hardlimit); dst->d_spc_softlimit = quota_bbtob(src->d_blk_softlimit); dst->d_ino_hardlimit = src->d_ino_hardlimit; dst->d_ino_softlimit = src->d_ino_softlimit; dst->d_space = quota_bbtob(src->d_bcount); dst->d_ino_count = src->d_icount; dst->d_ino_timer = copy_from_xfs_dqblk_ts(src, src->d_itimer, src->d_itimer_hi); dst->d_spc_timer = copy_from_xfs_dqblk_ts(src, src->d_btimer, src->d_btimer_hi); dst->d_ino_warns = src->d_iwarns; dst->d_spc_warns = src->d_bwarns; dst->d_rt_spc_hardlimit = quota_bbtob(src->d_rtb_hardlimit); dst->d_rt_spc_softlimit = quota_bbtob(src->d_rtb_softlimit); dst->d_rt_space = quota_bbtob(src->d_rtbcount); dst->d_rt_spc_timer = copy_from_xfs_dqblk_ts(src, src->d_rtbtimer, src->d_rtbtimer_hi); dst->d_rt_spc_warns = src->d_rtbwarns; dst->d_fieldmask = 0; if (src->d_fieldmask & FS_DQ_ISOFT) dst->d_fieldmask |= QC_INO_SOFT; if (src->d_fieldmask & FS_DQ_IHARD) dst->d_fieldmask |= QC_INO_HARD; if (src->d_fieldmask & FS_DQ_BSOFT) dst->d_fieldmask |= QC_SPC_SOFT; if (src->d_fieldmask & FS_DQ_BHARD) dst->d_fieldmask |= QC_SPC_HARD; if (src->d_fieldmask & FS_DQ_RTBSOFT) dst->d_fieldmask |= QC_RT_SPC_SOFT; if (src->d_fieldmask & FS_DQ_RTBHARD) dst->d_fieldmask |= QC_RT_SPC_HARD; if (src->d_fieldmask & FS_DQ_BTIMER) dst->d_fieldmask |= QC_SPC_TIMER; if (src->d_fieldmask & FS_DQ_ITIMER) dst->d_fieldmask |= QC_INO_TIMER; if (src->d_fieldmask & FS_DQ_RTBTIMER) dst->d_fieldmask |= QC_RT_SPC_TIMER; if (src->d_fieldmask & FS_DQ_BWARNS) dst->d_fieldmask |= QC_SPC_WARNS; if (src->d_fieldmask & FS_DQ_IWARNS) dst->d_fieldmask |= QC_INO_WARNS; if (src->d_fieldmask & FS_DQ_RTBWARNS) dst->d_fieldmask |= QC_RT_SPC_WARNS; if (src->d_fieldmask & FS_DQ_BCOUNT) dst->d_fieldmask |= QC_SPACE; if (src->d_fieldmask & FS_DQ_ICOUNT) dst->d_fieldmask |= QC_INO_COUNT; if (src->d_fieldmask & FS_DQ_RTBCOUNT) dst->d_fieldmask |= QC_RT_SPACE; } static void copy_qcinfo_from_xfs_dqblk(struct qc_info *dst, struct fs_disk_quota *src) { memset(dst, 0, sizeof(*dst)); dst->i_spc_timelimit = src->d_btimer; dst->i_ino_timelimit = src->d_itimer; dst->i_rt_spc_timelimit = src->d_rtbtimer; dst->i_ino_warnlimit = src->d_iwarns; dst->i_spc_warnlimit = src->d_bwarns; dst->i_rt_spc_warnlimit = src->d_rtbwarns; if (src->d_fieldmask & FS_DQ_BWARNS) dst->i_fieldmask |= QC_SPC_WARNS; if (src->d_fieldmask & FS_DQ_IWARNS) dst->i_fieldmask |= QC_INO_WARNS; if (src->d_fieldmask & FS_DQ_RTBWARNS) dst->i_fieldmask |= QC_RT_SPC_WARNS; if (src->d_fieldmask & FS_DQ_BTIMER) dst->i_fieldmask |= QC_SPC_TIMER; if (src->d_fieldmask & FS_DQ_ITIMER) dst->i_fieldmask |= QC_INO_TIMER; if (src->d_fieldmask & FS_DQ_RTBTIMER) dst->i_fieldmask |= QC_RT_SPC_TIMER; } static int quota_setxquota(struct super_block *sb, int type, qid_t id, void __user *addr) { struct fs_disk_quota fdq; struct qc_dqblk qdq; struct kqid qid; if (copy_from_user(&fdq, addr, sizeof(fdq))) return -EFAULT; if (!sb->s_qcop->set_dqblk) return -ENOSYS; qid = make_kqid(current_user_ns(), type, id); if (!qid_has_mapping(sb->s_user_ns, qid)) return -EINVAL; /* Are we actually setting timer / warning limits for all users? */ if (from_kqid(sb->s_user_ns, qid) == 0 && fdq.d_fieldmask & (FS_DQ_WARNS_MASK | FS_DQ_TIMER_MASK)) { struct qc_info qinfo; int ret; if (!sb->s_qcop->set_info) return -EINVAL; copy_qcinfo_from_xfs_dqblk(&qinfo, &fdq); ret = sb->s_qcop->set_info(sb, type, &qinfo); if (ret) return ret; /* These are already done */ fdq.d_fieldmask &= ~(FS_DQ_WARNS_MASK | FS_DQ_TIMER_MASK); } copy_from_xfs_dqblk(&qdq, &fdq); return sb->s_qcop->set_dqblk(sb, qid, &qdq); } static inline void copy_to_xfs_dqblk_ts(const struct fs_disk_quota *d, __s32 *timer_lo, __s8 *timer_hi, s64 timer) { *timer_lo = timer; if (d->d_fieldmask & FS_DQ_BIGTIME) *timer_hi = timer >> 32; } static inline bool want_bigtime(s64 timer) { return timer > S32_MAX || timer < S32_MIN; } static void copy_to_xfs_dqblk(struct fs_disk_quota *dst, struct qc_dqblk *src, int type, qid_t id) { memset(dst, 0, sizeof(*dst)); if (want_bigtime(src->d_ino_timer) || want_bigtime(src->d_spc_timer) || want_bigtime(src->d_rt_spc_timer)) dst->d_fieldmask |= FS_DQ_BIGTIME; dst->d_version = FS_DQUOT_VERSION; dst->d_id = id; if (type == USRQUOTA) dst->d_flags = FS_USER_QUOTA; else if (type == PRJQUOTA) dst->d_flags = FS_PROJ_QUOTA; else dst->d_flags = FS_GROUP_QUOTA; dst->d_blk_hardlimit = quota_btobb(src->d_spc_hardlimit); dst->d_blk_softlimit = quota_btobb(src->d_spc_softlimit); dst->d_ino_hardlimit = src->d_ino_hardlimit; dst->d_ino_softlimit = src->d_ino_softlimit; dst->d_bcount = quota_btobb(src->d_space); dst->d_icount = src->d_ino_count; copy_to_xfs_dqblk_ts(dst, &dst->d_itimer, &dst->d_itimer_hi, src->d_ino_timer); copy_to_xfs_dqblk_ts(dst, &dst->d_btimer, &dst->d_btimer_hi, src->d_spc_timer); dst->d_iwarns = src->d_ino_warns; dst->d_bwarns = src->d_spc_warns; dst->d_rtb_hardlimit = quota_btobb(src->d_rt_spc_hardlimit); dst->d_rtb_softlimit = quota_btobb(src->d_rt_spc_softlimit); dst->d_rtbcount = quota_btobb(src->d_rt_space); copy_to_xfs_dqblk_ts(dst, &dst->d_rtbtimer, &dst->d_rtbtimer_hi, src->d_rt_spc_timer); dst->d_rtbwarns = src->d_rt_spc_warns; } static int quota_getxquota(struct super_block *sb, int type, qid_t id, void __user *addr) { struct fs_disk_quota fdq; struct qc_dqblk qdq; struct kqid qid; int ret; if (!sb->s_qcop->get_dqblk) return -ENOSYS; qid = make_kqid(current_user_ns(), type, id); if (!qid_has_mapping(sb->s_user_ns, qid)) return -EINVAL; ret = sb->s_qcop->get_dqblk(sb, qid, &qdq); if (ret) return ret; copy_to_xfs_dqblk(&fdq, &qdq, type, id); if (copy_to_user(addr, &fdq, sizeof(fdq))) return -EFAULT; return ret; } /* * Return quota for next active quota >= this id, if any exists, * otherwise return -ENOENT via ->get_nextdqblk. */ static int quota_getnextxquota(struct super_block *sb, int type, qid_t id, void __user *addr) { struct fs_disk_quota fdq; struct qc_dqblk qdq; struct kqid qid; qid_t id_out; int ret; if (!sb->s_qcop->get_nextdqblk) return -ENOSYS; qid = make_kqid(current_user_ns(), type, id); if (!qid_has_mapping(sb->s_user_ns, qid)) return -EINVAL; ret = sb->s_qcop->get_nextdqblk(sb, &qid, &qdq); if (ret) return ret; id_out = from_kqid(current_user_ns(), qid); copy_to_xfs_dqblk(&fdq, &qdq, type, id_out); if (copy_to_user(addr, &fdq, sizeof(fdq))) return -EFAULT; return ret; } static int quota_rmxquota(struct super_block *sb, void __user *addr) { __u32 flags; if (copy_from_user(&flags, addr, sizeof(flags))) return -EFAULT; if (!sb->s_qcop->rm_xquota) return -ENOSYS; return sb->s_qcop->rm_xquota(sb, flags); } /* Copy parameters and call proper function */ static int do_quotactl(struct super_block *sb, int type, int cmd, qid_t id, void __user *addr, const struct path *path) { int ret; type = array_index_nospec(type, MAXQUOTAS); /* * Quota not supported on this fs? Check this before s_quota_types * since they needn't be set if quota is not supported at all. */ if (!sb->s_qcop) return -ENOSYS; if (!(sb->s_quota_types & (1 << type))) return -EINVAL; ret = check_quotactl_permission(sb, type, cmd, id); if (ret < 0) return ret; switch (cmd) { case Q_QUOTAON: return quota_quotaon(sb, type, id, path); case Q_QUOTAOFF: return quota_quotaoff(sb, type); case Q_GETFMT: return quota_getfmt(sb, type, addr); case Q_GETINFO: return quota_getinfo(sb, type, addr); case Q_SETINFO: return quota_setinfo(sb, type, addr); case Q_GETQUOTA: return quota_getquota(sb, type, id, addr); case Q_GETNEXTQUOTA: return quota_getnextquota(sb, type, id, addr); case Q_SETQUOTA: return quota_setquota(sb, type, id, addr); case Q_SYNC: if (!sb->s_qcop->quota_sync) return -ENOSYS; return sb->s_qcop->quota_sync(sb, type); case Q_XQUOTAON: return quota_enable(sb, addr); case Q_XQUOTAOFF: return quota_disable(sb, addr); case Q_XQUOTARM: return quota_rmxquota(sb, addr); case Q_XGETQSTAT: return quota_getxstate(sb, type, addr); case Q_XGETQSTATV: return quota_getxstatev(sb, type, addr); case Q_XSETQLIM: return quota_setxquota(sb, type, id, addr); case Q_XGETQUOTA: return quota_getxquota(sb, type, id, addr); case Q_XGETNEXTQUOTA: return quota_getnextxquota(sb, type, id, addr); case Q_XQUOTASYNC: if (sb_rdonly(sb)) return -EROFS; /* XFS quotas are fully coherent now, making this call a noop */ return 0; default: return -EINVAL; } } /* Return 1 if 'cmd' will block on frozen filesystem */ static int quotactl_cmd_write(int cmd) { /* * We cannot allow Q_GETQUOTA and Q_GETNEXTQUOTA without write access * as dquot_acquire() may allocate space for new structure and OCFS2 * needs to increment on-disk use count. */ switch (cmd) { case Q_GETFMT: case Q_GETINFO: case Q_SYNC: case Q_XGETQSTAT: case Q_XGETQSTATV: case Q_XGETQUOTA: case Q_XGETNEXTQUOTA: case Q_XQUOTASYNC: return 0; } return 1; } /* Return true if quotactl command is manipulating quota on/off state */ static bool quotactl_cmd_onoff(int cmd) { return (cmd == Q_QUOTAON) || (cmd == Q_QUOTAOFF) || (cmd == Q_XQUOTAON) || (cmd == Q_XQUOTAOFF); } /* * look up a superblock on which quota ops will be performed * - use the name of a block device to find the superblock thereon */ static struct super_block *quotactl_block(const char __user *special, int cmd) { #ifdef CONFIG_BLOCK struct super_block *sb; struct filename *tmp = getname(special); bool excl = false, thawed = false; int error; dev_t dev; if (IS_ERR(tmp)) return ERR_CAST(tmp); error = lookup_bdev(tmp->name, &dev); putname(tmp); if (error) return ERR_PTR(error); if (quotactl_cmd_onoff(cmd)) { excl = true; thawed = true; } else if (quotactl_cmd_write(cmd)) { thawed = true; } retry: sb = user_get_super(dev, excl); if (!sb) return ERR_PTR(-ENODEV); if (thawed && sb->s_writers.frozen != SB_UNFROZEN) { if (excl) up_write(&sb->s_umount); else up_read(&sb->s_umount); /* Wait for sb to unfreeze */ sb_start_write(sb); sb_end_write(sb); put_super(sb); goto retry; } return sb; #else return ERR_PTR(-ENODEV); #endif } /* * This is the system call interface. This communicates with * the user-level programs. Currently this only supports diskquota * calls. Maybe we need to add the process quotas etc. in the future, * but we probably should use rlimits for that. */ SYSCALL_DEFINE4(quotactl, unsigned int, cmd, const char __user *, special, qid_t, id, void __user *, addr) { uint cmds, type; struct super_block *sb = NULL; struct path path, *pathp = NULL; int ret; cmds = cmd >> SUBCMDSHIFT; type = cmd & SUBCMDMASK; if (type >= MAXQUOTAS) return -EINVAL; /* * As a special case Q_SYNC can be called without a specific device. * It will iterate all superblocks that have quota enabled and call * the sync action on each of them. */ if (!special) { if (cmds == Q_SYNC) return quota_sync_all(type); return -ENODEV; } /* * Path for quotaon has to be resolved before grabbing superblock * because that gets s_umount sem which is also possibly needed by path * resolution (think about autofs) and thus deadlocks could arise. */ if (cmds == Q_QUOTAON) { ret = user_path_at(AT_FDCWD, addr, LOOKUP_FOLLOW|LOOKUP_AUTOMOUNT, &path); if (ret) pathp = ERR_PTR(ret); else pathp = &path; } sb = quotactl_block(special, cmds); if (IS_ERR(sb)) { ret = PTR_ERR(sb); goto out; } ret = do_quotactl(sb, type, cmds, id, addr, pathp); if (!quotactl_cmd_onoff(cmds)) drop_super(sb); else drop_super_exclusive(sb); out: if (pathp && !IS_ERR(pathp)) path_put(pathp); return ret; } SYSCALL_DEFINE4(quotactl_fd, unsigned int, fd, unsigned int, cmd, qid_t, id, void __user *, addr) { struct super_block *sb; unsigned int cmds = cmd >> SUBCMDSHIFT; unsigned int type = cmd & SUBCMDMASK; struct fd f; int ret; f = fdget_raw(fd); if (!f.file) return -EBADF; ret = -EINVAL; if (type >= MAXQUOTAS) goto out; if (quotactl_cmd_write(cmds)) { ret = mnt_want_write(f.file->f_path.mnt); if (ret) goto out; } sb = f.file->f_path.mnt->mnt_sb; if (quotactl_cmd_onoff(cmds)) down_write(&sb->s_umount); else down_read(&sb->s_umount); ret = do_quotactl(sb, type, cmds, id, addr, ERR_PTR(-EINVAL)); if (quotactl_cmd_onoff(cmds)) up_write(&sb->s_umount); else up_read(&sb->s_umount); if (quotactl_cmd_write(cmds)) mnt_drop_write(f.file->f_path.mnt); out: fdput(f); return ret; } |
38 38 33 33 33 33 3 3 3 437 437 5 5 441 439 48 48 1 47 48 45 1 44 1 1 2 2 437 413 24 413 24 437 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 | // SPDX-License-Identifier: GPL-2.0-only /* * linux/fs/proc/net.c * * Copyright (C) 2007 * * Author: Eric Biederman <ebiederm@xmission.com> * * proc net directory handling functions */ #include <linux/errno.h> #include <linux/time.h> #include <linux/proc_fs.h> #include <linux/stat.h> #include <linux/slab.h> #include <linux/init.h> #include <linux/sched.h> #include <linux/sched/task.h> #include <linux/module.h> #include <linux/bitops.h> #include <linux/mount.h> #include <linux/nsproxy.h> #include <linux/uidgid.h> #include <net/net_namespace.h> #include <linux/seq_file.h> #include "internal.h" static inline struct net *PDE_NET(struct proc_dir_entry *pde) { return pde->parent->data; } static struct net *get_proc_net(const struct inode *inode) { return maybe_get_net(PDE_NET(PDE(inode))); } static int seq_open_net(struct inode *inode, struct file *file) { unsigned int state_size = PDE(inode)->state_size; struct seq_net_private *p; struct net *net; WARN_ON_ONCE(state_size < sizeof(*p)); if (file->f_mode & FMODE_WRITE && !PDE(inode)->write) return -EACCES; net = get_proc_net(inode); if (!net) return -ENXIO; p = __seq_open_private(file, PDE(inode)->seq_ops, state_size); if (!p) { put_net(net); return -ENOMEM; } #ifdef CONFIG_NET_NS p->net = net; netns_tracker_alloc(net, &p->ns_tracker, GFP_KERNEL); #endif return 0; } static void seq_file_net_put_net(struct seq_file *seq) { #ifdef CONFIG_NET_NS struct seq_net_private *priv = seq->private; put_net_track(priv->net, &priv->ns_tracker); #else put_net(&init_net); #endif } static int seq_release_net(struct inode *ino, struct file *f) { struct seq_file *seq = f->private_data; seq_file_net_put_net(seq); seq_release_private(ino, f); return 0; } static const struct proc_ops proc_net_seq_ops = { .proc_open = seq_open_net, .proc_read = seq_read, .proc_write = proc_simple_write, .proc_lseek = seq_lseek, .proc_release = seq_release_net, }; int bpf_iter_init_seq_net(void *priv_data, struct bpf_iter_aux_info *aux) { #ifdef CONFIG_NET_NS struct seq_net_private *p = priv_data; p->net = get_net_track(current->nsproxy->net_ns, &p->ns_tracker, GFP_KERNEL); #endif return 0; } void bpf_iter_fini_seq_net(void *priv_data) { #ifdef CONFIG_NET_NS struct seq_net_private *p = priv_data; put_net_track(p->net, &p->ns_tracker); #endif } struct proc_dir_entry *proc_create_net_data(const char *name, umode_t mode, struct proc_dir_entry *parent, const struct seq_operations *ops, unsigned int state_size, void *data) { struct proc_dir_entry *p; p = proc_create_reg(name, mode, &parent, data); if (!p) return NULL; pde_force_lookup(p); p->proc_ops = &proc_net_seq_ops; p->seq_ops = ops; p->state_size = state_size; return proc_register(parent, p); } EXPORT_SYMBOL_GPL(proc_create_net_data); /** * proc_create_net_data_write - Create a writable net_ns-specific proc file * @name: The name of the file. * @mode: The file's access mode. * @parent: The parent directory in which to create. * @ops: The seq_file ops with which to read the file. * @write: The write method with which to 'modify' the file. * @data: Data for retrieval by pde_data(). * * Create a network namespaced proc file in the @parent directory with the * specified @name and @mode that allows reading of a file that displays a * series of elements and also provides for the file accepting writes that have * some arbitrary effect. * * The functions in the @ops table are used to iterate over items to be * presented and extract the readable content using the seq_file interface. * * The @write function is called with the data copied into a kernel space * scratch buffer and has a NUL appended for convenience. The buffer may be * modified by the @write function. @write should return 0 on success. * * The @data value is accessible from the @show and @write functions by calling * pde_data() on the file inode. The network namespace must be accessed by * calling seq_file_net() on the seq_file struct. */ struct proc_dir_entry *proc_create_net_data_write(const char *name, umode_t mode, struct proc_dir_entry *parent, const struct seq_operations *ops, proc_write_t write, unsigned int state_size, void *data) { struct proc_dir_entry *p; p = proc_create_reg(name, mode, &parent, data); if (!p) return NULL; pde_force_lookup(p); p->proc_ops = &proc_net_seq_ops; p->seq_ops = ops; p->state_size = state_size; p->write = write; return proc_register(parent, p); } EXPORT_SYMBOL_GPL(proc_create_net_data_write); static int single_open_net(struct inode *inode, struct file *file) { struct proc_dir_entry *de = PDE(inode); struct net *net; int err; net = get_proc_net(inode); if (!net) return -ENXIO; err = single_open(file, de->single_show, net); if (err) put_net(net); return err; } static int single_release_net(struct inode *ino, struct file *f) { struct seq_file *seq = f->private_data; put_net(seq->private); return single_release(ino, f); } static const struct proc_ops proc_net_single_ops = { .proc_open = single_open_net, .proc_read = seq_read, .proc_write = proc_simple_write, .proc_lseek = seq_lseek, .proc_release = single_release_net, }; struct proc_dir_entry *proc_create_net_single(const char *name, umode_t mode, struct proc_dir_entry *parent, int (*show)(struct seq_file *, void *), void *data) { struct proc_dir_entry *p; p = proc_create_reg(name, mode, &parent, data); if (!p) return NULL; pde_force_lookup(p); p->proc_ops = &proc_net_single_ops; p->single_show = show; return proc_register(parent, p); } EXPORT_SYMBOL_GPL(proc_create_net_single); /** * proc_create_net_single_write - Create a writable net_ns-specific proc file * @name: The name of the file. * @mode: The file's access mode. * @parent: The parent directory in which to create. * @show: The seqfile show method with which to read the file. * @write: The write method with which to 'modify' the file. * @data: Data for retrieval by pde_data(). * * Create a network-namespaced proc file in the @parent directory with the * specified @name and @mode that allows reading of a file that displays a * single element rather than a series and also provides for the file accepting * writes that have some arbitrary effect. * * The @show function is called to extract the readable content via the * seq_file interface. * * The @write function is called with the data copied into a kernel space * scratch buffer and has a NUL appended for convenience. The buffer may be * modified by the @write function. @write should return 0 on success. * * The @data value is accessible from the @show and @write functions by calling * pde_data() on the file inode. The network namespace must be accessed by * calling seq_file_single_net() on the seq_file struct. */ struct proc_dir_entry *proc_create_net_single_write(const char *name, umode_t mode, struct proc_dir_entry *parent, int (*show)(struct seq_file *, void *), proc_write_t write, void *data) { struct proc_dir_entry *p; p = proc_create_reg(name, mode, &parent, data); if (!p) return NULL; pde_force_lookup(p); p->proc_ops = &proc_net_single_ops; p->single_show = show; p->write = write; return proc_register(parent, p); } EXPORT_SYMBOL_GPL(proc_create_net_single_write); static struct net *get_proc_task_net(struct inode *dir) { struct task_struct *task; struct nsproxy *ns; struct net *net = NULL; rcu_read_lock(); task = pid_task(proc_pid(dir), PIDTYPE_PID); if (task != NULL) { task_lock(task); ns = task->nsproxy; if (ns != NULL) net = get_net(ns->net_ns); task_unlock(task); } rcu_read_unlock(); return net; } static struct dentry *proc_tgid_net_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) { struct dentry *de; struct net *net; de = ERR_PTR(-ENOENT); net = get_proc_task_net(dir); if (net != NULL) { de = proc_lookup_de(dir, dentry, net->proc_net); put_net(net); } return de; } static int proc_tgid_net_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) { struct inode *inode = d_inode(path->dentry); struct net *net; net = get_proc_task_net(inode); generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat); if (net != NULL) { stat->nlink = net->proc_net->nlink; put_net(net); } return 0; } const struct inode_operations proc_net_inode_operations = { .lookup = proc_tgid_net_lookup, .getattr = proc_tgid_net_getattr, .setattr = proc_setattr, }; static int proc_tgid_net_readdir(struct file *file, struct dir_context *ctx) { int ret; struct net *net; ret = -EINVAL; net = get_proc_task_net(file_inode(file)); if (net != NULL) { ret = proc_readdir_de(file, ctx, net->proc_net); put_net(net); } return ret; } const struct file_operations proc_net_operations = { .llseek = generic_file_llseek, .read = generic_read_dir, .iterate_shared = proc_tgid_net_readdir, }; static __net_init int proc_net_ns_init(struct net *net) { struct proc_dir_entry *netd, *net_statd; kuid_t uid; kgid_t gid; int err; /* * This PDE acts only as an anchor for /proc/${pid}/net hierarchy. * Corresponding inode (PDE(inode) == net->proc_net) is never * instantiated therefore blanket zeroing is fine. * net->proc_net_stat inode is instantiated normally. */ err = -ENOMEM; netd = kmem_cache_zalloc(proc_dir_entry_cache, GFP_KERNEL); if (!netd) goto out; netd->subdir = RB_ROOT; netd->data = net; netd->nlink = 2; netd->namelen = 3; netd->parent = &proc_root; netd->name = netd->inline_name; memcpy(netd->name, "net", 4); uid = make_kuid(net->user_ns, 0); if (!uid_valid(uid)) uid = netd->uid; gid = make_kgid(net->user_ns, 0); if (!gid_valid(gid)) gid = netd->gid; proc_set_user(netd, uid, gid); /* Seed dentry revalidation for /proc/${pid}/net */ pde_force_lookup(netd); err = -EEXIST; net_statd = proc_net_mkdir(net, "stat", netd); if (!net_statd) goto free_net; net->proc_net = netd; net->proc_net_stat = net_statd; return 0; free_net: pde_free(netd); out: return err; } static __net_exit void proc_net_ns_exit(struct net *net) { remove_proc_entry("stat", net->proc_net); pde_free(net->proc_net); } static struct pernet_operations __net_initdata proc_net_ns_ops = { .init = proc_net_ns_init, .exit = proc_net_ns_exit, }; int __init proc_net_init(void) { proc_symlink("net", NULL, "self/net"); return register_pernet_subsys(&proc_net_ns_ops); } |
37 37 37 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 | // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (c) 2012 Smith Micro Software, Inc. * Copyright (c) 2012 Bjørn Mork <bjorn@mork.no> * * This driver is based on and reuse most of cdc_ncm, which is * Copyright (C) ST-Ericsson 2010-2012 */ #include <linux/module.h> #include <linux/netdevice.h> #include <linux/ethtool.h> #include <linux/if_vlan.h> #include <linux/ip.h> #include <linux/mii.h> #include <linux/usb.h> #include <linux/usb/cdc.h> #include <linux/usb/usbnet.h> #include <linux/usb/cdc-wdm.h> #include <linux/usb/cdc_ncm.h> #include <net/ipv6.h> #include <net/addrconf.h> #include <net/ipv6_stubs.h> #include <net/ndisc.h> /* alternative VLAN for IP session 0 if not untagged */ #define MBIM_IPS0_VID 4094 /* driver specific data - must match cdc_ncm usage */ struct cdc_mbim_state { struct cdc_ncm_ctx *ctx; atomic_t pmcount; struct usb_driver *subdriver; unsigned long _unused; unsigned long flags; }; /* flags for the cdc_mbim_state.flags field */ enum cdc_mbim_flags { FLAG_IPS0_VLAN = 1 << 0, /* IP session 0 is tagged */ }; /* using a counter to merge subdriver requests with our own into a combined state */ static int cdc_mbim_manage_power(struct usbnet *dev, int on) { struct cdc_mbim_state *info = (void *)&dev->data; int rv = 0; dev_dbg(&dev->intf->dev, "%s() pmcount=%d, on=%d\n", __func__, atomic_read(&info->pmcount), on); if ((on && atomic_add_return(1, &info->pmcount) == 1) || (!on && atomic_dec_and_test(&info->pmcount))) { /* need autopm_get/put here to ensure the usbcore sees the new value */ rv = usb_autopm_get_interface(dev->intf); dev->intf->needs_remote_wakeup = on; if (!rv) usb_autopm_put_interface(dev->intf); } return 0; } static int cdc_mbim_wdm_manage_power(struct usb_interface *intf, int status) { struct usbnet *dev = usb_get_intfdata(intf); /* can be called while disconnecting */ if (!dev) return 0; return cdc_mbim_manage_power(dev, status); } static int cdc_mbim_rx_add_vid(struct net_device *netdev, __be16 proto, u16 vid) { struct usbnet *dev = netdev_priv(netdev); struct cdc_mbim_state *info = (void *)&dev->data; /* creation of this VLAN is a request to tag IP session 0 */ if (vid == MBIM_IPS0_VID) info->flags |= FLAG_IPS0_VLAN; else if (vid >= 512) /* we don't map these to MBIM session */ return -EINVAL; return 0; } static int cdc_mbim_rx_kill_vid(struct net_device *netdev, __be16 proto, u16 vid) { struct usbnet *dev = netdev_priv(netdev); struct cdc_mbim_state *info = (void *)&dev->data; /* this is a request for an untagged IP session 0 */ if (vid == MBIM_IPS0_VID) info->flags &= ~FLAG_IPS0_VLAN; return 0; } static const struct net_device_ops cdc_mbim_netdev_ops = { .ndo_open = usbnet_open, .ndo_stop = usbnet_stop, .ndo_start_xmit = usbnet_start_xmit, .ndo_tx_timeout = usbnet_tx_timeout, .ndo_get_stats64 = dev_get_tstats64, .ndo_change_mtu = cdc_ncm_change_mtu, .ndo_set_mac_address = eth_mac_addr, .ndo_validate_addr = eth_validate_addr, .ndo_vlan_rx_add_vid = cdc_mbim_rx_add_vid, .ndo_vlan_rx_kill_vid = cdc_mbim_rx_kill_vid, }; /* Change the control interface altsetting and update the .driver_info * pointer if the matching entry after changing class codes points to * a different struct */ static int cdc_mbim_set_ctrlalt(struct usbnet *dev, struct usb_interface *intf, u8 alt) { struct usb_driver *driver = to_usb_driver(intf->dev.driver); const struct usb_device_id *id; struct driver_info *info; int ret; ret = usb_set_interface(dev->udev, intf->cur_altsetting->desc.bInterfaceNumber, alt); if (ret) return ret; id = usb_match_id(intf, driver->id_table); if (!id) return -ENODEV; info = (struct driver_info *)id->driver_info; if (info != dev->driver_info) { dev_dbg(&intf->dev, "driver_info updated to '%s'\n", info->description); dev->driver_info = info; } return 0; } static int cdc_mbim_bind(struct usbnet *dev, struct usb_interface *intf) { struct cdc_ncm_ctx *ctx; struct usb_driver *subdriver = ERR_PTR(-ENODEV); int ret = -ENODEV; u8 data_altsetting = 1; struct cdc_mbim_state *info = (void *)&dev->data; /* should we change control altsetting on a NCM/MBIM function? */ if (cdc_ncm_select_altsetting(intf) == CDC_NCM_COMM_ALTSETTING_MBIM) { data_altsetting = CDC_NCM_DATA_ALTSETTING_MBIM; ret = cdc_mbim_set_ctrlalt(dev, intf, CDC_NCM_COMM_ALTSETTING_MBIM); if (ret) goto err; ret = -ENODEV; } /* we will hit this for NCM/MBIM functions if prefer_mbim is false */ if (!cdc_ncm_comm_intf_is_mbim(intf->cur_altsetting)) goto err; ret = cdc_ncm_bind_common(dev, intf, data_altsetting, dev->driver_info->data); if (ret) goto err; ctx = info->ctx; /* The MBIM descriptor and the status endpoint are required */ if (ctx->mbim_desc && dev->status) subdriver = usb_cdc_wdm_register(ctx->control, &dev->status->desc, le16_to_cpu(ctx->mbim_desc->wMaxControlMessage), WWAN_PORT_MBIM, cdc_mbim_wdm_manage_power); if (IS_ERR(subdriver)) { ret = PTR_ERR(subdriver); cdc_ncm_unbind(dev, intf); goto err; } /* can't let usbnet use the interrupt endpoint */ dev->status = NULL; info->subdriver = subdriver; /* MBIM cannot do ARP */ dev->net->flags |= IFF_NOARP; /* no need to put the VLAN tci in the packet headers */ dev->net->features |= NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_CTAG_FILTER; /* monitor VLAN additions and removals */ dev->net->netdev_ops = &cdc_mbim_netdev_ops; err: return ret; } static void cdc_mbim_unbind(struct usbnet *dev, struct usb_interface *intf) { struct cdc_mbim_state *info = (void *)&dev->data; struct cdc_ncm_ctx *ctx = info->ctx; /* disconnect subdriver from control interface */ if (info->subdriver && info->subdriver->disconnect) info->subdriver->disconnect(ctx->control); info->subdriver = NULL; /* let NCM unbind clean up both control and data interface */ cdc_ncm_unbind(dev, intf); } /* verify that the ethernet protocol is IPv4 or IPv6 */ static bool is_ip_proto(__be16 proto) { switch (proto) { case htons(ETH_P_IP): case htons(ETH_P_IPV6): return true; } return false; } static struct sk_buff *cdc_mbim_tx_fixup(struct usbnet *dev, struct sk_buff *skb, gfp_t flags) { struct sk_buff *skb_out; struct cdc_mbim_state *info = (void *)&dev->data; struct cdc_ncm_ctx *ctx = info->ctx; __le32 sign = cpu_to_le32(USB_CDC_MBIM_NDP16_IPS_SIGN); u16 tci = 0; bool is_ip; u8 *c; if (!ctx) goto error; if (skb) { if (skb->len <= ETH_HLEN) goto error; /* Some applications using e.g. packet sockets will * bypass the VLAN acceleration and create tagged * ethernet frames directly. We primarily look for * the accelerated out-of-band tag, but fall back if * required */ skb_reset_mac_header(skb); if (vlan_get_tag(skb, &tci) < 0 && skb->len > VLAN_ETH_HLEN && __vlan_get_tag(skb, &tci) == 0) { is_ip = is_ip_proto(vlan_eth_hdr(skb)->h_vlan_encapsulated_proto); skb_pull(skb, VLAN_ETH_HLEN); } else { is_ip = is_ip_proto(eth_hdr(skb)->h_proto); skb_pull(skb, ETH_HLEN); } /* Is IP session <0> tagged too? */ if (info->flags & FLAG_IPS0_VLAN) { /* drop all untagged packets */ if (!tci) goto error; /* map MBIM_IPS0_VID to IPS<0> */ if (tci == MBIM_IPS0_VID) tci = 0; } /* mapping VLANs to MBIM sessions: * no tag => IPS session <0> if !FLAG_IPS0_VLAN * 1 - 255 => IPS session <vlanid> * 256 - 511 => DSS session <vlanid - 256> * 512 - 4093 => unsupported, drop * 4094 => IPS session <0> if FLAG_IPS0_VLAN */ switch (tci & 0x0f00) { case 0x0000: /* VLAN ID 0 - 255 */ if (!is_ip) goto error; c = (u8 *)&sign; c[3] = tci; break; case 0x0100: /* VLAN ID 256 - 511 */ if (is_ip) goto error; sign = cpu_to_le32(USB_CDC_MBIM_NDP16_DSS_SIGN); c = (u8 *)&sign; c[3] = tci; break; default: netif_err(dev, tx_err, dev->net, "unsupported tci=0x%04x\n", tci); goto error; } } spin_lock_bh(&ctx->mtx); skb_out = cdc_ncm_fill_tx_frame(dev, skb, sign); spin_unlock_bh(&ctx->mtx); return skb_out; error: if (skb) dev_kfree_skb_any(skb); return NULL; } /* Some devices are known to send Neighbor Solicitation messages and * require Neighbor Advertisement replies. The IPv6 core will not * respond since IFF_NOARP is set, so we must handle them ourselves. */ static void do_neigh_solicit(struct usbnet *dev, u8 *buf, u16 tci) { struct ipv6hdr *iph = (void *)buf; struct nd_msg *msg = (void *)(iph + 1); struct net_device *netdev; struct inet6_dev *in6_dev; bool is_router; /* we'll only respond to requests from unicast addresses to * our solicited node addresses. */ if (!ipv6_addr_is_solict_mult(&iph->daddr) || !(ipv6_addr_type(&iph->saddr) & IPV6_ADDR_UNICAST)) return; /* need to send the NA on the VLAN dev, if any */ rcu_read_lock(); if (tci) { netdev = __vlan_find_dev_deep_rcu(dev->net, htons(ETH_P_8021Q), tci); if (!netdev) { rcu_read_unlock(); return; } } else { netdev = dev->net; } dev_hold(netdev); rcu_read_unlock(); in6_dev = in6_dev_get(netdev); if (!in6_dev) goto out; is_router = !!in6_dev->cnf.forwarding; in6_dev_put(in6_dev); /* ipv6_stub != NULL if in6_dev_get returned an inet6_dev */ ipv6_stub->ndisc_send_na(netdev, &iph->saddr, &msg->target, is_router /* router */, true /* solicited */, false /* override */, true /* inc_opt */); out: dev_put(netdev); } static bool is_neigh_solicit(u8 *buf, size_t len) { struct ipv6hdr *iph = (void *)buf; struct nd_msg *msg = (void *)(iph + 1); return (len >= sizeof(struct ipv6hdr) + sizeof(struct nd_msg) && iph->nexthdr == IPPROTO_ICMPV6 && msg->icmph.icmp6_code == 0 && msg->icmph.icmp6_type == NDISC_NEIGHBOUR_SOLICITATION); } static struct sk_buff *cdc_mbim_process_dgram(struct usbnet *dev, u8 *buf, size_t len, u16 tci) { __be16 proto = htons(ETH_P_802_3); struct sk_buff *skb = NULL; if (tci < 256 || tci == MBIM_IPS0_VID) { /* IPS session? */ if (len < sizeof(struct iphdr)) goto err; switch (*buf & 0xf0) { case 0x40: proto = htons(ETH_P_IP); break; case 0x60: if (is_neigh_solicit(buf, len)) do_neigh_solicit(dev, buf, tci); proto = htons(ETH_P_IPV6); break; default: goto err; } } skb = netdev_alloc_skb_ip_align(dev->net, len + ETH_HLEN); if (!skb) goto err; /* add an ethernet header */ skb_put(skb, ETH_HLEN); skb_reset_mac_header(skb); eth_hdr(skb)->h_proto = proto; eth_zero_addr(eth_hdr(skb)->h_source); memcpy(eth_hdr(skb)->h_dest, dev->net->dev_addr, ETH_ALEN); /* add datagram */ skb_put_data(skb, buf, len); /* map MBIM session to VLAN */ if (tci) __vlan_hwaccel_put_tag(skb, htons(ETH_P_8021Q), tci); err: return skb; } static int cdc_mbim_rx_fixup(struct usbnet *dev, struct sk_buff *skb_in) { struct sk_buff *skb; struct cdc_mbim_state *info = (void *)&dev->data; struct cdc_ncm_ctx *ctx = info->ctx; int len; int nframes; int x; int offset; struct usb_cdc_ncm_ndp16 *ndp16; struct usb_cdc_ncm_dpe16 *dpe16; int ndpoffset; int loopcount = 50; /* arbitrary max preventing infinite loop */ u32 payload = 0; u8 *c; u16 tci; ndpoffset = cdc_ncm_rx_verify_nth16(ctx, skb_in); if (ndpoffset < 0) goto error; next_ndp: nframes = cdc_ncm_rx_verify_ndp16(skb_in, ndpoffset); if (nframes < 0) goto error; ndp16 = (struct usb_cdc_ncm_ndp16 *)(skb_in->data + ndpoffset); switch (ndp16->dwSignature & cpu_to_le32(0x00ffffff)) { case cpu_to_le32(USB_CDC_MBIM_NDP16_IPS_SIGN): c = (u8 *)&ndp16->dwSignature; tci = c[3]; /* tag IPS<0> packets too if MBIM_IPS0_VID exists */ if (!tci && info->flags & FLAG_IPS0_VLAN) tci = MBIM_IPS0_VID; break; case cpu_to_le32(USB_CDC_MBIM_NDP16_DSS_SIGN): c = (u8 *)&ndp16->dwSignature; tci = c[3] + 256; break; default: netif_dbg(dev, rx_err, dev->net, "unsupported NDP signature <0x%08x>\n", le32_to_cpu(ndp16->dwSignature)); goto err_ndp; } dpe16 = ndp16->dpe16; for (x = 0; x < nframes; x++, dpe16++) { offset = le16_to_cpu(dpe16->wDatagramIndex); len = le16_to_cpu(dpe16->wDatagramLength); /* * CDC NCM ch. 3.7 * All entries after first NULL entry are to be ignored */ if ((offset == 0) || (len == 0)) { if (!x) goto err_ndp; /* empty NTB */ break; } /* sanity checking */ if (((offset + len) > skb_in->len) || (len > ctx->rx_max)) { netif_dbg(dev, rx_err, dev->net, "invalid frame detected (ignored) offset[%u]=%u, length=%u, skb=%p\n", x, offset, len, skb_in); if (!x) goto err_ndp; break; } else { skb = cdc_mbim_process_dgram(dev, skb_in->data + offset, len, tci); if (!skb) goto error; usbnet_skb_return(dev, skb); payload += len; /* count payload bytes in this NTB */ } } err_ndp: /* are there more NDPs to process? */ ndpoffset = le16_to_cpu(ndp16->wNextNdpIndex); if (ndpoffset && loopcount--) goto next_ndp; /* update stats */ ctx->rx_overhead += skb_in->len - payload; ctx->rx_ntbs++; return 1; error: return 0; } static int cdc_mbim_suspend(struct usb_interface *intf, pm_message_t message) { int ret = -ENODEV; struct usbnet *dev = usb_get_intfdata(intf); struct cdc_mbim_state *info = (void *)&dev->data; struct cdc_ncm_ctx *ctx = info->ctx; if (!ctx) goto error; /* * Both usbnet_suspend() and subdriver->suspend() MUST return 0 * in system sleep context, otherwise, the resume callback has * to recover device from previous suspend failure. */ ret = usbnet_suspend(intf, message); if (ret < 0) goto error; if (intf == ctx->control && info->subdriver && info->subdriver->suspend) ret = info->subdriver->suspend(intf, message); if (ret < 0) usbnet_resume(intf); error: return ret; } static int cdc_mbim_resume(struct usb_interface *intf) { int ret = 0; struct usbnet *dev = usb_get_intfdata(intf); struct cdc_mbim_state *info = (void *)&dev->data; struct cdc_ncm_ctx *ctx = info->ctx; bool callsub = (intf == ctx->control && info->subdriver && info->subdriver->resume); if (callsub) ret = info->subdriver->resume(intf); if (ret < 0) goto err; ret = usbnet_resume(intf); if (ret < 0 && callsub) info->subdriver->suspend(intf, PMSG_SUSPEND); err: return ret; } static const struct driver_info cdc_mbim_info = { .description = "CDC MBIM", .flags = FLAG_NO_SETINT | FLAG_MULTI_PACKET | FLAG_WWAN, .bind = cdc_mbim_bind, .unbind = cdc_mbim_unbind, .manage_power = cdc_mbim_manage_power, .rx_fixup = cdc_mbim_rx_fixup, .tx_fixup = cdc_mbim_tx_fixup, }; /* MBIM and NCM devices should not need a ZLP after NTBs with * dwNtbOutMaxSize length. Nevertheless, a number of devices from * different vendor IDs will fail unless we send ZLPs, forcing us * to make this the default. * * This default may cause a performance penalty for spec conforming * devices wanting to take advantage of optimizations possible without * ZLPs. A whitelist is added in an attempt to avoid this for devices * known to conform to the MBIM specification. * * All known devices supporting NCM compatibility mode are also * conforming to the NCM and MBIM specifications. For this reason, the * NCM subclass entry is also in the ZLP whitelist. */ static const struct driver_info cdc_mbim_info_zlp = { .description = "CDC MBIM", .flags = FLAG_NO_SETINT | FLAG_MULTI_PACKET | FLAG_WWAN | FLAG_SEND_ZLP, .bind = cdc_mbim_bind, .unbind = cdc_mbim_unbind, .manage_power = cdc_mbim_manage_power, .rx_fixup = cdc_mbim_rx_fixup, .tx_fixup = cdc_mbim_tx_fixup, }; /* The spefication explicitly allows NDPs to be placed anywhere in the * frame, but some devices fail unless the NDP is placed after the IP * packets. Using the CDC_NCM_FLAG_NDP_TO_END flags to force this * behaviour. * * Note: The current implementation of this feature restricts each NTB * to a single NDP, implying that multiplexed sessions cannot share an * NTB. This might affect performance for multiplexed sessions. */ static const struct driver_info cdc_mbim_info_ndp_to_end = { .description = "CDC MBIM", .flags = FLAG_NO_SETINT | FLAG_MULTI_PACKET | FLAG_WWAN, .bind = cdc_mbim_bind, .unbind = cdc_mbim_unbind, .manage_power = cdc_mbim_manage_power, .rx_fixup = cdc_mbim_rx_fixup, .tx_fixup = cdc_mbim_tx_fixup, .data = CDC_NCM_FLAG_NDP_TO_END, }; /* Some modems (e.g. Telit LE922A6) do not work properly with altsetting * toggle done in cdc_ncm_bind_common. CDC_MBIM_FLAG_AVOID_ALTSETTING_TOGGLE * flag is used to avoid this procedure. */ static const struct driver_info cdc_mbim_info_avoid_altsetting_toggle = { .description = "CDC MBIM", .flags = FLAG_NO_SETINT | FLAG_MULTI_PACKET | FLAG_WWAN | FLAG_SEND_ZLP, .bind = cdc_mbim_bind, .unbind = cdc_mbim_unbind, .manage_power = cdc_mbim_manage_power, .rx_fixup = cdc_mbim_rx_fixup, .tx_fixup = cdc_mbim_tx_fixup, .data = CDC_MBIM_FLAG_AVOID_ALTSETTING_TOGGLE, }; static const struct usb_device_id mbim_devs[] = { /* This duplicate NCM entry is intentional. MBIM devices can * be disguised as NCM by default, and this is necessary to * allow us to bind the correct driver_info to such devices. * * bind() will sort out this for us, selecting the correct * entry and reject the other */ { USB_INTERFACE_INFO(USB_CLASS_COMM, USB_CDC_SUBCLASS_NCM, USB_CDC_PROTO_NONE), .driver_info = (unsigned long)&cdc_mbim_info, }, /* ZLP conformance whitelist: All Ericsson MBIM devices */ { USB_VENDOR_AND_INTERFACE_INFO(0x0bdb, USB_CLASS_COMM, USB_CDC_SUBCLASS_MBIM, USB_CDC_PROTO_NONE), .driver_info = (unsigned long)&cdc_mbim_info, }, /* Some Huawei devices, ME906s-158 (12d1:15c1) and E3372 * (12d1:157d), are known to fail unless the NDP is placed * after the IP packets. Applying the quirk to all Huawei * devices is broader than necessary, but harmless. */ { USB_VENDOR_AND_INTERFACE_INFO(0x12d1, USB_CLASS_COMM, USB_CDC_SUBCLASS_MBIM, USB_CDC_PROTO_NONE), .driver_info = (unsigned long)&cdc_mbim_info_ndp_to_end, }, /* The HP lt4132 (03f0:a31d) is a rebranded Huawei ME906s-158, * therefore it too requires the above "NDP to end" quirk. */ { USB_DEVICE_AND_INTERFACE_INFO(0x03f0, 0xa31d, USB_CLASS_COMM, USB_CDC_SUBCLASS_MBIM, USB_CDC_PROTO_NONE), .driver_info = (unsigned long)&cdc_mbim_info_ndp_to_end, }, /* Telit LE922A6 in MBIM composition */ { USB_DEVICE_AND_INTERFACE_INFO(0x1bc7, 0x1041, USB_CLASS_COMM, USB_CDC_SUBCLASS_MBIM, USB_CDC_PROTO_NONE), .driver_info = (unsigned long)&cdc_mbim_info_avoid_altsetting_toggle, }, /* Telit LN920 */ { USB_DEVICE_AND_INTERFACE_INFO(0x1bc7, 0x1061, USB_CLASS_COMM, USB_CDC_SUBCLASS_MBIM, USB_CDC_PROTO_NONE), .driver_info = (unsigned long)&cdc_mbim_info_avoid_altsetting_toggle, }, /* Telit FN990 */ { USB_DEVICE_AND_INTERFACE_INFO(0x1bc7, 0x1071, USB_CLASS_COMM, USB_CDC_SUBCLASS_MBIM, USB_CDC_PROTO_NONE), .driver_info = (unsigned long)&cdc_mbim_info_avoid_altsetting_toggle, }, /* Telit FE990 */ { USB_DEVICE_AND_INTERFACE_INFO(0x1bc7, 0x1081, USB_CLASS_COMM, USB_CDC_SUBCLASS_MBIM, USB_CDC_PROTO_NONE), .driver_info = (unsigned long)&cdc_mbim_info_avoid_altsetting_toggle, }, /* default entry */ { USB_INTERFACE_INFO(USB_CLASS_COMM, USB_CDC_SUBCLASS_MBIM, USB_CDC_PROTO_NONE), .driver_info = (unsigned long)&cdc_mbim_info_zlp, }, { }, }; MODULE_DEVICE_TABLE(usb, mbim_devs); static struct usb_driver cdc_mbim_driver = { .name = "cdc_mbim", .id_table = mbim_devs, .probe = usbnet_probe, .disconnect = usbnet_disconnect, .suspend = cdc_mbim_suspend, .resume = cdc_mbim_resume, .reset_resume = cdc_mbim_resume, .supports_autosuspend = 1, .disable_hub_initiated_lpm = 1, }; module_usb_driver(cdc_mbim_driver); MODULE_AUTHOR("Greg Suarez <gsuarez@smithmicro.com>"); MODULE_AUTHOR("Bjørn Mork <bjorn@mork.no>"); MODULE_DESCRIPTION("USB CDC MBIM host driver"); MODULE_LICENSE("GPL"); |
2 17 15 2 12 3 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 | // SPDX-License-Identifier: GPL-2.0 // Copyright (c) 2010-2011 EIA Electronics, // Kurt Van Dijck <kurt.van.dijck@eia.be> // Copyright (c) 2010-2011 EIA Electronics, // Pieter Beyens <pieter.beyens@eia.be> // Copyright (c) 2017-2019 Pengutronix, // Marc Kleine-Budde <kernel@pengutronix.de> // Copyright (c) 2017-2019 Pengutronix, // Oleksij Rempel <kernel@pengutronix.de> /* J1939 Address Claiming. * Address Claiming in the kernel * - keeps track of the AC states of ECU's, * - resolves NAME<=>SA taking into account the AC states of ECU's. * * All Address Claim msgs (including host-originated msg) are processed * at the receive path (a sent msg is always received again via CAN echo). * As such, the processing of AC msgs is done in the order on which msgs * are sent on the bus. * * This module doesn't send msgs itself (e.g. replies on Address Claims), * this is the responsibility of a user space application or daemon. */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/netdevice.h> #include <linux/skbuff.h> #include "j1939-priv.h" static inline name_t j1939_skb_to_name(const struct sk_buff *skb) { return le64_to_cpup((__le64 *)skb->data); } static inline bool j1939_ac_msg_is_request(struct sk_buff *skb) { struct j1939_sk_buff_cb *skcb = j1939_skb_to_cb(skb); int req_pgn; if (skb->len < 3 || skcb->addr.pgn != J1939_PGN_REQUEST) return false; req_pgn = skb->data[0] | (skb->data[1] << 8) | (skb->data[2] << 16); return req_pgn == J1939_PGN_ADDRESS_CLAIMED; } static int j1939_ac_verify_outgoing(struct j1939_priv *priv, struct sk_buff *skb) { struct j1939_sk_buff_cb *skcb = j1939_skb_to_cb(skb); if (skb->len != 8) { netdev_notice(priv->ndev, "tx address claim with dlc %i\n", skb->len); return -EPROTO; } if (skcb->addr.src_name != j1939_skb_to_name(skb)) { netdev_notice(priv->ndev, "tx address claim with different name\n"); return -EPROTO; } if (skcb->addr.sa == J1939_NO_ADDR) { netdev_notice(priv->ndev, "tx address claim with broadcast sa\n"); return -EPROTO; } /* ac must always be a broadcast */ if (skcb->addr.dst_name || skcb->addr.da != J1939_NO_ADDR) { netdev_notice(priv->ndev, "tx address claim with dest, not broadcast\n"); return -EPROTO; } return 0; } int j1939_ac_fixup(struct j1939_priv *priv, struct sk_buff *skb) { struct j1939_sk_buff_cb *skcb = j1939_skb_to_cb(skb); int ret; u8 addr; /* network mgmt: address claiming msgs */ if (skcb->addr.pgn == J1939_PGN_ADDRESS_CLAIMED) { struct j1939_ecu *ecu; ret = j1939_ac_verify_outgoing(priv, skb); /* return both when failure & when successful */ if (ret < 0) return ret; ecu = j1939_ecu_get_by_name(priv, skcb->addr.src_name); if (!ecu) return -ENODEV; if (ecu->addr != skcb->addr.sa) /* hold further traffic for ecu, remove from parent */ j1939_ecu_unmap(ecu); j1939_ecu_put(ecu); } else if (skcb->addr.src_name) { /* assign source address */ addr = j1939_name_to_addr(priv, skcb->addr.src_name); if (!j1939_address_is_unicast(addr) && !j1939_ac_msg_is_request(skb)) { netdev_notice(priv->ndev, "tx drop: invalid sa for name 0x%016llx\n", skcb->addr.src_name); return -EADDRNOTAVAIL; } skcb->addr.sa = addr; } /* assign destination address */ if (skcb->addr.dst_name) { addr = j1939_name_to_addr(priv, skcb->addr.dst_name); if (!j1939_address_is_unicast(addr)) { netdev_notice(priv->ndev, "tx drop: invalid da for name 0x%016llx\n", skcb->addr.dst_name); return -EADDRNOTAVAIL; } skcb->addr.da = addr; } return 0; } static void j1939_ac_process(struct j1939_priv *priv, struct sk_buff *skb) { struct j1939_sk_buff_cb *skcb = j1939_skb_to_cb(skb); struct j1939_ecu *ecu, *prev; name_t name; if (skb->len != 8) { netdev_notice(priv->ndev, "rx address claim with wrong dlc %i\n", skb->len); return; } name = j1939_skb_to_name(skb); skcb->addr.src_name = name; if (!name) { netdev_notice(priv->ndev, "rx address claim without name\n"); return; } if (!j1939_address_is_valid(skcb->addr.sa)) { netdev_notice(priv->ndev, "rx address claim with broadcast sa\n"); return; } write_lock_bh(&priv->lock); /* Few words on the ECU ref counting: * * First we get an ECU handle, either with * j1939_ecu_get_by_name_locked() (increments the ref counter) * or j1939_ecu_create_locked() (initializes an ECU object * with a ref counter of 1). * * j1939_ecu_unmap_locked() will decrement the ref counter, * but only if the ECU was mapped before. So "ecu" still * belongs to us. * * j1939_ecu_timer_start() will increment the ref counter * before it starts the timer, so we can put the ecu when * leaving this function. */ ecu = j1939_ecu_get_by_name_locked(priv, name); if (ecu && ecu->addr == skcb->addr.sa) { /* The ISO 11783-5 standard, in "4.5.2 - Address claim * requirements", states: * d) No CF shall begin, or resume, transmission on the * network until 250 ms after it has successfully claimed * an address except when responding to a request for * address-claimed. * * But "Figure 6" and "Figure 7" in "4.5.4.2 - Address-claim * prioritization" show that the CF begins the transmission * after 250 ms from the first AC (address-claimed) message * even if it sends another AC message during that time window * to resolve the address contention with another CF. * * As stated in "4.4.2.3 - Address-claimed message": * In order to successfully claim an address, the CF sending * an address claimed message shall not receive a contending * claim from another CF for at least 250 ms. * * As stated in "4.4.3.2 - NAME management (NM) message": * 1) A commanding CF can * d) request that a CF with a specified NAME transmit * the address-claimed message with its current NAME. * 2) A target CF shall * d) send an address-claimed message in response to a * request for a matching NAME * * Taking the above arguments into account, the 250 ms wait is * requested only during network initialization. * * Do not restart the timer on AC message if both the NAME and * the address match and so if the address has already been * claimed (timer has expired) or the AC message has been sent * to resolve the contention with another CF (timer is still * running). */ goto out_ecu_put; } if (!ecu && j1939_address_is_unicast(skcb->addr.sa)) ecu = j1939_ecu_create_locked(priv, name); if (IS_ERR_OR_NULL(ecu)) goto out_unlock_bh; /* cancel pending (previous) address claim */ j1939_ecu_timer_cancel(ecu); if (j1939_address_is_idle(skcb->addr.sa)) { j1939_ecu_unmap_locked(ecu); goto out_ecu_put; } /* save new addr */ if (ecu->addr != skcb->addr.sa) j1939_ecu_unmap_locked(ecu); ecu->addr = skcb->addr.sa; prev = j1939_ecu_get_by_addr_locked(priv, skcb->addr.sa); if (prev) { if (ecu->name > prev->name) { j1939_ecu_unmap_locked(ecu); j1939_ecu_put(prev); goto out_ecu_put; } else { /* kick prev if less or equal */ j1939_ecu_unmap_locked(prev); j1939_ecu_put(prev); } } j1939_ecu_timer_start(ecu); out_ecu_put: j1939_ecu_put(ecu); out_unlock_bh: write_unlock_bh(&priv->lock); } void j1939_ac_recv(struct j1939_priv *priv, struct sk_buff *skb) { struct j1939_sk_buff_cb *skcb = j1939_skb_to_cb(skb); struct j1939_ecu *ecu; /* network mgmt */ if (skcb->addr.pgn == J1939_PGN_ADDRESS_CLAIMED) { j1939_ac_process(priv, skb); } else if (j1939_address_is_unicast(skcb->addr.sa)) { /* assign source name */ ecu = j1939_ecu_get_by_addr(priv, skcb->addr.sa); if (ecu) { skcb->addr.src_name = ecu->name; j1939_ecu_put(ecu); } } /* assign destination name */ ecu = j1939_ecu_get_by_addr(priv, skcb->addr.da); if (ecu) { skcb->addr.dst_name = ecu->name; j1939_ecu_put(ecu); } } |
1429 120 473 890 39 528 280 485 27 931 11 3 2 90 9 9 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 | /* SPDX-License-Identifier: GPL-2.0-only */ /* * Copyright (c) 2020 Christoph Hellwig. * * Support for "universal" pointers that can point to either kernel or userspace * memory. */ #ifndef _LINUX_SOCKPTR_H #define _LINUX_SOCKPTR_H #include <linux/slab.h> #include <linux/uaccess.h> typedef struct { union { void *kernel; void __user *user; }; bool is_kernel : 1; } sockptr_t; static inline bool sockptr_is_kernel(sockptr_t sockptr) { return sockptr.is_kernel; } static inline sockptr_t KERNEL_SOCKPTR(void *p) { return (sockptr_t) { .kernel = p, .is_kernel = true }; } static inline sockptr_t USER_SOCKPTR(void __user *p) { return (sockptr_t) { .user = p }; } static inline bool sockptr_is_null(sockptr_t sockptr) { if (sockptr_is_kernel(sockptr)) return !sockptr.kernel; return !sockptr.user; } static inline int copy_from_sockptr_offset(void *dst, sockptr_t src, size_t offset, size_t size) { if (!sockptr_is_kernel(src)) return copy_from_user(dst, src.user + offset, size); memcpy(dst, src.kernel + offset, size); return 0; } static inline int copy_from_sockptr(void *dst, sockptr_t src, size_t size) { return copy_from_sockptr_offset(dst, src, 0, size); } static inline int copy_struct_from_sockptr(void *dst, size_t ksize, sockptr_t src, size_t usize) { size_t size = min(ksize, usize); size_t rest = max(ksize, usize) - size; if (!sockptr_is_kernel(src)) return copy_struct_from_user(dst, ksize, src.user, size); if (usize < ksize) { memset(dst + size, 0, rest); } else if (usize > ksize) { char *p = src.kernel; while (rest--) { if (*p++) return -E2BIG; } } memcpy(dst, src.kernel, size); return 0; } static inline int copy_to_sockptr_offset(sockptr_t dst, size_t offset, const void *src, size_t size) { if (!sockptr_is_kernel(dst)) return copy_to_user(dst.user + offset, src, size); memcpy(dst.kernel + offset, src, size); return 0; } static inline int copy_to_sockptr(sockptr_t dst, const void *src, size_t size) { return copy_to_sockptr_offset(dst, 0, src, size); } static inline void *memdup_sockptr(sockptr_t src, size_t len) { void *p = kmalloc_track_caller(len, GFP_USER | __GFP_NOWARN); if (!p) return ERR_PTR(-ENOMEM); if (copy_from_sockptr(p, src, len)) { kfree(p); return ERR_PTR(-EFAULT); } return p; } static inline void *memdup_sockptr_nul(sockptr_t src, size_t len) { char *p = kmalloc_track_caller(len + 1, GFP_KERNEL); if (!p) return ERR_PTR(-ENOMEM); if (copy_from_sockptr(p, src, len)) { kfree(p); return ERR_PTR(-EFAULT); } p[len] = '\0'; return p; } static inline long strncpy_from_sockptr(char *dst, sockptr_t src, size_t count) { if (sockptr_is_kernel(src)) { size_t len = min(strnlen(src.kernel, count - 1) + 1, count); memcpy(dst, src.kernel, len); return len; } return strncpy_from_user(dst, src.user, count); } static inline int check_zeroed_sockptr(sockptr_t src, size_t offset, size_t size) { if (!sockptr_is_kernel(src)) return check_zeroed_user(src.user + offset, size); return memchr_inv(src.kernel + offset, 0, size) == NULL; } #endif /* _LINUX_SOCKPTR_H */ |
33 27 6 33 11 44 44 44 44 44 44 33 33 6 6 6 3 6 6 6 6 27 27 27 27 26 33 19 9 27 26 27 33 33 33 6 33 4 29 33 33 33 33 26 1 25 25 25 25 47 47 47 47 26 46 46 46 46 46 46 46 9 9 4 5 9 9 9 9 9 46 46 46 1 25 34 45 45 1 38 25 44 4 46 46 46 46 46 46 46 46 46 46 46 46 46 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 | // SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc. * All Rights Reserved. */ #include "xfs.h" #include "xfs_fs.h" #include "xfs_shared.h" #include "xfs_format.h" #include "xfs_log_format.h" #include "xfs_trans_resv.h" #include "xfs_bit.h" #include "xfs_mount.h" #include "xfs_inode.h" #include "xfs_btree.h" #include "xfs_ialloc.h" #include "xfs_ialloc_btree.h" #include "xfs_alloc.h" #include "xfs_errortag.h" #include "xfs_error.h" #include "xfs_bmap.h" #include "xfs_trans.h" #include "xfs_buf_item.h" #include "xfs_icreate_item.h" #include "xfs_icache.h" #include "xfs_trace.h" #include "xfs_log.h" #include "xfs_rmap.h" #include "xfs_ag.h" /* * Lookup a record by ino in the btree given by cur. */ int /* error */ xfs_inobt_lookup( struct xfs_btree_cur *cur, /* btree cursor */ xfs_agino_t ino, /* starting inode of chunk */ xfs_lookup_t dir, /* <=, >=, == */ int *stat) /* success/failure */ { cur->bc_rec.i.ir_startino = ino; cur->bc_rec.i.ir_holemask = 0; cur->bc_rec.i.ir_count = 0; cur->bc_rec.i.ir_freecount = 0; cur->bc_rec.i.ir_free = 0; return xfs_btree_lookup(cur, dir, stat); } /* * Update the record referred to by cur to the value given. * This either works (return 0) or gets an EFSCORRUPTED error. */ STATIC int /* error */ xfs_inobt_update( struct xfs_btree_cur *cur, /* btree cursor */ xfs_inobt_rec_incore_t *irec) /* btree record */ { union xfs_btree_rec rec; rec.inobt.ir_startino = cpu_to_be32(irec->ir_startino); if (xfs_has_sparseinodes(cur->bc_mp)) { rec.inobt.ir_u.sp.ir_holemask = cpu_to_be16(irec->ir_holemask); rec.inobt.ir_u.sp.ir_count = irec->ir_count; rec.inobt.ir_u.sp.ir_freecount = irec->ir_freecount; } else { /* ir_holemask/ir_count not supported on-disk */ rec.inobt.ir_u.f.ir_freecount = cpu_to_be32(irec->ir_freecount); } rec.inobt.ir_free = cpu_to_be64(irec->ir_free); return xfs_btree_update(cur, &rec); } /* Convert on-disk btree record to incore inobt record. */ void xfs_inobt_btrec_to_irec( struct xfs_mount *mp, const union xfs_btree_rec *rec, struct xfs_inobt_rec_incore *irec) { irec->ir_startino = be32_to_cpu(rec->inobt.ir_startino); if (xfs_has_sparseinodes(mp)) { irec->ir_holemask = be16_to_cpu(rec->inobt.ir_u.sp.ir_holemask); irec->ir_count = rec->inobt.ir_u.sp.ir_count; irec->ir_freecount = rec->inobt.ir_u.sp.ir_freecount; } else { /* * ir_holemask/ir_count not supported on-disk. Fill in hardcoded * values for full inode chunks. */ irec->ir_holemask = XFS_INOBT_HOLEMASK_FULL; irec->ir_count = XFS_INODES_PER_CHUNK; irec->ir_freecount = be32_to_cpu(rec->inobt.ir_u.f.ir_freecount); } irec->ir_free = be64_to_cpu(rec->inobt.ir_free); } /* Compute the freecount of an incore inode record. */ uint8_t xfs_inobt_rec_freecount( const struct xfs_inobt_rec_incore *irec) { uint64_t realfree = irec->ir_free; if (xfs_inobt_issparse(irec->ir_holemask)) realfree &= xfs_inobt_irec_to_allocmask(irec); return hweight64(realfree); } /* Simple checks for inode records. */ xfs_failaddr_t xfs_inobt_check_irec( struct xfs_perag *pag, const struct xfs_inobt_rec_incore *irec) { /* Record has to be properly aligned within the AG. */ if (!xfs_verify_agino(pag, irec->ir_startino)) return __this_address; if (!xfs_verify_agino(pag, irec->ir_startino + XFS_INODES_PER_CHUNK - 1)) return __this_address; if (irec->ir_count < XFS_INODES_PER_HOLEMASK_BIT || irec->ir_count > XFS_INODES_PER_CHUNK) return __this_address; if (irec->ir_freecount > XFS_INODES_PER_CHUNK) return __this_address; if (xfs_inobt_rec_freecount(irec) != irec->ir_freecount) return __this_address; return NULL; } static inline int xfs_inobt_complain_bad_rec( struct xfs_btree_cur *cur, xfs_failaddr_t fa, const struct xfs_inobt_rec_incore *irec) { struct xfs_mount *mp = cur->bc_mp; xfs_warn(mp, "%s Inode BTree record corruption in AG %d detected at %pS!", cur->bc_btnum == XFS_BTNUM_INO ? "Used" : "Free", cur->bc_ag.pag->pag_agno, fa); xfs_warn(mp, "start inode 0x%x, count 0x%x, free 0x%x freemask 0x%llx, holemask 0x%x", irec->ir_startino, irec->ir_count, irec->ir_freecount, irec->ir_free, irec->ir_holemask); return -EFSCORRUPTED; } /* * Get the data from the pointed-to record. */ int xfs_inobt_get_rec( struct xfs_btree_cur *cur, struct xfs_inobt_rec_incore *irec, int *stat) { struct xfs_mount *mp = cur->bc_mp; union xfs_btree_rec *rec; xfs_failaddr_t fa; int error; error = xfs_btree_get_rec(cur, &rec, stat); if (error || *stat == 0) return error; xfs_inobt_btrec_to_irec(mp, rec, irec); fa = xfs_inobt_check_irec(cur->bc_ag.pag, irec); if (fa) return xfs_inobt_complain_bad_rec(cur, fa, irec); return 0; } /* * Insert a single inobt record. Cursor must already point to desired location. */ int xfs_inobt_insert_rec( struct xfs_btree_cur *cur, uint16_t holemask, uint8_t count, int32_t freecount, xfs_inofree_t free, int *stat) { cur->bc_rec.i.ir_holemask = holemask; cur->bc_rec.i.ir_count = count; cur->bc_rec.i.ir_freecount = freecount; cur->bc_rec.i.ir_free = free; return xfs_btree_insert(cur, stat); } /* * Insert records describing a newly allocated inode chunk into the inobt. */ STATIC int xfs_inobt_insert( struct xfs_perag *pag, struct xfs_trans *tp, struct xfs_buf *agbp, xfs_agino_t newino, xfs_agino_t newlen, xfs_btnum_t btnum) { struct xfs_btree_cur *cur; xfs_agino_t thisino; int i; int error; cur = xfs_inobt_init_cursor(pag, tp, agbp, btnum); for (thisino = newino; thisino < newino + newlen; thisino += XFS_INODES_PER_CHUNK) { error = xfs_inobt_lookup(cur, thisino, XFS_LOOKUP_EQ, &i); if (error) { xfs_btree_del_cursor(cur, XFS_BTREE_ERROR); return error; } ASSERT(i == 0); error = xfs_inobt_insert_rec(cur, XFS_INOBT_HOLEMASK_FULL, XFS_INODES_PER_CHUNK, XFS_INODES_PER_CHUNK, XFS_INOBT_ALL_FREE, &i); if (error) { xfs_btree_del_cursor(cur, XFS_BTREE_ERROR); return error; } ASSERT(i == 1); } xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); return 0; } /* * Verify that the number of free inodes in the AGI is correct. */ #ifdef DEBUG static int xfs_check_agi_freecount( struct xfs_btree_cur *cur) { if (cur->bc_nlevels == 1) { xfs_inobt_rec_incore_t rec; int freecount = 0; int error; int i; error = xfs_inobt_lookup(cur, 0, XFS_LOOKUP_GE, &i); if (error) return error; do { error = xfs_inobt_get_rec(cur, &rec, &i); if (error) return error; if (i) { freecount += rec.ir_freecount; error = xfs_btree_increment(cur, 0, &i); if (error) return error; } } while (i == 1); if (!xfs_is_shutdown(cur->bc_mp)) ASSERT(freecount == cur->bc_ag.pag->pagi_freecount); } return 0; } #else #define xfs_check_agi_freecount(cur) 0 #endif /* * Initialise a new set of inodes. When called without a transaction context * (e.g. from recovery) we initiate a delayed write of the inode buffers rather * than logging them (which in a transaction context puts them into the AIL * for writeback rather than the xfsbufd queue). */ int xfs_ialloc_inode_init( struct xfs_mount *mp, struct xfs_trans *tp, struct list_head *buffer_list, int icount, xfs_agnumber_t agno, xfs_agblock_t agbno, xfs_agblock_t length, unsigned int gen) { struct xfs_buf *fbuf; struct xfs_dinode *free; int nbufs; int version; int i, j; xfs_daddr_t d; xfs_ino_t ino = 0; int error; /* * Loop over the new block(s), filling in the inodes. For small block * sizes, manipulate the inodes in buffers which are multiples of the * blocks size. */ nbufs = length / M_IGEO(mp)->blocks_per_cluster; /* * Figure out what version number to use in the inodes we create. If * the superblock version has caught up to the one that supports the new * inode format, then use the new inode version. Otherwise use the old * version so that old kernels will continue to be able to use the file * system. * * For v3 inodes, we also need to write the inode number into the inode, * so calculate the first inode number of the chunk here as * XFS_AGB_TO_AGINO() only works within a filesystem block, not * across multiple filesystem blocks (such as a cluster) and so cannot * be used in the cluster buffer loop below. * * Further, because we are writing the inode directly into the buffer * and calculating a CRC on the entire inode, we have ot log the entire * inode so that the entire range the CRC covers is present in the log. * That means for v3 inode we log the entire buffer rather than just the * inode cores. */ if (xfs_has_v3inodes(mp)) { version = 3; ino = XFS_AGINO_TO_INO(mp, agno, XFS_AGB_TO_AGINO(mp, agbno)); /* * log the initialisation that is about to take place as an * logical operation. This means the transaction does not * need to log the physical changes to the inode buffers as log * recovery will know what initialisation is actually needed. * Hence we only need to log the buffers as "ordered" buffers so * they track in the AIL as if they were physically logged. */ if (tp) xfs_icreate_log(tp, agno, agbno, icount, mp->m_sb.sb_inodesize, length, gen); } else version = 2; for (j = 0; j < nbufs; j++) { /* * Get the block. */ d = XFS_AGB_TO_DADDR(mp, agno, agbno + (j * M_IGEO(mp)->blocks_per_cluster)); error = xfs_trans_get_buf(tp, mp->m_ddev_targp, d, mp->m_bsize * M_IGEO(mp)->blocks_per_cluster, XBF_UNMAPPED, &fbuf); if (error) return error; /* Initialize the inode buffers and log them appropriately. */ fbuf->b_ops = &xfs_inode_buf_ops; xfs_buf_zero(fbuf, 0, BBTOB(fbuf->b_length)); for (i = 0; i < M_IGEO(mp)->inodes_per_cluster; i++) { int ioffset = i << mp->m_sb.sb_inodelog; free = xfs_make_iptr(mp, fbuf, i); free->di_magic = cpu_to_be16(XFS_DINODE_MAGIC); free->di_version = version; free->di_gen = cpu_to_be32(gen); free->di_next_unlinked = cpu_to_be32(NULLAGINO); if (version == 3) { free->di_ino = cpu_to_be64(ino); ino++; uuid_copy(&free->di_uuid, &mp->m_sb.sb_meta_uuid); xfs_dinode_calc_crc(mp, free); } else if (tp) { /* just log the inode core */ xfs_trans_log_buf(tp, fbuf, ioffset, ioffset + XFS_DINODE_SIZE(mp) - 1); } } if (tp) { /* * Mark the buffer as an inode allocation buffer so it * sticks in AIL at the point of this allocation * transaction. This ensures the they are on disk before * the tail of the log can be moved past this * transaction (i.e. by preventing relogging from moving * it forward in the log). */ xfs_trans_inode_alloc_buf(tp, fbuf); if (version == 3) { /* * Mark the buffer as ordered so that they are * not physically logged in the transaction but * still tracked in the AIL as part of the * transaction and pin the log appropriately. */ xfs_trans_ordered_buf(tp, fbuf); } } else { fbuf->b_flags |= XBF_DONE; xfs_buf_delwri_queue(fbuf, buffer_list); xfs_buf_relse(fbuf); } } return 0; } /* * Align startino and allocmask for a recently allocated sparse chunk such that * they are fit for insertion (or merge) into the on-disk inode btrees. * * Background: * * When enabled, sparse inode support increases the inode alignment from cluster * size to inode chunk size. This means that the minimum range between two * non-adjacent inode records in the inobt is large enough for a full inode * record. This allows for cluster sized, cluster aligned block allocation * without need to worry about whether the resulting inode record overlaps with * another record in the tree. Without this basic rule, we would have to deal * with the consequences of overlap by potentially undoing recent allocations in * the inode allocation codepath. * * Because of this alignment rule (which is enforced on mount), there are two * inobt possibilities for newly allocated sparse chunks. One is that the * aligned inode record for the chunk covers a range of inodes not already * covered in the inobt (i.e., it is safe to insert a new sparse record). The * other is that a record already exists at the aligned startino that considers * the newly allocated range as sparse. In the latter case, record content is * merged in hope that sparse inode chunks fill to full chunks over time. */ STATIC void xfs_align_sparse_ino( struct xfs_mount *mp, xfs_agino_t *startino, uint16_t *allocmask) { xfs_agblock_t agbno; xfs_agblock_t mod; int offset; agbno = XFS_AGINO_TO_AGBNO(mp, *startino); mod = agbno % mp->m_sb.sb_inoalignmt; if (!mod) return; /* calculate the inode offset and align startino */ offset = XFS_AGB_TO_AGINO(mp, mod); *startino -= offset; /* * Since startino has been aligned down, left shift allocmask such that * it continues to represent the same physical inodes relative to the * new startino. */ *allocmask <<= offset / XFS_INODES_PER_HOLEMASK_BIT; } /* * Determine whether the source inode record can merge into the target. Both * records must be sparse, the inode ranges must match and there must be no * allocation overlap between the records. */ STATIC bool __xfs_inobt_can_merge( struct xfs_inobt_rec_incore *trec, /* tgt record */ struct xfs_inobt_rec_incore *srec) /* src record */ { uint64_t talloc; uint64_t salloc; /* records must cover the same inode range */ if (trec->ir_startino != srec->ir_startino) return false; /* both records must be sparse */ if (!xfs_inobt_issparse(trec->ir_holemask) || !xfs_inobt_issparse(srec->ir_holemask)) return false; /* both records must track some inodes */ if (!trec->ir_count || !srec->ir_count) return false; /* can't exceed capacity of a full record */ if (trec->ir_count + srec->ir_count > XFS_INODES_PER_CHUNK) return false; /* verify there is no allocation overlap */ talloc = xfs_inobt_irec_to_allocmask(trec); salloc = xfs_inobt_irec_to_allocmask(srec); if (talloc & salloc) return false; return true; } /* * Merge the source inode record into the target. The caller must call * __xfs_inobt_can_merge() to ensure the merge is valid. */ STATIC void __xfs_inobt_rec_merge( struct xfs_inobt_rec_incore *trec, /* target */ struct xfs_inobt_rec_incore *srec) /* src */ { ASSERT(trec->ir_startino == srec->ir_startino); /* combine the counts */ trec->ir_count += srec->ir_count; trec->ir_freecount += srec->ir_freecount; /* * Merge the holemask and free mask. For both fields, 0 bits refer to * allocated inodes. We combine the allocated ranges with bitwise AND. */ trec->ir_holemask &= srec->ir_holemask; trec->ir_free &= srec->ir_free; } /* * Insert a new sparse inode chunk into the associated inode btree. The inode * record for the sparse chunk is pre-aligned to a startino that should match * any pre-existing sparse inode record in the tree. This allows sparse chunks * to fill over time. * * This function supports two modes of handling preexisting records depending on * the merge flag. If merge is true, the provided record is merged with the * existing record and updated in place. The merged record is returned in nrec. * If merge is false, an existing record is replaced with the provided record. * If no preexisting record exists, the provided record is always inserted. * * It is considered corruption if a merge is requested and not possible. Given * the sparse inode alignment constraints, this should never happen. */ STATIC int xfs_inobt_insert_sprec( struct xfs_perag *pag, struct xfs_trans *tp, struct xfs_buf *agbp, int btnum, struct xfs_inobt_rec_incore *nrec, /* in/out: new/merged rec. */ bool merge) /* merge or replace */ { struct xfs_mount *mp = pag->pag_mount; struct xfs_btree_cur *cur; int error; int i; struct xfs_inobt_rec_incore rec; cur = xfs_inobt_init_cursor(pag, tp, agbp, btnum); /* the new record is pre-aligned so we know where to look */ error = xfs_inobt_lookup(cur, nrec->ir_startino, XFS_LOOKUP_EQ, &i); if (error) goto error; /* if nothing there, insert a new record and return */ if (i == 0) { error = xfs_inobt_insert_rec(cur, nrec->ir_holemask, nrec->ir_count, nrec->ir_freecount, nrec->ir_free, &i); if (error) goto error; if (XFS_IS_CORRUPT(mp, i != 1)) { error = -EFSCORRUPTED; goto error; } goto out; } /* * A record exists at this startino. Merge or replace the record * depending on what we've been asked to do. */ if (merge) { error = xfs_inobt_get_rec(cur, &rec, &i); if (error) goto error; if (XFS_IS_CORRUPT(mp, i != 1)) { error = -EFSCORRUPTED; goto error; } if (XFS_IS_CORRUPT(mp, rec.ir_startino != nrec->ir_startino)) { error = -EFSCORRUPTED; goto error; } /* * This should never fail. If we have coexisting records that * cannot merge, something is seriously wrong. */ if (XFS_IS_CORRUPT(mp, !__xfs_inobt_can_merge(nrec, &rec))) { error = -EFSCORRUPTED; goto error; } trace_xfs_irec_merge_pre(mp, pag->pag_agno, rec.ir_startino, rec.ir_holemask, nrec->ir_startino, nrec->ir_holemask); /* merge to nrec to output the updated record */ __xfs_inobt_rec_merge(nrec, &rec); trace_xfs_irec_merge_post(mp, pag->pag_agno, nrec->ir_startino, nrec->ir_holemask); error = xfs_inobt_rec_check_count(mp, nrec); if (error) goto error; } error = xfs_inobt_update(cur, nrec); if (error) goto error; out: xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); return 0; error: xfs_btree_del_cursor(cur, XFS_BTREE_ERROR); return error; } /* * Allocate new inodes in the allocation group specified by agbp. Returns 0 if * inodes were allocated in this AG; -EAGAIN if there was no space in this AG so * the caller knows it can try another AG, a hard -ENOSPC when over the maximum * inode count threshold, or the usual negative error code for other errors. */ STATIC int xfs_ialloc_ag_alloc( struct xfs_perag *pag, struct xfs_trans *tp, struct xfs_buf *agbp) { struct xfs_agi *agi; struct xfs_alloc_arg args; int error; xfs_agino_t newino; /* new first inode's number */ xfs_agino_t newlen; /* new number of inodes */ int isaligned = 0; /* inode allocation at stripe */ /* unit boundary */ /* init. to full chunk */ struct xfs_inobt_rec_incore rec; struct xfs_ino_geometry *igeo = M_IGEO(tp->t_mountp); uint16_t allocmask = (uint16_t) -1; int do_sparse = 0; memset(&args, 0, sizeof(args)); args.tp = tp; args.mp = tp->t_mountp; args.fsbno = NULLFSBLOCK; args.oinfo = XFS_RMAP_OINFO_INODES; args.pag = pag; #ifdef DEBUG /* randomly do sparse inode allocations */ if (xfs_has_sparseinodes(tp->t_mountp) && igeo->ialloc_min_blks < igeo->ialloc_blks) do_sparse = get_random_u32_below(2); #endif /* * Locking will ensure that we don't have two callers in here * at one time. */ newlen = igeo->ialloc_inos; if (igeo->maxicount && percpu_counter_read_positive(&args.mp->m_icount) + newlen > igeo->maxicount) return -ENOSPC; args.minlen = args.maxlen = igeo->ialloc_blks; /* * First try to allocate inodes contiguous with the last-allocated * chunk of inodes. If the filesystem is striped, this will fill * an entire stripe unit with inodes. */ agi = agbp->b_addr; newino = be32_to_cpu(agi->agi_newino); args.agbno = XFS_AGINO_TO_AGBNO(args.mp, newino) + igeo->ialloc_blks; if (do_sparse) goto sparse_alloc; if (likely(newino != NULLAGINO && (args.agbno < be32_to_cpu(agi->agi_length)))) { args.prod = 1; /* * We need to take into account alignment here to ensure that * we don't modify the free list if we fail to have an exact * block. If we don't have an exact match, and every oher * attempt allocation attempt fails, we'll end up cancelling * a dirty transaction and shutting down. * * For an exact allocation, alignment must be 1, * however we need to take cluster alignment into account when * fixing up the freelist. Use the minalignslop field to * indicate that extra blocks might be required for alignment, * but not to use them in the actual exact allocation. */ args.alignment = 1; args.minalignslop = igeo->cluster_align - 1; /* Allow space for the inode btree to split. */ args.minleft = igeo->inobt_maxlevels; error = xfs_alloc_vextent_exact_bno(&args, XFS_AGB_TO_FSB(args.mp, pag->pag_agno, args.agbno)); if (error) return error; /* * This request might have dirtied the transaction if the AG can * satisfy the request, but the exact block was not available. * If the allocation did fail, subsequent requests will relax * the exact agbno requirement and increase the alignment * instead. It is critical that the total size of the request * (len + alignment + slop) does not increase from this point * on, so reset minalignslop to ensure it is not included in * subsequent requests. */ args.minalignslop = 0; } if (unlikely(args.fsbno == NULLFSBLOCK)) { /* * Set the alignment for the allocation. * If stripe alignment is turned on then align at stripe unit * boundary. * If the cluster size is smaller than a filesystem block * then we're doing I/O for inodes in filesystem block size * pieces, so don't need alignment anyway. */ isaligned = 0; if (igeo->ialloc_align) { ASSERT(!xfs_has_noalign(args.mp)); args.alignment = args.mp->m_dalign; isaligned = 1; } else args.alignment = igeo->cluster_align; /* * Allocate a fixed-size extent of inodes. */ args.prod = 1; /* * Allow space for the inode btree to split. */ args.minleft = igeo->inobt_maxlevels; error = xfs_alloc_vextent_near_bno(&args, XFS_AGB_TO_FSB(args.mp, pag->pag_agno, be32_to_cpu(agi->agi_root))); if (error) return error; } /* * If stripe alignment is turned on, then try again with cluster * alignment. */ if (isaligned && args.fsbno == NULLFSBLOCK) { args.alignment = igeo->cluster_align; error = xfs_alloc_vextent_near_bno(&args, XFS_AGB_TO_FSB(args.mp, pag->pag_agno, be32_to_cpu(agi->agi_root))); if (error) return error; } /* * Finally, try a sparse allocation if the filesystem supports it and * the sparse allocation length is smaller than a full chunk. */ if (xfs_has_sparseinodes(args.mp) && igeo->ialloc_min_blks < igeo->ialloc_blks && args.fsbno == NULLFSBLOCK) { sparse_alloc: args.alignment = args.mp->m_sb.sb_spino_align; args.prod = 1; args.minlen = igeo->ialloc_min_blks; args.maxlen = args.minlen; /* * The inode record will be aligned to full chunk size. We must * prevent sparse allocation from AG boundaries that result in * invalid inode records, such as records that start at agbno 0 * or extend beyond the AG. * * Set min agbno to the first aligned, non-zero agbno and max to * the last aligned agbno that is at least one full chunk from * the end of the AG. */ args.min_agbno = args.mp->m_sb.sb_inoalignmt; args.max_agbno = round_down(args.mp->m_sb.sb_agblocks, args.mp->m_sb.sb_inoalignmt) - igeo->ialloc_blks; error = xfs_alloc_vextent_near_bno(&args, XFS_AGB_TO_FSB(args.mp, pag->pag_agno, be32_to_cpu(agi->agi_root))); if (error) return error; newlen = XFS_AGB_TO_AGINO(args.mp, args.len); ASSERT(newlen <= XFS_INODES_PER_CHUNK); allocmask = (1 << (newlen / XFS_INODES_PER_HOLEMASK_BIT)) - 1; } if (args.fsbno == NULLFSBLOCK) return -EAGAIN; ASSERT(args.len == args.minlen); /* * Stamp and write the inode buffers. * * Seed the new inode cluster with a random generation number. This * prevents short-term reuse of generation numbers if a chunk is * freed and then immediately reallocated. We use random numbers * rather than a linear progression to prevent the next generation * number from being easily guessable. */ error = xfs_ialloc_inode_init(args.mp, tp, NULL, newlen, pag->pag_agno, args.agbno, args.len, get_random_u32()); if (error) return error; /* * Convert the results. */ newino = XFS_AGB_TO_AGINO(args.mp, args.agbno); if (xfs_inobt_issparse(~allocmask)) { /* * We've allocated a sparse chunk. Align the startino and mask. */ xfs_align_sparse_ino(args.mp, &newino, &allocmask); rec.ir_startino = newino; rec.ir_holemask = ~allocmask; rec.ir_count = newlen; rec.ir_freecount = newlen; rec.ir_free = XFS_INOBT_ALL_FREE; /* * Insert the sparse record into the inobt and allow for a merge * if necessary. If a merge does occur, rec is updated to the * merged record. */ error = xfs_inobt_insert_sprec(pag, tp, agbp, XFS_BTNUM_INO, &rec, true); if (error == -EFSCORRUPTED) { xfs_alert(args.mp, "invalid sparse inode record: ino 0x%llx holemask 0x%x count %u", XFS_AGINO_TO_INO(args.mp, pag->pag_agno, rec.ir_startino), rec.ir_holemask, rec.ir_count); xfs_force_shutdown(args.mp, SHUTDOWN_CORRUPT_INCORE); } if (error) return error; /* * We can't merge the part we've just allocated as for the inobt * due to finobt semantics. The original record may or may not * exist independent of whether physical inodes exist in this * sparse chunk. * * We must update the finobt record based on the inobt record. * rec contains the fully merged and up to date inobt record * from the previous call. Set merge false to replace any * existing record with this one. */ if (xfs_has_finobt(args.mp)) { error = xfs_inobt_insert_sprec(pag, tp, agbp, XFS_BTNUM_FINO, &rec, false); if (error) return error; } } else { /* full chunk - insert new records to both btrees */ error = xfs_inobt_insert(pag, tp, agbp, newino, newlen, XFS_BTNUM_INO); if (error) return error; if (xfs_has_finobt(args.mp)) { error = xfs_inobt_insert(pag, tp, agbp, newino, newlen, XFS_BTNUM_FINO); if (error) return error; } } /* * Update AGI counts and newino. */ be32_add_cpu(&agi->agi_count, newlen); be32_add_cpu(&agi->agi_freecount, newlen); pag->pagi_freecount += newlen; pag->pagi_count += newlen; agi->agi_newino = cpu_to_be32(newino); /* * Log allocation group header fields */ xfs_ialloc_log_agi(tp, agbp, XFS_AGI_COUNT | XFS_AGI_FREECOUNT | XFS_AGI_NEWINO); /* * Modify/log superblock values for inode count and inode free count. */ xfs_trans_mod_sb(tp, XFS_TRANS_SB_ICOUNT, (long)newlen); xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, (long)newlen); return 0; } /* * Try to retrieve the next record to the left/right from the current one. */ STATIC int xfs_ialloc_next_rec( struct xfs_btree_cur *cur, xfs_inobt_rec_incore_t *rec, int *done, int left) { int error; int i; if (left) error = xfs_btree_decrement(cur, 0, &i); else error = xfs_btree_increment(cur, 0, &i); if (error) return error; *done = !i; if (i) { error = xfs_inobt_get_rec(cur, rec, &i); if (error) return error; if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) return -EFSCORRUPTED; } return 0; } STATIC int xfs_ialloc_get_rec( struct xfs_btree_cur *cur, xfs_agino_t agino, xfs_inobt_rec_incore_t *rec, int *done) { int error; int i; error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_EQ, &i); if (error) return error; *done = !i; if (i) { error = xfs_inobt_get_rec(cur, rec, &i); if (error) return error; if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) return -EFSCORRUPTED; } return 0; } /* * Return the offset of the first free inode in the record. If the inode chunk * is sparsely allocated, we convert the record holemask to inode granularity * and mask off the unallocated regions from the inode free mask. */ STATIC int xfs_inobt_first_free_inode( struct xfs_inobt_rec_incore *rec) { xfs_inofree_t realfree; /* if there are no holes, return the first available offset */ if (!xfs_inobt_issparse(rec->ir_holemask)) return xfs_lowbit64(rec->ir_free); realfree = xfs_inobt_irec_to_allocmask(rec); realfree &= rec->ir_free; return xfs_lowbit64(realfree); } /* * Allocate an inode using the inobt-only algorithm. */ STATIC int xfs_dialloc_ag_inobt( struct xfs_perag *pag, struct xfs_trans *tp, struct xfs_buf *agbp, xfs_ino_t parent, xfs_ino_t *inop) { struct xfs_mount *mp = tp->t_mountp; struct xfs_agi *agi = agbp->b_addr; xfs_agnumber_t pagno = XFS_INO_TO_AGNO(mp, parent); xfs_agino_t pagino = XFS_INO_TO_AGINO(mp, parent); struct xfs_btree_cur *cur, *tcur; struct xfs_inobt_rec_incore rec, trec; xfs_ino_t ino; int error; int offset; int i, j; int searchdistance = 10; ASSERT(xfs_perag_initialised_agi(pag)); ASSERT(xfs_perag_allows_inodes(pag)); ASSERT(pag->pagi_freecount > 0); restart_pagno: cur = xfs_inobt_init_cursor(pag, tp, agbp, XFS_BTNUM_INO); /* * If pagino is 0 (this is the root inode allocation) use newino. * This must work because we've just allocated some. */ if (!pagino) pagino = be32_to_cpu(agi->agi_newino); error = xfs_check_agi_freecount(cur); if (error) goto error0; /* * If in the same AG as the parent, try to get near the parent. */ if (pagno == pag->pag_agno) { int doneleft; /* done, to the left */ int doneright; /* done, to the right */ error = xfs_inobt_lookup(cur, pagino, XFS_LOOKUP_LE, &i); if (error) goto error0; if (XFS_IS_CORRUPT(mp, i != 1)) { error = -EFSCORRUPTED; goto error0; } error = xfs_inobt_get_rec(cur, &rec, &j); if (error) goto error0; if (XFS_IS_CORRUPT(mp, j != 1)) { error = -EFSCORRUPTED; goto error0; } if (rec.ir_freecount > 0) { /* * Found a free inode in the same chunk * as the parent, done. */ goto alloc_inode; } /* * In the same AG as parent, but parent's chunk is full. */ /* duplicate the cursor, search left & right simultaneously */ error = xfs_btree_dup_cursor(cur, &tcur); if (error) goto error0; /* * Skip to last blocks looked up if same parent inode. */ if (pagino != NULLAGINO && pag->pagl_pagino == pagino && pag->pagl_leftrec != NULLAGINO && pag->pagl_rightrec != NULLAGINO) { error = xfs_ialloc_get_rec(tcur, pag->pagl_leftrec, &trec, &doneleft); if (error) goto error1; error = xfs_ialloc_get_rec(cur, pag->pagl_rightrec, &rec, &doneright); if (error) goto error1; } else { /* search left with tcur, back up 1 record */ error = xfs_ialloc_next_rec(tcur, &trec, &doneleft, 1); if (error) goto error1; /* search right with cur, go forward 1 record. */ error = xfs_ialloc_next_rec(cur, &rec, &doneright, 0); if (error) goto error1; } /* * Loop until we find an inode chunk with a free inode. */ while (--searchdistance > 0 && (!doneleft || !doneright)) { int useleft; /* using left inode chunk this time */ /* figure out the closer block if both are valid. */ if (!doneleft && !doneright) { useleft = pagino - (trec.ir_startino + XFS_INODES_PER_CHUNK - 1) < rec.ir_startino - pagino; } else { useleft = !doneleft; } /* free inodes to the left? */ if (useleft && trec.ir_freecount) { xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); cur = tcur; pag->pagl_leftrec = trec.ir_startino; pag->pagl_rightrec = rec.ir_startino; pag->pagl_pagino = pagino; rec = trec; goto alloc_inode; } /* free inodes to the right? */ if (!useleft && rec.ir_freecount) { xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR); pag->pagl_leftrec = trec.ir_startino; pag->pagl_rightrec = rec.ir_startino; pag->pagl_pagino = pagino; goto alloc_inode; } /* get next record to check */ if (useleft) { error = xfs_ialloc_next_rec(tcur, &trec, &doneleft, 1); } else { error = xfs_ialloc_next_rec(cur, &rec, &doneright, 0); } if (error) goto error1; } if (searchdistance <= 0) { /* * Not in range - save last search * location and allocate a new inode */ xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR); pag->pagl_leftrec = trec.ir_startino; pag->pagl_rightrec = rec.ir_startino; pag->pagl_pagino = pagino; } else { /* * We've reached the end of the btree. because * we are only searching a small chunk of the * btree each search, there is obviously free * inodes closer to the parent inode than we * are now. restart the search again. */ pag->pagl_pagino = NULLAGINO; pag->pagl_leftrec = NULLAGINO; pag->pagl_rightrec = NULLAGINO; xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR); xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); goto restart_pagno; } } /* * In a different AG from the parent. * See if the most recently allocated block has any free. */ if (agi->agi_newino != cpu_to_be32(NULLAGINO)) { error = xfs_inobt_lookup(cur, be32_to_cpu(agi->agi_newino), XFS_LOOKUP_EQ, &i); if (error) goto error0; if (i == 1) { error = xfs_inobt_get_rec(cur, &rec, &j); if (error) goto error0; if (j == 1 && rec.ir_freecount > 0) { /* * The last chunk allocated in the group * still has a free inode. */ goto alloc_inode; } } } /* * None left in the last group, search the whole AG */ error = xfs_inobt_lookup(cur, 0, XFS_LOOKUP_GE, &i); if (error) goto error0; if (XFS_IS_CORRUPT(mp, i != 1)) { error = -EFSCORRUPTED; goto error0; } for (;;) { error = xfs_inobt_get_rec(cur, &rec, &i); if (error) goto error0; if (XFS_IS_CORRUPT(mp, i != 1)) { error = -EFSCORRUPTED; goto error0; } if (rec.ir_freecount > 0) break; error = xfs_btree_increment(cur, 0, &i); if (error) goto error0; if (XFS_IS_CORRUPT(mp, i != 1)) { error = -EFSCORRUPTED; goto error0; } } alloc_inode: offset = xfs_inobt_first_free_inode(&rec); ASSERT(offset >= 0); ASSERT(offset < XFS_INODES_PER_CHUNK); ASSERT((XFS_AGINO_TO_OFFSET(mp, rec.ir_startino) % XFS_INODES_PER_CHUNK) == 0); ino = XFS_AGINO_TO_INO(mp, pag->pag_agno, rec.ir_startino + offset); rec.ir_free &= ~XFS_INOBT_MASK(offset); rec.ir_freecount--; error = xfs_inobt_update(cur, &rec); if (error) goto error0; be32_add_cpu(&agi->agi_freecount, -1); xfs_ialloc_log_agi(tp, agbp, XFS_AGI_FREECOUNT); pag->pagi_freecount--; error = xfs_check_agi_freecount(cur); if (error) goto error0; xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, -1); *inop = ino; return 0; error1: xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR); error0: xfs_btree_del_cursor(cur, XFS_BTREE_ERROR); return error; } /* * Use the free inode btree to allocate an inode based on distance from the * parent. Note that the provided cursor may be deleted and replaced. */ STATIC int xfs_dialloc_ag_finobt_near( xfs_agino_t pagino, struct xfs_btree_cur **ocur, struct xfs_inobt_rec_incore *rec) { struct xfs_btree_cur *lcur = *ocur; /* left search cursor */ struct xfs_btree_cur *rcur; /* right search cursor */ struct xfs_inobt_rec_incore rrec; int error; int i, j; error = xfs_inobt_lookup(lcur, pagino, XFS_LOOKUP_LE, &i); if (error) return error; if (i == 1) { error = xfs_inobt_get_rec(lcur, rec, &i); if (error) return error; if (XFS_IS_CORRUPT(lcur->bc_mp, i != 1)) return -EFSCORRUPTED; /* * See if we've landed in the parent inode record. The finobt * only tracks chunks with at least one free inode, so record * existence is enough. */ if (pagino >= rec->ir_startino && pagino < (rec->ir_startino + XFS_INODES_PER_CHUNK)) return 0; } error = xfs_btree_dup_cursor(lcur, &rcur); if (error) return error; error = xfs_inobt_lookup(rcur, pagino, XFS_LOOKUP_GE, &j); if (error) goto error_rcur; if (j == 1) { error = xfs_inobt_get_rec(rcur, &rrec, &j); if (error) goto error_rcur; if (XFS_IS_CORRUPT(lcur->bc_mp, j != 1)) { error = -EFSCORRUPTED; goto error_rcur; } } if (XFS_IS_CORRUPT(lcur->bc_mp, i != 1 && j != 1)) { error = -EFSCORRUPTED; goto error_rcur; } if (i == 1 && j == 1) { /* * Both the left and right records are valid. Choose the closer * inode chunk to the target. */ if ((pagino - rec->ir_startino + XFS_INODES_PER_CHUNK - 1) > (rrec.ir_startino - pagino)) { *rec = rrec; xfs_btree_del_cursor(lcur, XFS_BTREE_NOERROR); *ocur = rcur; } else { xfs_btree_del_cursor(rcur, XFS_BTREE_NOERROR); } } else if (j == 1) { /* only the right record is valid */ *rec = rrec; xfs_btree_del_cursor(lcur, XFS_BTREE_NOERROR); *ocur = rcur; } else if (i == 1) { /* only the left record is valid */ xfs_btree_del_cursor(rcur, XFS_BTREE_NOERROR); } return 0; error_rcur: xfs_btree_del_cursor(rcur, XFS_BTREE_ERROR); return error; } /* * Use the free inode btree to find a free inode based on a newino hint. If * the hint is NULL, find the first free inode in the AG. */ STATIC int xfs_dialloc_ag_finobt_newino( struct xfs_agi *agi, struct xfs_btree_cur *cur, struct xfs_inobt_rec_incore *rec) { int error; int i; if (agi->agi_newino != cpu_to_be32(NULLAGINO)) { error = xfs_inobt_lookup(cur, be32_to_cpu(agi->agi_newino), XFS_LOOKUP_EQ, &i); if (error) return error; if (i == 1) { error = xfs_inobt_get_rec(cur, rec, &i); if (error) return error; if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) return -EFSCORRUPTED; return 0; } } /* * Find the first inode available in the AG. */ error = xfs_inobt_lookup(cur, 0, XFS_LOOKUP_GE, &i); if (error) return error; if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) return -EFSCORRUPTED; error = xfs_inobt_get_rec(cur, rec, &i); if (error) return error; if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) return -EFSCORRUPTED; return 0; } /* * Update the inobt based on a modification made to the finobt. Also ensure that * the records from both trees are equivalent post-modification. */ STATIC int xfs_dialloc_ag_update_inobt( struct xfs_btree_cur *cur, /* inobt cursor */ struct xfs_inobt_rec_incore *frec, /* finobt record */ int offset) /* inode offset */ { struct xfs_inobt_rec_incore rec; int error; int i; error = xfs_inobt_lookup(cur, frec->ir_startino, XFS_LOOKUP_EQ, &i); if (error) return error; if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) return -EFSCORRUPTED; error = xfs_inobt_get_rec(cur, &rec, &i); if (error) return error; if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) return -EFSCORRUPTED; ASSERT((XFS_AGINO_TO_OFFSET(cur->bc_mp, rec.ir_startino) % XFS_INODES_PER_CHUNK) == 0); rec.ir_free &= ~XFS_INOBT_MASK(offset); rec.ir_freecount--; if (XFS_IS_CORRUPT(cur->bc_mp, rec.ir_free != frec->ir_free || rec.ir_freecount != frec->ir_freecount)) return -EFSCORRUPTED; return xfs_inobt_update(cur, &rec); } /* * Allocate an inode using the free inode btree, if available. Otherwise, fall * back to the inobt search algorithm. * * The caller selected an AG for us, and made sure that free inodes are * available. */ static int xfs_dialloc_ag( struct xfs_perag *pag, struct xfs_trans *tp, struct xfs_buf *agbp, xfs_ino_t parent, xfs_ino_t *inop) { struct xfs_mount *mp = tp->t_mountp; struct xfs_agi *agi = agbp->b_addr; xfs_agnumber_t pagno = XFS_INO_TO_AGNO(mp, parent); xfs_agino_t pagino = XFS_INO_TO_AGINO(mp, parent); struct xfs_btree_cur *cur; /* finobt cursor */ struct xfs_btree_cur *icur; /* inobt cursor */ struct xfs_inobt_rec_incore rec; xfs_ino_t ino; int error; int offset; int i; if (!xfs_has_finobt(mp)) return xfs_dialloc_ag_inobt(pag, tp, agbp, parent, inop); /* * If pagino is 0 (this is the root inode allocation) use newino. * This must work because we've just allocated some. */ if (!pagino) pagino = be32_to_cpu(agi->agi_newino); cur = xfs_inobt_init_cursor(pag, tp, agbp, XFS_BTNUM_FINO); error = xfs_check_agi_freecount(cur); if (error) goto error_cur; /* * The search algorithm depends on whether we're in the same AG as the * parent. If so, find the closest available inode to the parent. If * not, consider the agi hint or find the first free inode in the AG. */ if (pag->pag_agno == pagno) error = xfs_dialloc_ag_finobt_near(pagino, &cur, &rec); else error = xfs_dialloc_ag_finobt_newino(agi, cur, &rec); if (error) goto error_cur; offset = xfs_inobt_first_free_inode(&rec); ASSERT(offset >= 0); ASSERT(offset < XFS_INODES_PER_CHUNK); ASSERT((XFS_AGINO_TO_OFFSET(mp, rec.ir_startino) % XFS_INODES_PER_CHUNK) == 0); ino = XFS_AGINO_TO_INO(mp, pag->pag_agno, rec.ir_startino + offset); /* * Modify or remove the finobt record. */ rec.ir_free &= ~XFS_INOBT_MASK(offset); rec.ir_freecount--; if (rec.ir_freecount) error = xfs_inobt_update(cur, &rec); else error = xfs_btree_delete(cur, &i); if (error) goto error_cur; /* * The finobt has now been updated appropriately. We haven't updated the * agi and superblock yet, so we can create an inobt cursor and validate * the original freecount. If all is well, make the equivalent update to * the inobt using the finobt record and offset information. */ icur = xfs_inobt_init_cursor(pag, tp, agbp, XFS_BTNUM_INO); error = xfs_check_agi_freecount(icur); if (error) goto error_icur; error = xfs_dialloc_ag_update_inobt(icur, &rec, offset); if (error) goto error_icur; /* * Both trees have now been updated. We must update the perag and * superblock before we can check the freecount for each btree. */ be32_add_cpu(&agi->agi_freecount, -1); xfs_ialloc_log_agi(tp, agbp, XFS_AGI_FREECOUNT); pag->pagi_freecount--; xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, -1); error = xfs_check_agi_freecount(icur); if (error) goto error_icur; error = xfs_check_agi_freecount(cur); if (error) goto error_icur; xfs_btree_del_cursor(icur, XFS_BTREE_NOERROR); xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); *inop = ino; return 0; error_icur: xfs_btree_del_cursor(icur, XFS_BTREE_ERROR); error_cur: xfs_btree_del_cursor(cur, XFS_BTREE_ERROR); return error; } static int xfs_dialloc_roll( struct xfs_trans **tpp, struct xfs_buf *agibp) { struct xfs_trans *tp = *tpp; struct xfs_dquot_acct *dqinfo; int error; /* * Hold to on to the agibp across the commit so no other allocation can * come in and take the free inodes we just allocated for our caller. */ xfs_trans_bhold(tp, agibp); /* * We want the quota changes to be associated with the next transaction, * NOT this one. So, detach the dqinfo from this and attach it to the * next transaction. */ dqinfo = tp->t_dqinfo; tp->t_dqinfo = NULL; error = xfs_trans_roll(&tp); /* Re-attach the quota info that we detached from prev trx. */ tp->t_dqinfo = dqinfo; /* * Join the buffer even on commit error so that the buffer is released * when the caller cancels the transaction and doesn't have to handle * this error case specially. */ xfs_trans_bjoin(tp, agibp); *tpp = tp; return error; } static bool xfs_dialloc_good_ag( struct xfs_perag *pag, struct xfs_trans *tp, umode_t mode, int flags, bool ok_alloc) { struct xfs_mount *mp = tp->t_mountp; xfs_extlen_t ineed; xfs_extlen_t longest = 0; int needspace; int error; if (!pag) return false; if (!xfs_perag_allows_inodes(pag)) return false; if (!xfs_perag_initialised_agi(pag)) { error = xfs_ialloc_read_agi(pag, tp, NULL); if (error) return false; } if (pag->pagi_freecount) return true; if (!ok_alloc) return false; if (!xfs_perag_initialised_agf(pag)) { error = xfs_alloc_read_agf(pag, tp, flags, NULL); if (error) return false; } /* * Check that there is enough free space for the file plus a chunk of * inodes if we need to allocate some. If this is the first pass across * the AGs, take into account the potential space needed for alignment * of inode chunks when checking the longest contiguous free space in * the AG - this prevents us from getting ENOSPC because we have free * space larger than ialloc_blks but alignment constraints prevent us * from using it. * * If we can't find an AG with space for full alignment slack to be * taken into account, we must be near ENOSPC in all AGs. Hence we * don't include alignment for the second pass and so if we fail * allocation due to alignment issues then it is most likely a real * ENOSPC condition. * * XXX(dgc): this calculation is now bogus thanks to the per-ag * reservations that xfs_alloc_fix_freelist() now does via * xfs_alloc_space_available(). When the AG fills up, pagf_freeblks will * be more than large enough for the check below to succeed, but * xfs_alloc_space_available() will fail because of the non-zero * metadata reservation and hence we won't actually be able to allocate * more inodes in this AG. We do soooo much unnecessary work near ENOSPC * because of this. */ ineed = M_IGEO(mp)->ialloc_min_blks; if (flags && ineed > 1) ineed += M_IGEO(mp)->cluster_align; longest = pag->pagf_longest; if (!longest) longest = pag->pagf_flcount > 0; needspace = S_ISDIR(mode) || S_ISREG(mode) || S_ISLNK(mode); if (pag->pagf_freeblks < needspace + ineed || longest < ineed) return false; return true; } static int xfs_dialloc_try_ag( struct xfs_perag *pag, struct xfs_trans **tpp, xfs_ino_t parent, xfs_ino_t *new_ino, bool ok_alloc) { struct xfs_buf *agbp; xfs_ino_t ino; int error; /* * Then read in the AGI buffer and recheck with the AGI buffer * lock held. */ error = xfs_ialloc_read_agi(pag, *tpp, &agbp); if (error) return error; if (!pag->pagi_freecount) { if (!ok_alloc) { error = -EAGAIN; goto out_release; } error = xfs_ialloc_ag_alloc(pag, *tpp, agbp); if (error < 0) goto out_release; /* * We successfully allocated space for an inode cluster in this * AG. Roll the transaction so that we can allocate one of the * new inodes. */ ASSERT(pag->pagi_freecount > 0); error = xfs_dialloc_roll(tpp, agbp); if (error) goto out_release; } /* Allocate an inode in the found AG */ error = xfs_dialloc_ag(pag, *tpp, agbp, parent, &ino); if (!error) *new_ino = ino; return error; out_release: xfs_trans_brelse(*tpp, agbp); return error; } /* * Allocate an on-disk inode. * * Mode is used to tell whether the new inode is a directory and hence where to * locate it. The on-disk inode that is allocated will be returned in @new_ino * on success, otherwise an error will be set to indicate the failure (e.g. * -ENOSPC). */ int xfs_dialloc( struct xfs_trans **tpp, xfs_ino_t parent, umode_t mode, xfs_ino_t *new_ino) { struct xfs_mount *mp = (*tpp)->t_mountp; xfs_agnumber_t agno; int error = 0; xfs_agnumber_t start_agno; struct xfs_perag *pag; struct xfs_ino_geometry *igeo = M_IGEO(mp); bool ok_alloc = true; bool low_space = false; int flags; xfs_ino_t ino = NULLFSINO; /* * Directories, symlinks, and regular files frequently allocate at least * one block, so factor that potential expansion when we examine whether * an AG has enough space for file creation. */ if (S_ISDIR(mode)) start_agno = (atomic_inc_return(&mp->m_agirotor) - 1) % mp->m_maxagi; else { start_agno = XFS_INO_TO_AGNO(mp, parent); if (start_agno >= mp->m_maxagi) start_agno = 0; } /* * If we have already hit the ceiling of inode blocks then clear * ok_alloc so we scan all available agi structures for a free * inode. * * Read rough value of mp->m_icount by percpu_counter_read_positive, * which will sacrifice the preciseness but improve the performance. */ if (igeo->maxicount && percpu_counter_read_positive(&mp->m_icount) + igeo->ialloc_inos > igeo->maxicount) { ok_alloc = false; } /* * If we are near to ENOSPC, we want to prefer allocation from AGs that * have free inodes in them rather than use up free space allocating new * inode chunks. Hence we turn off allocation for the first non-blocking * pass through the AGs if we are near ENOSPC to consume free inodes * that we can immediately allocate, but then we allow allocation on the * second pass if we fail to find an AG with free inodes in it. */ if (percpu_counter_read_positive(&mp->m_fdblocks) < mp->m_low_space[XFS_LOWSP_1_PCNT]) { ok_alloc = false; low_space = true; } /* * Loop until we find an allocation group that either has free inodes * or in which we can allocate some inodes. Iterate through the * allocation groups upward, wrapping at the end. */ flags = XFS_ALLOC_FLAG_TRYLOCK; retry: for_each_perag_wrap_at(mp, start_agno, mp->m_maxagi, agno, pag) { if (xfs_dialloc_good_ag(pag, *tpp, mode, flags, ok_alloc)) { error = xfs_dialloc_try_ag(pag, tpp, parent, &ino, ok_alloc); if (error != -EAGAIN) break; error = 0; } if (xfs_is_shutdown(mp)) { error = -EFSCORRUPTED; break; } } if (pag) xfs_perag_rele(pag); if (error) return error; if (ino == NULLFSINO) { if (flags) { flags = 0; if (low_space) ok_alloc = true; goto retry; } return -ENOSPC; } *new_ino = ino; return 0; } /* * Free the blocks of an inode chunk. We must consider that the inode chunk * might be sparse and only free the regions that are allocated as part of the * chunk. */ static int xfs_difree_inode_chunk( struct xfs_trans *tp, xfs_agnumber_t agno, struct xfs_inobt_rec_incore *rec) { struct xfs_mount *mp = tp->t_mountp; xfs_agblock_t sagbno = XFS_AGINO_TO_AGBNO(mp, rec->ir_startino); int startidx, endidx; int nextbit; xfs_agblock_t agbno; int contigblk; DECLARE_BITMAP(holemask, XFS_INOBT_HOLEMASK_BITS); if (!xfs_inobt_issparse(rec->ir_holemask)) { /* not sparse, calculate extent info directly */ return xfs_free_extent_later(tp, XFS_AGB_TO_FSB(mp, agno, sagbno), M_IGEO(mp)->ialloc_blks, &XFS_RMAP_OINFO_INODES, XFS_AG_RESV_NONE, false); } /* holemask is only 16-bits (fits in an unsigned long) */ ASSERT(sizeof(rec->ir_holemask) <= sizeof(holemask[0])); holemask[0] = rec->ir_holemask; /* * Find contiguous ranges of zeroes (i.e., allocated regions) in the * holemask and convert the start/end index of each range to an extent. * We start with the start and end index both pointing at the first 0 in * the mask. */ startidx = endidx = find_first_zero_bit(holemask, XFS_INOBT_HOLEMASK_BITS); nextbit = startidx + 1; while (startidx < XFS_INOBT_HOLEMASK_BITS) { int error; nextbit = find_next_zero_bit(holemask, XFS_INOBT_HOLEMASK_BITS, nextbit); /* * If the next zero bit is contiguous, update the end index of * the current range and continue. */ if (nextbit != XFS_INOBT_HOLEMASK_BITS && nextbit == endidx + 1) { endidx = nextbit; goto next; } /* * nextbit is not contiguous with the current end index. Convert * the current start/end to an extent and add it to the free * list. */ agbno = sagbno + (startidx * XFS_INODES_PER_HOLEMASK_BIT) / mp->m_sb.sb_inopblock; contigblk = ((endidx - startidx + 1) * XFS_INODES_PER_HOLEMASK_BIT) / mp->m_sb.sb_inopblock; ASSERT(agbno % mp->m_sb.sb_spino_align == 0); ASSERT(contigblk % mp->m_sb.sb_spino_align == 0); error = xfs_free_extent_later(tp, XFS_AGB_TO_FSB(mp, agno, agbno), contigblk, &XFS_RMAP_OINFO_INODES, XFS_AG_RESV_NONE, false); if (error) return error; /* reset range to current bit and carry on... */ startidx = endidx = nextbit; next: nextbit++; } return 0; } STATIC int xfs_difree_inobt( struct xfs_perag *pag, struct xfs_trans *tp, struct xfs_buf *agbp, xfs_agino_t agino, struct xfs_icluster *xic, struct xfs_inobt_rec_incore *orec) { struct xfs_mount *mp = pag->pag_mount; struct xfs_agi *agi = agbp->b_addr; struct xfs_btree_cur *cur; struct xfs_inobt_rec_incore rec; int ilen; int error; int i; int off; ASSERT(agi->agi_magicnum == cpu_to_be32(XFS_AGI_MAGIC)); ASSERT(XFS_AGINO_TO_AGBNO(mp, agino) < be32_to_cpu(agi->agi_length)); /* * Initialize the cursor. */ cur = xfs_inobt_init_cursor(pag, tp, agbp, XFS_BTNUM_INO); error = xfs_check_agi_freecount(cur); if (error) goto error0; /* * Look for the entry describing this inode. */ if ((error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_LE, &i))) { xfs_warn(mp, "%s: xfs_inobt_lookup() returned error %d.", __func__, error); goto error0; } if (XFS_IS_CORRUPT(mp, i != 1)) { error = -EFSCORRUPTED; goto error0; } error = xfs_inobt_get_rec(cur, &rec, &i); if (error) { xfs_warn(mp, "%s: xfs_inobt_get_rec() returned error %d.", __func__, error); goto error0; } if (XFS_IS_CORRUPT(mp, i != 1)) { error = -EFSCORRUPTED; goto error0; } /* * Get the offset in the inode chunk. */ off = agino - rec.ir_startino; ASSERT(off >= 0 && off < XFS_INODES_PER_CHUNK); ASSERT(!(rec.ir_free & XFS_INOBT_MASK(off))); /* * Mark the inode free & increment the count. */ rec.ir_free |= XFS_INOBT_MASK(off); rec.ir_freecount++; /* * When an inode chunk is free, it becomes eligible for removal. Don't * remove the chunk if the block size is large enough for multiple inode * chunks (that might not be free). */ if (!xfs_has_ikeep(mp) && rec.ir_free == XFS_INOBT_ALL_FREE && mp->m_sb.sb_inopblock <= XFS_INODES_PER_CHUNK) { xic->deleted = true; xic->first_ino = XFS_AGINO_TO_INO(mp, pag->pag_agno, rec.ir_startino); xic->alloc = xfs_inobt_irec_to_allocmask(&rec); /* * Remove the inode cluster from the AGI B+Tree, adjust the * AGI and Superblock inode counts, and mark the disk space * to be freed when the transaction is committed. */ ilen = rec.ir_freecount; be32_add_cpu(&agi->agi_count, -ilen); be32_add_cpu(&agi->agi_freecount, -(ilen - 1)); xfs_ialloc_log_agi(tp, agbp, XFS_AGI_COUNT | XFS_AGI_FREECOUNT); pag->pagi_freecount -= ilen - 1; pag->pagi_count -= ilen; xfs_trans_mod_sb(tp, XFS_TRANS_SB_ICOUNT, -ilen); xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, -(ilen - 1)); if ((error = xfs_btree_delete(cur, &i))) { xfs_warn(mp, "%s: xfs_btree_delete returned error %d.", __func__, error); goto error0; } error = xfs_difree_inode_chunk(tp, pag->pag_agno, &rec); if (error) goto error0; } else { xic->deleted = false; error = xfs_inobt_update(cur, &rec); if (error) { xfs_warn(mp, "%s: xfs_inobt_update returned error %d.", __func__, error); goto error0; } /* * Change the inode free counts and log the ag/sb changes. */ be32_add_cpu(&agi->agi_freecount, 1); xfs_ialloc_log_agi(tp, agbp, XFS_AGI_FREECOUNT); pag->pagi_freecount++; xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, 1); } error = xfs_check_agi_freecount(cur); if (error) goto error0; *orec = rec; xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); return 0; error0: xfs_btree_del_cursor(cur, XFS_BTREE_ERROR); return error; } /* * Free an inode in the free inode btree. */ STATIC int xfs_difree_finobt( struct xfs_perag *pag, struct xfs_trans *tp, struct xfs_buf *agbp, xfs_agino_t agino, struct xfs_inobt_rec_incore *ibtrec) /* inobt record */ { struct xfs_mount *mp = pag->pag_mount; struct xfs_btree_cur *cur; struct xfs_inobt_rec_incore rec; int offset = agino - ibtrec->ir_startino; int error; int i; cur = xfs_inobt_init_cursor(pag, tp, agbp, XFS_BTNUM_FINO); error = xfs_inobt_lookup(cur, ibtrec->ir_startino, XFS_LOOKUP_EQ, &i); if (error) goto error; if (i == 0) { /* * If the record does not exist in the finobt, we must have just * freed an inode in a previously fully allocated chunk. If not, * something is out of sync. */ if (XFS_IS_CORRUPT(mp, ibtrec->ir_freecount != 1)) { error = -EFSCORRUPTED; goto error; } error = xfs_inobt_insert_rec(cur, ibtrec->ir_holemask, ibtrec->ir_count, ibtrec->ir_freecount, ibtrec->ir_free, &i); if (error) goto error; ASSERT(i == 1); goto out; } /* * Read and update the existing record. We could just copy the ibtrec * across here, but that would defeat the purpose of having redundant * metadata. By making the modifications independently, we can catch * corruptions that we wouldn't see if we just copied from one record * to another. */ error = xfs_inobt_get_rec(cur, &rec, &i); if (error) goto error; if (XFS_IS_CORRUPT(mp, i != 1)) { error = -EFSCORRUPTED; goto error; } rec.ir_free |= XFS_INOBT_MASK(offset); rec.ir_freecount++; if (XFS_IS_CORRUPT(mp, rec.ir_free != ibtrec->ir_free || rec.ir_freecount != ibtrec->ir_freecount)) { error = -EFSCORRUPTED; goto error; } /* * The content of inobt records should always match between the inobt * and finobt. The lifecycle of records in the finobt is different from * the inobt in that the finobt only tracks records with at least one * free inode. Hence, if all of the inodes are free and we aren't * keeping inode chunks permanently on disk, remove the record. * Otherwise, update the record with the new information. * * Note that we currently can't free chunks when the block size is large * enough for multiple chunks. Leave the finobt record to remain in sync * with the inobt. */ if (!xfs_has_ikeep(mp) && rec.ir_free == XFS_INOBT_ALL_FREE && mp->m_sb.sb_inopblock <= XFS_INODES_PER_CHUNK) { error = xfs_btree_delete(cur, &i); if (error) goto error; ASSERT(i == 1); } else { error = xfs_inobt_update(cur, &rec); if (error) goto error; } out: error = xfs_check_agi_freecount(cur); if (error) goto error; xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); return 0; error: xfs_btree_del_cursor(cur, XFS_BTREE_ERROR); return error; } /* * Free disk inode. Carefully avoids touching the incore inode, all * manipulations incore are the caller's responsibility. * The on-disk inode is not changed by this operation, only the * btree (free inode mask) is changed. */ int xfs_difree( struct xfs_trans *tp, struct xfs_perag *pag, xfs_ino_t inode, struct xfs_icluster *xic) { /* REFERENCED */ xfs_agblock_t agbno; /* block number containing inode */ struct xfs_buf *agbp; /* buffer for allocation group header */ xfs_agino_t agino; /* allocation group inode number */ int error; /* error return value */ struct xfs_mount *mp = tp->t_mountp; struct xfs_inobt_rec_incore rec;/* btree record */ /* * Break up inode number into its components. */ if (pag->pag_agno != XFS_INO_TO_AGNO(mp, inode)) { xfs_warn(mp, "%s: agno != pag->pag_agno (%d != %d).", __func__, XFS_INO_TO_AGNO(mp, inode), pag->pag_agno); ASSERT(0); return -EINVAL; } agino = XFS_INO_TO_AGINO(mp, inode); if (inode != XFS_AGINO_TO_INO(mp, pag->pag_agno, agino)) { xfs_warn(mp, "%s: inode != XFS_AGINO_TO_INO() (%llu != %llu).", __func__, (unsigned long long)inode, (unsigned long long)XFS_AGINO_TO_INO(mp, pag->pag_agno, agino)); ASSERT(0); return -EINVAL; } agbno = XFS_AGINO_TO_AGBNO(mp, agino); if (agbno >= mp->m_sb.sb_agblocks) { xfs_warn(mp, "%s: agbno >= mp->m_sb.sb_agblocks (%d >= %d).", __func__, agbno, mp->m_sb.sb_agblocks); ASSERT(0); return -EINVAL; } /* * Get the allocation group header. */ error = xfs_ialloc_read_agi(pag, tp, &agbp); if (error) { xfs_warn(mp, "%s: xfs_ialloc_read_agi() returned error %d.", __func__, error); return error; } /* * Fix up the inode allocation btree. */ error = xfs_difree_inobt(pag, tp, agbp, agino, xic, &rec); if (error) goto error0; /* * Fix up the free inode btree. */ if (xfs_has_finobt(mp)) { error = xfs_difree_finobt(pag, tp, agbp, agino, &rec); if (error) goto error0; } return 0; error0: return error; } STATIC int xfs_imap_lookup( struct xfs_perag *pag, struct xfs_trans *tp, xfs_agino_t agino, xfs_agblock_t agbno, xfs_agblock_t *chunk_agbno, xfs_agblock_t *offset_agbno, int flags) { struct xfs_mount *mp = pag->pag_mount; struct xfs_inobt_rec_incore rec; struct xfs_btree_cur *cur; struct xfs_buf *agbp; int error; int i; error = xfs_ialloc_read_agi(pag, tp, &agbp); if (error) { xfs_alert(mp, "%s: xfs_ialloc_read_agi() returned error %d, agno %d", __func__, error, pag->pag_agno); return error; } /* * Lookup the inode record for the given agino. If the record cannot be * found, then it's an invalid inode number and we should abort. Once * we have a record, we need to ensure it contains the inode number * we are looking up. */ cur = xfs_inobt_init_cursor(pag, tp, agbp, XFS_BTNUM_INO); error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_LE, &i); if (!error) { if (i) error = xfs_inobt_get_rec(cur, &rec, &i); if (!error && i == 0) error = -EINVAL; } xfs_trans_brelse(tp, agbp); xfs_btree_del_cursor(cur, error); if (error) return error; /* check that the returned record contains the required inode */ if (rec.ir_startino > agino || rec.ir_startino + M_IGEO(mp)->ialloc_inos <= agino) return -EINVAL; /* for untrusted inodes check it is allocated first */ if ((flags & XFS_IGET_UNTRUSTED) && (rec.ir_free & XFS_INOBT_MASK(agino - rec.ir_startino))) return -EINVAL; *chunk_agbno = XFS_AGINO_TO_AGBNO(mp, rec.ir_startino); *offset_agbno = agbno - *chunk_agbno; return 0; } /* * Return the location of the inode in imap, for mapping it into a buffer. */ int xfs_imap( struct xfs_perag *pag, struct xfs_trans *tp, xfs_ino_t ino, /* inode to locate */ struct xfs_imap *imap, /* location map structure */ uint flags) /* flags for inode btree lookup */ { struct xfs_mount *mp = pag->pag_mount; xfs_agblock_t agbno; /* block number of inode in the alloc group */ xfs_agino_t agino; /* inode number within alloc group */ xfs_agblock_t chunk_agbno; /* first block in inode chunk */ xfs_agblock_t cluster_agbno; /* first block in inode cluster */ int error; /* error code */ int offset; /* index of inode in its buffer */ xfs_agblock_t offset_agbno; /* blks from chunk start to inode */ ASSERT(ino != NULLFSINO); /* * Split up the inode number into its parts. */ agino = XFS_INO_TO_AGINO(mp, ino); agbno = XFS_AGINO_TO_AGBNO(mp, agino); if (agbno >= mp->m_sb.sb_agblocks || ino != XFS_AGINO_TO_INO(mp, pag->pag_agno, agino)) { error = -EINVAL; #ifdef DEBUG /* * Don't output diagnostic information for untrusted inodes * as they can be invalid without implying corruption. */ if (flags & XFS_IGET_UNTRUSTED) return error; if (agbno >= mp->m_sb.sb_agblocks) { xfs_alert(mp, "%s: agbno (0x%llx) >= mp->m_sb.sb_agblocks (0x%lx)", __func__, (unsigned long long)agbno, (unsigned long)mp->m_sb.sb_agblocks); } if (ino != XFS_AGINO_TO_INO(mp, pag->pag_agno, agino)) { xfs_alert(mp, "%s: ino (0x%llx) != XFS_AGINO_TO_INO() (0x%llx)", __func__, ino, XFS_AGINO_TO_INO(mp, pag->pag_agno, agino)); } xfs_stack_trace(); #endif /* DEBUG */ return error; } /* * For bulkstat and handle lookups, we have an untrusted inode number * that we have to verify is valid. We cannot do this just by reading * the inode buffer as it may have been unlinked and removed leaving * inodes in stale state on disk. Hence we have to do a btree lookup * in all cases where an untrusted inode number is passed. */ if (flags & XFS_IGET_UNTRUSTED) { error = xfs_imap_lookup(pag, tp, agino, agbno, &chunk_agbno, &offset_agbno, flags); if (error) return error; goto out_map; } /* * If the inode cluster size is the same as the blocksize or * smaller we get to the buffer by simple arithmetics. */ if (M_IGEO(mp)->blocks_per_cluster == 1) { offset = XFS_INO_TO_OFFSET(mp, ino); ASSERT(offset < mp->m_sb.sb_inopblock); imap->im_blkno = XFS_AGB_TO_DADDR(mp, pag->pag_agno, agbno); imap->im_len = XFS_FSB_TO_BB(mp, 1); imap->im_boffset = (unsigned short)(offset << mp->m_sb.sb_inodelog); return 0; } /* * If the inode chunks are aligned then use simple maths to * find the location. Otherwise we have to do a btree * lookup to find the location. */ if (M_IGEO(mp)->inoalign_mask) { offset_agbno = agbno & M_IGEO(mp)->inoalign_mask; chunk_agbno = agbno - offset_agbno; } else { error = xfs_imap_lookup(pag, tp, agino, agbno, &chunk_agbno, &offset_agbno, flags); if (error) return error; } out_map: ASSERT(agbno >= chunk_agbno); cluster_agbno = chunk_agbno + ((offset_agbno / M_IGEO(mp)->blocks_per_cluster) * M_IGEO(mp)->blocks_per_cluster); offset = ((agbno - cluster_agbno) * mp->m_sb.sb_inopblock) + XFS_INO_TO_OFFSET(mp, ino); imap->im_blkno = XFS_AGB_TO_DADDR(mp, pag->pag_agno, cluster_agbno); imap->im_len = XFS_FSB_TO_BB(mp, M_IGEO(mp)->blocks_per_cluster); imap->im_boffset = (unsigned short)(offset << mp->m_sb.sb_inodelog); /* * If the inode number maps to a block outside the bounds * of the file system then return NULL rather than calling * read_buf and panicing when we get an error from the * driver. */ if ((imap->im_blkno + imap->im_len) > XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks)) { xfs_alert(mp, "%s: (im_blkno (0x%llx) + im_len (0x%llx)) > sb_dblocks (0x%llx)", __func__, (unsigned long long) imap->im_blkno, (unsigned long long) imap->im_len, XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks)); return -EINVAL; } return 0; } /* * Log specified fields for the ag hdr (inode section). The growth of the agi * structure over time requires that we interpret the buffer as two logical * regions delineated by the end of the unlinked list. This is due to the size * of the hash table and its location in the middle of the agi. * * For example, a request to log a field before agi_unlinked and a field after * agi_unlinked could cause us to log the entire hash table and use an excessive * amount of log space. To avoid this behavior, log the region up through * agi_unlinked in one call and the region after agi_unlinked through the end of * the structure in another. */ void xfs_ialloc_log_agi( struct xfs_trans *tp, struct xfs_buf *bp, uint32_t fields) { int first; /* first byte number */ int last; /* last byte number */ static const short offsets[] = { /* field starting offsets */ /* keep in sync with bit definitions */ offsetof(xfs_agi_t, agi_magicnum), offsetof(xfs_agi_t, agi_versionnum), offsetof(xfs_agi_t, agi_seqno), offsetof(xfs_agi_t, agi_length), offsetof(xfs_agi_t, agi_count), offsetof(xfs_agi_t, agi_root), offsetof(xfs_agi_t, agi_level), offsetof(xfs_agi_t, agi_freecount), offsetof(xfs_agi_t, agi_newino), offsetof(xfs_agi_t, agi_dirino), offsetof(xfs_agi_t, agi_unlinked), offsetof(xfs_agi_t, agi_free_root), offsetof(xfs_agi_t, agi_free_level), offsetof(xfs_agi_t, agi_iblocks), sizeof(xfs_agi_t) }; #ifdef DEBUG struct xfs_agi *agi = bp->b_addr; ASSERT(agi->agi_magicnum == cpu_to_be32(XFS_AGI_MAGIC)); #endif /* * Compute byte offsets for the first and last fields in the first * region and log the agi buffer. This only logs up through * agi_unlinked. */ if (fields & XFS_AGI_ALL_BITS_R1) { xfs_btree_offsets(fields, offsets, XFS_AGI_NUM_BITS_R1, &first, &last); xfs_trans_log_buf(tp, bp, first, last); } /* * Mask off the bits in the first region and calculate the first and * last field offsets for any bits in the second region. */ fields &= ~XFS_AGI_ALL_BITS_R1; if (fields) { xfs_btree_offsets(fields, offsets, XFS_AGI_NUM_BITS_R2, &first, &last); xfs_trans_log_buf(tp, bp, first, last); } } static xfs_failaddr_t xfs_agi_verify( struct xfs_buf *bp) { struct xfs_mount *mp = bp->b_mount; struct xfs_agi *agi = bp->b_addr; xfs_failaddr_t fa; uint32_t agi_seqno = be32_to_cpu(agi->agi_seqno); uint32_t agi_length = be32_to_cpu(agi->agi_length); int i; if (xfs_has_crc(mp)) { if (!uuid_equal(&agi->agi_uuid, &mp->m_sb.sb_meta_uuid)) return __this_address; if (!xfs_log_check_lsn(mp, be64_to_cpu(agi->agi_lsn))) return __this_address; } /* * Validate the magic number of the agi block. */ if (!xfs_verify_magic(bp, agi->agi_magicnum)) return __this_address; if (!XFS_AGI_GOOD_VERSION(be32_to_cpu(agi->agi_versionnum))) return __this_address; fa = xfs_validate_ag_length(bp, agi_seqno, agi_length); if (fa) return fa; if (be32_to_cpu(agi->agi_level) < 1 || be32_to_cpu(agi->agi_level) > M_IGEO(mp)->inobt_maxlevels) return __this_address; if (xfs_has_finobt(mp) && (be32_to_cpu(agi->agi_free_level) < 1 || be32_to_cpu(agi->agi_free_level) > M_IGEO(mp)->inobt_maxlevels)) return __this_address; for (i = 0; i < XFS_AGI_UNLINKED_BUCKETS; i++) { if (agi->agi_unlinked[i] == cpu_to_be32(NULLAGINO)) continue; if (!xfs_verify_ino(mp, be32_to_cpu(agi->agi_unlinked[i]))) return __this_address; } return NULL; } static void xfs_agi_read_verify( struct xfs_buf *bp) { struct xfs_mount *mp = bp->b_mount; xfs_failaddr_t fa; if (xfs_has_crc(mp) && !xfs_buf_verify_cksum(bp, XFS_AGI_CRC_OFF)) xfs_verifier_error(bp, -EFSBADCRC, __this_address); else { fa = xfs_agi_verify(bp); if (XFS_TEST_ERROR(fa, mp, XFS_ERRTAG_IALLOC_READ_AGI)) xfs_verifier_error(bp, -EFSCORRUPTED, fa); } } static void xfs_agi_write_verify( struct xfs_buf *bp) { struct xfs_mount *mp = bp->b_mount; struct xfs_buf_log_item *bip = bp->b_log_item; struct xfs_agi *agi = bp->b_addr; xfs_failaddr_t fa; fa = xfs_agi_verify(bp); if (fa) { xfs_verifier_error(bp, -EFSCORRUPTED, fa); return; } if (!xfs_has_crc(mp)) return; if (bip) agi->agi_lsn = cpu_to_be64(bip->bli_item.li_lsn); xfs_buf_update_cksum(bp, XFS_AGI_CRC_OFF); } const struct xfs_buf_ops xfs_agi_buf_ops = { .name = "xfs_agi", .magic = { cpu_to_be32(XFS_AGI_MAGIC), cpu_to_be32(XFS_AGI_MAGIC) }, .verify_read = xfs_agi_read_verify, .verify_write = xfs_agi_write_verify, .verify_struct = xfs_agi_verify, }; /* * Read in the allocation group header (inode allocation section) */ int xfs_read_agi( struct xfs_perag *pag, struct xfs_trans *tp, struct xfs_buf **agibpp) { struct xfs_mount *mp = pag->pag_mount; int error; trace_xfs_read_agi(pag->pag_mount, pag->pag_agno); error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, XFS_AG_DADDR(mp, pag->pag_agno, XFS_AGI_DADDR(mp)), XFS_FSS_TO_BB(mp, 1), 0, agibpp, &xfs_agi_buf_ops); if (error) return error; if (tp) xfs_trans_buf_set_type(tp, *agibpp, XFS_BLFT_AGI_BUF); xfs_buf_set_ref(*agibpp, XFS_AGI_REF); return 0; } /* * Read in the agi and initialise the per-ag data. If the caller supplies a * @agibpp, return the locked AGI buffer to them, otherwise release it. */ int xfs_ialloc_read_agi( struct xfs_perag *pag, struct xfs_trans *tp, struct xfs_buf **agibpp) { struct xfs_buf *agibp; struct xfs_agi *agi; int error; trace_xfs_ialloc_read_agi(pag->pag_mount, pag->pag_agno); error = xfs_read_agi(pag, tp, &agibp); if (error) return error; agi = agibp->b_addr; if (!xfs_perag_initialised_agi(pag)) { pag->pagi_freecount = be32_to_cpu(agi->agi_freecount); pag->pagi_count = be32_to_cpu(agi->agi_count); set_bit(XFS_AGSTATE_AGI_INIT, &pag->pag_opstate); } /* * It's possible for these to be out of sync if * we are in the middle of a forced shutdown. */ ASSERT(pag->pagi_freecount == be32_to_cpu(agi->agi_freecount) || xfs_is_shutdown(pag->pag_mount)); if (agibpp) *agibpp = agibp; else xfs_trans_brelse(tp, agibp); return 0; } /* How many inodes are backed by inode clusters ondisk? */ STATIC int xfs_ialloc_count_ondisk( struct xfs_btree_cur *cur, xfs_agino_t low, xfs_agino_t high, unsigned int *allocated) { struct xfs_inobt_rec_incore irec; unsigned int ret = 0; int has_record; int error; error = xfs_inobt_lookup(cur, low, XFS_LOOKUP_LE, &has_record); if (error) return error; while (has_record) { unsigned int i, hole_idx; error = xfs_inobt_get_rec(cur, &irec, &has_record); if (error) return error; if (irec.ir_startino > high) break; for (i = 0; i < XFS_INODES_PER_CHUNK; i++) { if (irec.ir_startino + i < low) continue; if (irec.ir_startino + i > high) break; hole_idx = i / XFS_INODES_PER_HOLEMASK_BIT; if (!(irec.ir_holemask & (1U << hole_idx))) ret++; } error = xfs_btree_increment(cur, 0, &has_record); if (error) return error; } *allocated = ret; return 0; } /* Is there an inode record covering a given extent? */ int xfs_ialloc_has_inodes_at_extent( struct xfs_btree_cur *cur, xfs_agblock_t bno, xfs_extlen_t len, enum xbtree_recpacking *outcome) { xfs_agino_t agino; xfs_agino_t last_agino; unsigned int allocated; int error; agino = XFS_AGB_TO_AGINO(cur->bc_mp, bno); last_agino = XFS_AGB_TO_AGINO(cur->bc_mp, bno + len) - 1; error = xfs_ialloc_count_ondisk(cur, agino, last_agino, &allocated); if (error) return error; if (allocated == 0) *outcome = XBTREE_RECPACKING_EMPTY; else if (allocated == last_agino - agino + 1) *outcome = XBTREE_RECPACKING_FULL; else *outcome = XBTREE_RECPACKING_SPARSE; return 0; } struct xfs_ialloc_count_inodes { xfs_agino_t count; xfs_agino_t freecount; }; /* Record inode counts across all inobt records. */ STATIC int xfs_ialloc_count_inodes_rec( struct xfs_btree_cur *cur, const union xfs_btree_rec *rec, void *priv) { struct xfs_inobt_rec_incore irec; struct xfs_ialloc_count_inodes *ci = priv; xfs_failaddr_t fa; xfs_inobt_btrec_to_irec(cur->bc_mp, rec, &irec); fa = xfs_inobt_check_irec(cur->bc_ag.pag, &irec); if (fa) return xfs_inobt_complain_bad_rec(cur, fa, &irec); ci->count += irec.ir_count; ci->freecount += irec.ir_freecount; return 0; } /* Count allocated and free inodes under an inobt. */ int xfs_ialloc_count_inodes( struct xfs_btree_cur *cur, xfs_agino_t *count, xfs_agino_t *freecount) { struct xfs_ialloc_count_inodes ci = {0}; int error; ASSERT(cur->bc_btnum == XFS_BTNUM_INO); error = xfs_btree_query_all(cur, xfs_ialloc_count_inodes_rec, &ci); if (error) return error; *count = ci.count; *freecount = ci.freecount; return 0; } /* * Initialize inode-related geometry information. * * Compute the inode btree min and max levels and set maxicount. * * Set the inode cluster size. This may still be overridden by the file * system block size if it is larger than the chosen cluster size. * * For v5 filesystems, scale the cluster size with the inode size to keep a * constant ratio of inode per cluster buffer, but only if mkfs has set the * inode alignment value appropriately for larger cluster sizes. * * Then compute the inode cluster alignment information. */ void xfs_ialloc_setup_geometry( struct xfs_mount *mp) { struct xfs_sb *sbp = &mp->m_sb; struct xfs_ino_geometry *igeo = M_IGEO(mp); uint64_t icount; uint inodes; igeo->new_diflags2 = 0; if (xfs_has_bigtime(mp)) igeo->new_diflags2 |= XFS_DIFLAG2_BIGTIME; if (xfs_has_large_extent_counts(mp)) igeo->new_diflags2 |= XFS_DIFLAG2_NREXT64; /* Compute inode btree geometry. */ igeo->agino_log = sbp->sb_inopblog + sbp->sb_agblklog; igeo->inobt_mxr[0] = xfs_inobt_maxrecs(mp, sbp->sb_blocksize, 1); igeo->inobt_mxr[1] = xfs_inobt_maxrecs(mp, sbp->sb_blocksize, 0); igeo->inobt_mnr[0] = igeo->inobt_mxr[0] / 2; igeo->inobt_mnr[1] = igeo->inobt_mxr[1] / 2; igeo->ialloc_inos = max_t(uint16_t, XFS_INODES_PER_CHUNK, sbp->sb_inopblock); igeo->ialloc_blks = igeo->ialloc_inos >> sbp->sb_inopblog; if (sbp->sb_spino_align) igeo->ialloc_min_blks = sbp->sb_spino_align; else igeo->ialloc_min_blks = igeo->ialloc_blks; /* Compute and fill in value of m_ino_geo.inobt_maxlevels. */ inodes = (1LL << XFS_INO_AGINO_BITS(mp)) >> XFS_INODES_PER_CHUNK_LOG; igeo->inobt_maxlevels = xfs_btree_compute_maxlevels(igeo->inobt_mnr, inodes); ASSERT(igeo->inobt_maxlevels <= xfs_iallocbt_maxlevels_ondisk()); /* * Set the maximum inode count for this filesystem, being careful not * to use obviously garbage sb_inopblog/sb_inopblock values. Regular * users should never get here due to failing sb verification, but * certain users (xfs_db) need to be usable even with corrupt metadata. */ if (sbp->sb_imax_pct && igeo->ialloc_blks) { /* * Make sure the maximum inode count is a multiple * of the units we allocate inodes in. */ icount = sbp->sb_dblocks * sbp->sb_imax_pct; do_div(icount, 100); do_div(icount, igeo->ialloc_blks); igeo->maxicount = XFS_FSB_TO_INO(mp, icount * igeo->ialloc_blks); } else { igeo->maxicount = 0; } /* * Compute the desired size of an inode cluster buffer size, which * starts at 8K and (on v5 filesystems) scales up with larger inode * sizes. * * Preserve the desired inode cluster size because the sparse inodes * feature uses that desired size (not the actual size) to compute the * sparse inode alignment. The mount code validates this value, so we * cannot change the behavior. */ igeo->inode_cluster_size_raw = XFS_INODE_BIG_CLUSTER_SIZE; if (xfs_has_v3inodes(mp)) { int new_size = igeo->inode_cluster_size_raw; new_size *= mp->m_sb.sb_inodesize / XFS_DINODE_MIN_SIZE; if (mp->m_sb.sb_inoalignmt >= XFS_B_TO_FSBT(mp, new_size)) igeo->inode_cluster_size_raw = new_size; } /* Calculate inode cluster ratios. */ if (igeo->inode_cluster_size_raw > mp->m_sb.sb_blocksize) igeo->blocks_per_cluster = XFS_B_TO_FSBT(mp, igeo->inode_cluster_size_raw); else igeo->blocks_per_cluster = 1; igeo->inode_cluster_size = XFS_FSB_TO_B(mp, igeo->blocks_per_cluster); igeo->inodes_per_cluster = XFS_FSB_TO_INO(mp, igeo->blocks_per_cluster); /* Calculate inode cluster alignment. */ if (xfs_has_align(mp) && mp->m_sb.sb_inoalignmt >= igeo->blocks_per_cluster) igeo->cluster_align = mp->m_sb.sb_inoalignmt; else igeo->cluster_align = 1; igeo->inoalign_mask = igeo->cluster_align - 1; igeo->cluster_align_inodes = XFS_FSB_TO_INO(mp, igeo->cluster_align); /* * If we are using stripe alignment, check whether * the stripe unit is a multiple of the inode alignment */ if (mp->m_dalign && igeo->inoalign_mask && !(mp->m_dalign & igeo->inoalign_mask)) igeo->ialloc_align = mp->m_dalign; else igeo->ialloc_align = 0; } /* Compute the location of the root directory inode that is laid out by mkfs. */ xfs_ino_t xfs_ialloc_calc_rootino( struct xfs_mount *mp, int sunit) { struct xfs_ino_geometry *igeo = M_IGEO(mp); xfs_agblock_t first_bno; /* * Pre-calculate the geometry of AG 0. We know what it looks like * because libxfs knows how to create allocation groups now. * * first_bno is the first block in which mkfs could possibly have * allocated the root directory inode, once we factor in the metadata * that mkfs formats before it. Namely, the four AG headers... */ first_bno = howmany(4 * mp->m_sb.sb_sectsize, mp->m_sb.sb_blocksize); /* ...the two free space btree roots... */ first_bno += 2; /* ...the inode btree root... */ first_bno += 1; /* ...the initial AGFL... */ first_bno += xfs_alloc_min_freelist(mp, NULL); /* ...the free inode btree root... */ if (xfs_has_finobt(mp)) first_bno++; /* ...the reverse mapping btree root... */ if (xfs_has_rmapbt(mp)) first_bno++; /* ...the reference count btree... */ if (xfs_has_reflink(mp)) first_bno++; /* * ...and the log, if it is allocated in the first allocation group. * * This can happen with filesystems that only have a single * allocation group, or very odd geometries created by old mkfs * versions on very small filesystems. */ if (xfs_ag_contains_log(mp, 0)) first_bno += mp->m_sb.sb_logblocks; /* * Now round first_bno up to whatever allocation alignment is given * by the filesystem or was passed in. */ if (xfs_has_dalign(mp) && igeo->ialloc_align > 0) first_bno = roundup(first_bno, sunit); else if (xfs_has_align(mp) && mp->m_sb.sb_inoalignmt > 1) first_bno = roundup(first_bno, mp->m_sb.sb_inoalignmt); return XFS_AGINO_TO_INO(mp, 0, XFS_AGB_TO_AGINO(mp, first_bno)); } /* * Ensure there are not sparse inode clusters that cross the new EOAG. * * This is a no-op for non-spinode filesystems since clusters are always fully * allocated and checking the bnobt suffices. However, a spinode filesystem * could have a record where the upper inodes are free blocks. If those blocks * were removed from the filesystem, the inode record would extend beyond EOAG, * which will be flagged as corruption. */ int xfs_ialloc_check_shrink( struct xfs_perag *pag, struct xfs_trans *tp, struct xfs_buf *agibp, xfs_agblock_t new_length) { struct xfs_inobt_rec_incore rec; struct xfs_btree_cur *cur; xfs_agino_t agino; int has; int error; if (!xfs_has_sparseinodes(pag->pag_mount)) return 0; cur = xfs_inobt_init_cursor(pag, tp, agibp, XFS_BTNUM_INO); /* Look up the inobt record that would correspond to the new EOFS. */ agino = XFS_AGB_TO_AGINO(pag->pag_mount, new_length); error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_LE, &has); if (error || !has) goto out; error = xfs_inobt_get_rec(cur, &rec, &has); if (error) goto out; if (!has) { error = -EFSCORRUPTED; goto out; } /* If the record covers inodes that would be beyond EOFS, bail out. */ if (rec.ir_startino + XFS_INODES_PER_CHUNK > agino) { error = -ENOSPC; goto out; } out: xfs_btree_del_cursor(cur, error); return error; } |
1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 | // SPDX-License-Identifier: GPL-2.0 /* * Simple file system for zoned block devices exposing zones as files. * * Copyright (C) 2022 Western Digital Corporation or its affiliates. */ #include <linux/fs.h> #include <linux/seq_file.h> #include <linux/blkdev.h> #include "zonefs.h" struct zonefs_sysfs_attr { struct attribute attr; ssize_t (*show)(struct zonefs_sb_info *sbi, char *buf); }; #define ZONEFS_SYSFS_ATTR_RO(name) \ static struct zonefs_sysfs_attr zonefs_sysfs_attr_##name = __ATTR_RO(name) #define ATTR_LIST(name) &zonefs_sysfs_attr_##name.attr static ssize_t zonefs_sysfs_attr_show(struct kobject *kobj, struct attribute *attr, char *buf) { struct zonefs_sb_info *sbi = container_of(kobj, struct zonefs_sb_info, s_kobj); struct zonefs_sysfs_attr *zonefs_attr = container_of(attr, struct zonefs_sysfs_attr, attr); if (!zonefs_attr->show) return 0; return zonefs_attr->show(sbi, buf); } static ssize_t max_wro_seq_files_show(struct zonefs_sb_info *sbi, char *buf) { return sysfs_emit(buf, "%u\n", sbi->s_max_wro_seq_files); } ZONEFS_SYSFS_ATTR_RO(max_wro_seq_files); static ssize_t nr_wro_seq_files_show(struct zonefs_sb_info *sbi, char *buf) { return sysfs_emit(buf, "%d\n", atomic_read(&sbi->s_wro_seq_files)); } ZONEFS_SYSFS_ATTR_RO(nr_wro_seq_files); static ssize_t max_active_seq_files_show(struct zonefs_sb_info *sbi, char *buf) { return sysfs_emit(buf, "%u\n", sbi->s_max_active_seq_files); } ZONEFS_SYSFS_ATTR_RO(max_active_seq_files); static ssize_t nr_active_seq_files_show(struct zonefs_sb_info *sbi, char *buf) { return sysfs_emit(buf, "%d\n", atomic_read(&sbi->s_active_seq_files)); } ZONEFS_SYSFS_ATTR_RO(nr_active_seq_files); static struct attribute *zonefs_sysfs_attrs[] = { ATTR_LIST(max_wro_seq_files), ATTR_LIST(nr_wro_seq_files), ATTR_LIST(max_active_seq_files), ATTR_LIST(nr_active_seq_files), NULL, }; ATTRIBUTE_GROUPS(zonefs_sysfs); static void zonefs_sysfs_sb_release(struct kobject *kobj) { struct zonefs_sb_info *sbi = container_of(kobj, struct zonefs_sb_info, s_kobj); complete(&sbi->s_kobj_unregister); } static const struct sysfs_ops zonefs_sysfs_attr_ops = { .show = zonefs_sysfs_attr_show, }; static const struct kobj_type zonefs_sb_ktype = { .default_groups = zonefs_sysfs_groups, .sysfs_ops = &zonefs_sysfs_attr_ops, .release = zonefs_sysfs_sb_release, }; static struct kobject *zonefs_sysfs_root; int zonefs_sysfs_register(struct super_block *sb) { struct zonefs_sb_info *sbi = ZONEFS_SB(sb); int ret; init_completion(&sbi->s_kobj_unregister); ret = kobject_init_and_add(&sbi->s_kobj, &zonefs_sb_ktype, zonefs_sysfs_root, "%s", sb->s_id); if (ret) { kobject_put(&sbi->s_kobj); wait_for_completion(&sbi->s_kobj_unregister); return ret; } sbi->s_sysfs_registered = true; return 0; } void zonefs_sysfs_unregister(struct super_block *sb) { struct zonefs_sb_info *sbi = ZONEFS_SB(sb); if (!sbi || !sbi->s_sysfs_registered) return; kobject_del(&sbi->s_kobj); kobject_put(&sbi->s_kobj); wait_for_completion(&sbi->s_kobj_unregister); } int __init zonefs_sysfs_init(void) { zonefs_sysfs_root = kobject_create_and_add("zonefs", fs_kobj); if (!zonefs_sysfs_root) return -ENOMEM; return 0; } void zonefs_sysfs_exit(void) { kobject_put(zonefs_sysfs_root); zonefs_sysfs_root = NULL; } |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 | /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * Memory-to-memory device framework for Video for Linux 2. * * Helper functions for devices that use memory buffers for both source * and destination. * * Copyright (c) 2009 Samsung Electronics Co., Ltd. * Pawel Osciak, <pawel@osciak.com> * Marek Szyprowski, <m.szyprowski@samsung.com> */ #ifndef _MEDIA_V4L2_MEM2MEM_H #define _MEDIA_V4L2_MEM2MEM_H #include <media/videobuf2-v4l2.h> /** * struct v4l2_m2m_ops - mem-to-mem device driver callbacks * @device_run: required. Begin the actual job (transaction) inside this * callback. * The job does NOT have to end before this callback returns * (and it will be the usual case). When the job finishes, * v4l2_m2m_job_finish() or v4l2_m2m_buf_done_and_job_finish() * has to be called. * @job_ready: optional. Should return 0 if the driver does not have a job * fully prepared to run yet (i.e. it will not be able to finish a * transaction without sleeping). If not provided, it will be * assumed that one source and one destination buffer are all * that is required for the driver to perform one full transaction. * This method may not sleep. * @job_abort: optional. Informs the driver that it has to abort the currently * running transaction as soon as possible (i.e. as soon as it can * stop the device safely; e.g. in the next interrupt handler), * even if the transaction would not have been finished by then. * After the driver performs the necessary steps, it has to call * v4l2_m2m_job_finish() or v4l2_m2m_buf_done_and_job_finish() as * if the transaction ended normally. * This function does not have to (and will usually not) wait * until the device enters a state when it can be stopped. */ struct v4l2_m2m_ops { void (*device_run)(void *priv); int (*job_ready)(void *priv); void (*job_abort)(void *priv); }; struct video_device; struct v4l2_m2m_dev; /** * struct v4l2_m2m_queue_ctx - represents a queue for buffers ready to be * processed * * @q: pointer to struct &vb2_queue * @rdy_queue: List of V4L2 mem-to-mem queues * @rdy_spinlock: spin lock to protect the struct usage * @num_rdy: number of buffers ready to be processed * @buffered: is the queue buffered? * * Queue for buffers ready to be processed as soon as this * instance receives access to the device. */ struct v4l2_m2m_queue_ctx { struct vb2_queue q; struct list_head rdy_queue; spinlock_t rdy_spinlock; u8 num_rdy; bool buffered; }; /** * struct v4l2_m2m_ctx - Memory to memory context structure * * @q_lock: struct &mutex lock * @new_frame: valid in the device_run callback: if true, then this * starts a new frame; if false, then this is a new slice * for an existing frame. This is always true unless * V4L2_BUF_CAP_SUPPORTS_M2M_HOLD_CAPTURE_BUF is set, which * indicates slicing support. * @is_draining: indicates device is in draining phase * @last_src_buf: indicate the last source buffer for draining * @next_buf_last: next capture queud buffer will be tagged as last * @has_stopped: indicate the device has been stopped * @ignore_cap_streaming: If true, job_ready can be called even if the CAPTURE * queue is not streaming. This allows firmware to * analyze the bitstream header which arrives on the * OUTPUT queue. The driver must implement the job_ready * callback correctly to make sure that the requirements * for actual decoding are met. * @m2m_dev: opaque pointer to the internal data to handle M2M context * @cap_q_ctx: Capture (output to memory) queue context * @out_q_ctx: Output (input from memory) queue context * @queue: List of memory to memory contexts * @job_flags: Job queue flags, used internally by v4l2-mem2mem.c: * %TRANS_QUEUED, %TRANS_RUNNING and %TRANS_ABORT. * @finished: Wait queue used to signalize when a job queue finished. * @priv: Instance private data * * The memory to memory context is specific to a file handle, NOT to e.g. * a device. */ struct v4l2_m2m_ctx { /* optional cap/out vb2 queues lock */ struct mutex *q_lock; bool new_frame; bool is_draining; struct vb2_v4l2_buffer *last_src_buf; bool next_buf_last; bool has_stopped; bool ignore_cap_streaming; /* internal use only */ struct v4l2_m2m_dev *m2m_dev; struct v4l2_m2m_queue_ctx cap_q_ctx; struct v4l2_m2m_queue_ctx out_q_ctx; /* For device job queue */ struct list_head queue; unsigned long job_flags; wait_queue_head_t finished; void *priv; }; /** * struct v4l2_m2m_buffer - Memory to memory buffer * * @vb: pointer to struct &vb2_v4l2_buffer * @list: list of m2m buffers */ struct v4l2_m2m_buffer { struct vb2_v4l2_buffer vb; struct list_head list; }; /** * v4l2_m2m_get_curr_priv() - return driver private data for the currently * running instance or NULL if no instance is running * * @m2m_dev: opaque pointer to the internal data to handle M2M context */ void *v4l2_m2m_get_curr_priv(struct v4l2_m2m_dev *m2m_dev); /** * v4l2_m2m_get_vq() - return vb2_queue for the given type * * @m2m_ctx: m2m context assigned to the instance given by struct &v4l2_m2m_ctx * @type: type of the V4L2 buffer, as defined by enum &v4l2_buf_type */ struct vb2_queue *v4l2_m2m_get_vq(struct v4l2_m2m_ctx *m2m_ctx, enum v4l2_buf_type type); /** * v4l2_m2m_try_schedule() - check whether an instance is ready to be added to * the pending job queue and add it if so. * * @m2m_ctx: m2m context assigned to the instance given by struct &v4l2_m2m_ctx * * There are three basic requirements an instance has to meet to be able to run: * 1) at least one source buffer has to be queued, * 2) at least one destination buffer has to be queued, * 3) streaming has to be on. * * If a queue is buffered (for example a decoder hardware ringbuffer that has * to be drained before doing streamoff), allow scheduling without v4l2 buffers * on that queue. * * There may also be additional, custom requirements. In such case the driver * should supply a custom callback (job_ready in v4l2_m2m_ops) that should * return 1 if the instance is ready. * An example of the above could be an instance that requires more than one * src/dst buffer per transaction. */ void v4l2_m2m_try_schedule(struct v4l2_m2m_ctx *m2m_ctx); /** * v4l2_m2m_job_finish() - inform the framework that a job has been finished * and have it clean up * * @m2m_dev: opaque pointer to the internal data to handle M2M context * @m2m_ctx: m2m context assigned to the instance given by struct &v4l2_m2m_ctx * * Called by a driver to yield back the device after it has finished with it. * Should be called as soon as possible after reaching a state which allows * other instances to take control of the device. * * This function has to be called only after &v4l2_m2m_ops->device_run * callback has been called on the driver. To prevent recursion, it should * not be called directly from the &v4l2_m2m_ops->device_run callback though. */ void v4l2_m2m_job_finish(struct v4l2_m2m_dev *m2m_dev, struct v4l2_m2m_ctx *m2m_ctx); /** * v4l2_m2m_buf_done_and_job_finish() - return source/destination buffers with * state and inform the framework that a job has been finished and have it * clean up * * @m2m_dev: opaque pointer to the internal data to handle M2M context * @m2m_ctx: m2m context assigned to the instance given by struct &v4l2_m2m_ctx * @state: vb2 buffer state passed to v4l2_m2m_buf_done(). * * Drivers that set V4L2_BUF_CAP_SUPPORTS_M2M_HOLD_CAPTURE_BUF must use this * function instead of job_finish() to take held buffers into account. It is * optional for other drivers. * * This function removes the source buffer from the ready list and returns * it with the given state. The same is done for the destination buffer, unless * it is marked 'held'. In that case the buffer is kept on the ready list. * * After that the job is finished (see job_finish()). * * This allows for multiple output buffers to be used to fill in a single * capture buffer. This is typically used by stateless decoders where * multiple e.g. H.264 slices contribute to a single decoded frame. */ void v4l2_m2m_buf_done_and_job_finish(struct v4l2_m2m_dev *m2m_dev, struct v4l2_m2m_ctx *m2m_ctx, enum vb2_buffer_state state); static inline void v4l2_m2m_buf_done(struct vb2_v4l2_buffer *buf, enum vb2_buffer_state state) { vb2_buffer_done(&buf->vb2_buf, state); } /** * v4l2_m2m_clear_state() - clear encoding/decoding state * * @m2m_ctx: m2m context assigned to the instance given by struct &v4l2_m2m_ctx */ static inline void v4l2_m2m_clear_state(struct v4l2_m2m_ctx *m2m_ctx) { m2m_ctx->next_buf_last = false; m2m_ctx->is_draining = false; m2m_ctx->has_stopped = false; } /** * v4l2_m2m_mark_stopped() - set current encoding/decoding state as stopped * * @m2m_ctx: m2m context assigned to the instance given by struct &v4l2_m2m_ctx */ static inline void v4l2_m2m_mark_stopped(struct v4l2_m2m_ctx *m2m_ctx) { m2m_ctx->next_buf_last = false; m2m_ctx->is_draining = false; m2m_ctx->has_stopped = true; } /** * v4l2_m2m_dst_buf_is_last() - return the current encoding/decoding session * draining management state of next queued capture buffer * * This last capture buffer should be tagged with V4L2_BUF_FLAG_LAST to notify * the end of the capture session. * * @m2m_ctx: m2m context assigned to the instance given by struct &v4l2_m2m_ctx */ static inline bool v4l2_m2m_dst_buf_is_last(struct v4l2_m2m_ctx *m2m_ctx) { return m2m_ctx->is_draining && m2m_ctx->next_buf_last; } /** * v4l2_m2m_has_stopped() - return the current encoding/decoding session * stopped state * * @m2m_ctx: m2m context assigned to the instance given by struct &v4l2_m2m_ctx */ static inline bool v4l2_m2m_has_stopped(struct v4l2_m2m_ctx *m2m_ctx) { return m2m_ctx->has_stopped; } /** * v4l2_m2m_is_last_draining_src_buf() - return the output buffer draining * state in the current encoding/decoding session * * This will identify the last output buffer queued before a session stop * was required, leading to an actual encoding/decoding session stop state * in the encoding/decoding process after being processed. * * @m2m_ctx: m2m context assigned to the instance given by struct &v4l2_m2m_ctx * @vbuf: pointer to struct &v4l2_buffer */ static inline bool v4l2_m2m_is_last_draining_src_buf(struct v4l2_m2m_ctx *m2m_ctx, struct vb2_v4l2_buffer *vbuf) { return m2m_ctx->is_draining && vbuf == m2m_ctx->last_src_buf; } /** * v4l2_m2m_last_buffer_done() - marks the buffer with LAST flag and DONE * * @m2m_ctx: m2m context assigned to the instance given by struct &v4l2_m2m_ctx * @vbuf: pointer to struct &v4l2_buffer */ void v4l2_m2m_last_buffer_done(struct v4l2_m2m_ctx *m2m_ctx, struct vb2_v4l2_buffer *vbuf); /** * v4l2_m2m_suspend() - stop new jobs from being run and wait for current job * to finish * * @m2m_dev: opaque pointer to the internal data to handle M2M context * * Called by a driver in the suspend hook. Stop new jobs from being run, and * wait for current running job to finish. */ void v4l2_m2m_suspend(struct v4l2_m2m_dev *m2m_dev); /** * v4l2_m2m_resume() - resume job running and try to run a queued job * * @m2m_dev: opaque pointer to the internal data to handle M2M context * * Called by a driver in the resume hook. This reverts the operation of * v4l2_m2m_suspend() and allows job to be run. Also try to run a queued job if * there is any. */ void v4l2_m2m_resume(struct v4l2_m2m_dev *m2m_dev); /** * v4l2_m2m_reqbufs() - multi-queue-aware REQBUFS multiplexer * * @file: pointer to struct &file * @m2m_ctx: m2m context assigned to the instance given by struct &v4l2_m2m_ctx * @reqbufs: pointer to struct &v4l2_requestbuffers */ int v4l2_m2m_reqbufs(struct file *file, struct v4l2_m2m_ctx *m2m_ctx, struct v4l2_requestbuffers *reqbufs); /** * v4l2_m2m_querybuf() - multi-queue-aware QUERYBUF multiplexer * * @file: pointer to struct &file * @m2m_ctx: m2m context assigned to the instance given by struct &v4l2_m2m_ctx * @buf: pointer to struct &v4l2_buffer * * See v4l2_m2m_mmap() documentation for details. */ int v4l2_m2m_querybuf(struct file *file, struct v4l2_m2m_ctx *m2m_ctx, struct v4l2_buffer *buf); /** * v4l2_m2m_qbuf() - enqueue a source or destination buffer, depending on * the type * * @file: pointer to struct &file * @m2m_ctx: m2m context assigned to the instance given by struct &v4l2_m2m_ctx * @buf: pointer to struct &v4l2_buffer */ int v4l2_m2m_qbuf(struct file *file, struct v4l2_m2m_ctx *m2m_ctx, struct v4l2_buffer *buf); /** * v4l2_m2m_dqbuf() - dequeue a source or destination buffer, depending on * the type * * @file: pointer to struct &file * @m2m_ctx: m2m context assigned to the instance given by struct &v4l2_m2m_ctx * @buf: pointer to struct &v4l2_buffer */ int v4l2_m2m_dqbuf(struct file *file, struct v4l2_m2m_ctx *m2m_ctx, struct v4l2_buffer *buf); /** * v4l2_m2m_prepare_buf() - prepare a source or destination buffer, depending on * the type * * @file: pointer to struct &file * @m2m_ctx: m2m context assigned to the instance given by struct &v4l2_m2m_ctx * @buf: pointer to struct &v4l2_buffer */ int v4l2_m2m_prepare_buf(struct file *file, struct v4l2_m2m_ctx *m2m_ctx, struct v4l2_buffer *buf); /** * v4l2_m2m_create_bufs() - create a source or destination buffer, depending * on the type * * @file: pointer to struct &file * @m2m_ctx: m2m context assigned to the instance given by struct &v4l2_m2m_ctx * @create: pointer to struct &v4l2_create_buffers */ int v4l2_m2m_create_bufs(struct file *file, struct v4l2_m2m_ctx *m2m_ctx, struct v4l2_create_buffers *create); /** * v4l2_m2m_expbuf() - export a source or destination buffer, depending on * the type * * @file: pointer to struct &file * @m2m_ctx: m2m context assigned to the instance given by struct &v4l2_m2m_ctx * @eb: pointer to struct &v4l2_exportbuffer */ int v4l2_m2m_expbuf(struct file *file, struct v4l2_m2m_ctx *m2m_ctx, struct v4l2_exportbuffer *eb); /** * v4l2_m2m_streamon() - turn on streaming for a video queue * * @file: pointer to struct &file * @m2m_ctx: m2m context assigned to the instance given by struct &v4l2_m2m_ctx * @type: type of the V4L2 buffer, as defined by enum &v4l2_buf_type */ int v4l2_m2m_streamon(struct file *file, struct v4l2_m2m_ctx *m2m_ctx, enum v4l2_buf_type type); /** * v4l2_m2m_streamoff() - turn off streaming for a video queue * * @file: pointer to struct &file * @m2m_ctx: m2m context assigned to the instance given by struct &v4l2_m2m_ctx * @type: type of the V4L2 buffer, as defined by enum &v4l2_buf_type */ int v4l2_m2m_streamoff(struct file *file, struct v4l2_m2m_ctx *m2m_ctx, enum v4l2_buf_type type); /** * v4l2_m2m_update_start_streaming_state() - update the encoding/decoding * session state when a start of streaming of a video queue is requested * * @m2m_ctx: m2m context assigned to the instance given by struct &v4l2_m2m_ctx * @q: queue */ void v4l2_m2m_update_start_streaming_state(struct v4l2_m2m_ctx *m2m_ctx, struct vb2_queue *q); /** * v4l2_m2m_update_stop_streaming_state() - update the encoding/decoding * session state when a stop of streaming of a video queue is requested * * @m2m_ctx: m2m context assigned to the instance given by struct &v4l2_m2m_ctx * @q: queue */ void v4l2_m2m_update_stop_streaming_state(struct v4l2_m2m_ctx *m2m_ctx, struct vb2_queue *q); /** * v4l2_m2m_encoder_cmd() - execute an encoder command * * @file: pointer to struct &file * @m2m_ctx: m2m context assigned to the instance given by struct &v4l2_m2m_ctx * @ec: pointer to the encoder command */ int v4l2_m2m_encoder_cmd(struct file *file, struct v4l2_m2m_ctx *m2m_ctx, struct v4l2_encoder_cmd *ec); /** * v4l2_m2m_decoder_cmd() - execute a decoder command * * @file: pointer to struct &file * @m2m_ctx: m2m context assigned to the instance given by struct &v4l2_m2m_ctx * @dc: pointer to the decoder command */ int v4l2_m2m_decoder_cmd(struct file *file, struct v4l2_m2m_ctx *m2m_ctx, struct v4l2_decoder_cmd *dc); /** * v4l2_m2m_poll() - poll replacement, for destination buffers only * * @file: pointer to struct &file * @m2m_ctx: m2m context assigned to the instance given by struct &v4l2_m2m_ctx * @wait: pointer to struct &poll_table_struct * * Call from the driver's poll() function. Will poll both queues. If a buffer * is available to dequeue (with dqbuf) from the source queue, this will * indicate that a non-blocking write can be performed, while read will be * returned in case of the destination queue. */ __poll_t v4l2_m2m_poll(struct file *file, struct v4l2_m2m_ctx *m2m_ctx, struct poll_table_struct *wait); /** * v4l2_m2m_mmap() - source and destination queues-aware mmap multiplexer * * @file: pointer to struct &file * @m2m_ctx: m2m context assigned to the instance given by struct &v4l2_m2m_ctx * @vma: pointer to struct &vm_area_struct * * Call from driver's mmap() function. Will handle mmap() for both queues * seamlessly for the video buffer, which will receive normal per-queue offsets * and proper vb2 queue pointers. The differentiation is made outside * vb2 by adding a predefined offset to buffers from one of the queues * and subtracting it before passing it back to vb2. Only drivers (and * thus applications) receive modified offsets. */ int v4l2_m2m_mmap(struct file *file, struct v4l2_m2m_ctx *m2m_ctx, struct vm_area_struct *vma); #ifndef CONFIG_MMU unsigned long v4l2_m2m_get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags); #endif /** * v4l2_m2m_init() - initialize per-driver m2m data * * @m2m_ops: pointer to struct v4l2_m2m_ops * * Usually called from driver's ``probe()`` function. * * Return: returns an opaque pointer to the internal data to handle M2M context */ struct v4l2_m2m_dev *v4l2_m2m_init(const struct v4l2_m2m_ops *m2m_ops); #if defined(CONFIG_MEDIA_CONTROLLER) void v4l2_m2m_unregister_media_controller(struct v4l2_m2m_dev *m2m_dev); int v4l2_m2m_register_media_controller(struct v4l2_m2m_dev *m2m_dev, struct video_device *vdev, int function); #else static inline void v4l2_m2m_unregister_media_controller(struct v4l2_m2m_dev *m2m_dev) { } static inline int v4l2_m2m_register_media_controller(struct v4l2_m2m_dev *m2m_dev, struct video_device *vdev, int function) { return 0; } #endif /** * v4l2_m2m_release() - cleans up and frees a m2m_dev structure * * @m2m_dev: opaque pointer to the internal data to handle M2M context * * Usually called from driver's ``remove()`` function. */ void v4l2_m2m_release(struct v4l2_m2m_dev *m2m_dev); /** * v4l2_m2m_ctx_init() - allocate and initialize a m2m context * * @m2m_dev: opaque pointer to the internal data to handle M2M context * @drv_priv: driver's instance private data * @queue_init: a callback for queue type-specific initialization function * to be used for initializing vb2_queues * * Usually called from driver's ``open()`` function. */ struct v4l2_m2m_ctx *v4l2_m2m_ctx_init(struct v4l2_m2m_dev *m2m_dev, void *drv_priv, int (*queue_init)(void *priv, struct vb2_queue *src_vq, struct vb2_queue *dst_vq)); static inline void v4l2_m2m_set_src_buffered(struct v4l2_m2m_ctx *m2m_ctx, bool buffered) { m2m_ctx->out_q_ctx.buffered = buffered; } static inline void v4l2_m2m_set_dst_buffered(struct v4l2_m2m_ctx *m2m_ctx, bool buffered) { m2m_ctx->cap_q_ctx.buffered = buffered; } /** * v4l2_m2m_ctx_release() - release m2m context * * @m2m_ctx: m2m context assigned to the instance given by struct &v4l2_m2m_ctx * * Usually called from driver's release() function. */ void v4l2_m2m_ctx_release(struct v4l2_m2m_ctx *m2m_ctx); /** * v4l2_m2m_buf_queue() - add a buffer to the proper ready buffers list. * * @m2m_ctx: m2m context assigned to the instance given by struct &v4l2_m2m_ctx * @vbuf: pointer to struct &vb2_v4l2_buffer * * Call from vb2_queue_ops->ops->buf_queue, vb2_queue_ops callback. */ void v4l2_m2m_buf_queue(struct v4l2_m2m_ctx *m2m_ctx, struct vb2_v4l2_buffer *vbuf); /** * v4l2_m2m_num_src_bufs_ready() - return the number of source buffers ready for * use * * @m2m_ctx: m2m context assigned to the instance given by struct &v4l2_m2m_ctx */ static inline unsigned int v4l2_m2m_num_src_bufs_ready(struct v4l2_m2m_ctx *m2m_ctx) { unsigned int num_buf_rdy; unsigned long flags; spin_lock_irqsave(&m2m_ctx->out_q_ctx.rdy_spinlock, flags); num_buf_rdy = m2m_ctx->out_q_ctx.num_rdy; spin_unlock_irqrestore(&m2m_ctx->out_q_ctx.rdy_spinlock, flags); return num_buf_rdy; } /** * v4l2_m2m_num_dst_bufs_ready() - return the number of destination buffers * ready for use * * @m2m_ctx: m2m context assigned to the instance given by struct &v4l2_m2m_ctx */ static inline unsigned int v4l2_m2m_num_dst_bufs_ready(struct v4l2_m2m_ctx *m2m_ctx) { unsigned int num_buf_rdy; unsigned long flags; spin_lock_irqsave(&m2m_ctx->cap_q_ctx.rdy_spinlock, flags); num_buf_rdy = m2m_ctx->cap_q_ctx.num_rdy; spin_unlock_irqrestore(&m2m_ctx->cap_q_ctx.rdy_spinlock, flags); return num_buf_rdy; } /** * v4l2_m2m_next_buf() - return next buffer from the list of ready buffers * * @q_ctx: pointer to struct @v4l2_m2m_queue_ctx */ struct vb2_v4l2_buffer *v4l2_m2m_next_buf(struct v4l2_m2m_queue_ctx *q_ctx); /** * v4l2_m2m_next_src_buf() - return next source buffer from the list of ready * buffers * * @m2m_ctx: m2m context assigned to the instance given by struct &v4l2_m2m_ctx */ static inline struct vb2_v4l2_buffer * v4l2_m2m_next_src_buf(struct v4l2_m2m_ctx *m2m_ctx) { return v4l2_m2m_next_buf(&m2m_ctx->out_q_ctx); } /** * v4l2_m2m_next_dst_buf() - return next destination buffer from the list of * ready buffers * * @m2m_ctx: m2m context assigned to the instance given by struct &v4l2_m2m_ctx */ static inline struct vb2_v4l2_buffer * v4l2_m2m_next_dst_buf(struct v4l2_m2m_ctx *m2m_ctx) { return v4l2_m2m_next_buf(&m2m_ctx->cap_q_ctx); } /** * v4l2_m2m_last_buf() - return last buffer from the list of ready buffers * * @q_ctx: pointer to struct @v4l2_m2m_queue_ctx */ struct vb2_v4l2_buffer *v4l2_m2m_last_buf(struct v4l2_m2m_queue_ctx *q_ctx); /** * v4l2_m2m_last_src_buf() - return last source buffer from the list of * ready buffers * * @m2m_ctx: m2m context assigned to the instance given by struct &v4l2_m2m_ctx */ static inline struct vb2_v4l2_buffer * v4l2_m2m_last_src_buf(struct v4l2_m2m_ctx *m2m_ctx) { return v4l2_m2m_last_buf(&m2m_ctx->out_q_ctx); } /** * v4l2_m2m_last_dst_buf() - return last destination buffer from the list of * ready buffers * * @m2m_ctx: m2m context assigned to the instance given by struct &v4l2_m2m_ctx */ static inline struct vb2_v4l2_buffer * v4l2_m2m_last_dst_buf(struct v4l2_m2m_ctx *m2m_ctx) { return v4l2_m2m_last_buf(&m2m_ctx->cap_q_ctx); } /** * v4l2_m2m_for_each_dst_buf() - iterate over a list of destination ready * buffers * * @m2m_ctx: m2m context assigned to the instance given by struct &v4l2_m2m_ctx * @b: current buffer of type struct v4l2_m2m_buffer */ #define v4l2_m2m_for_each_dst_buf(m2m_ctx, b) \ list_for_each_entry(b, &m2m_ctx->cap_q_ctx.rdy_queue, list) /** * v4l2_m2m_for_each_src_buf() - iterate over a list of source ready buffers * * @m2m_ctx: m2m context assigned to the instance given by struct &v4l2_m2m_ctx * @b: current buffer of type struct v4l2_m2m_buffer */ #define v4l2_m2m_for_each_src_buf(m2m_ctx, b) \ list_for_each_entry(b, &m2m_ctx->out_q_ctx.rdy_queue, list) /** * v4l2_m2m_for_each_dst_buf_safe() - iterate over a list of destination ready * buffers safely * * @m2m_ctx: m2m context assigned to the instance given by struct &v4l2_m2m_ctx * @b: current buffer of type struct v4l2_m2m_buffer * @n: used as temporary storage */ #define v4l2_m2m_for_each_dst_buf_safe(m2m_ctx, b, n) \ list_for_each_entry_safe(b, n, &m2m_ctx->cap_q_ctx.rdy_queue, list) /** * v4l2_m2m_for_each_src_buf_safe() - iterate over a list of source ready * buffers safely * * @m2m_ctx: m2m context assigned to the instance given by struct &v4l2_m2m_ctx * @b: current buffer of type struct v4l2_m2m_buffer * @n: used as temporary storage */ #define v4l2_m2m_for_each_src_buf_safe(m2m_ctx, b, n) \ list_for_each_entry_safe(b, n, &m2m_ctx->out_q_ctx.rdy_queue, list) /** * v4l2_m2m_get_src_vq() - return vb2_queue for source buffers * * @m2m_ctx: m2m context assigned to the instance given by struct &v4l2_m2m_ctx */ static inline struct vb2_queue *v4l2_m2m_get_src_vq(struct v4l2_m2m_ctx *m2m_ctx) { return &m2m_ctx->out_q_ctx.q; } /** * v4l2_m2m_get_dst_vq() - return vb2_queue for destination buffers * * @m2m_ctx: m2m context assigned to the instance given by struct &v4l2_m2m_ctx */ static inline struct vb2_queue *v4l2_m2m_get_dst_vq(struct v4l2_m2m_ctx *m2m_ctx) { return &m2m_ctx->cap_q_ctx.q; } /** * v4l2_m2m_buf_remove() - take off a buffer from the list of ready buffers and * return it * * @q_ctx: pointer to struct @v4l2_m2m_queue_ctx */ struct vb2_v4l2_buffer *v4l2_m2m_buf_remove(struct v4l2_m2m_queue_ctx *q_ctx); /** * v4l2_m2m_src_buf_remove() - take off a source buffer from the list of ready * buffers and return it * * @m2m_ctx: m2m context assigned to the instance given by struct &v4l2_m2m_ctx */ static inline struct vb2_v4l2_buffer * v4l2_m2m_src_buf_remove(struct v4l2_m2m_ctx *m2m_ctx) { return v4l2_m2m_buf_remove(&m2m_ctx->out_q_ctx); } /** * v4l2_m2m_dst_buf_remove() - take off a destination buffer from the list of * ready buffers and return it * * @m2m_ctx: m2m context assigned to the instance given by struct &v4l2_m2m_ctx */ static inline struct vb2_v4l2_buffer * v4l2_m2m_dst_buf_remove(struct v4l2_m2m_ctx *m2m_ctx) { return v4l2_m2m_buf_remove(&m2m_ctx->cap_q_ctx); } /** * v4l2_m2m_buf_remove_by_buf() - take off exact buffer from the list of ready * buffers * * @q_ctx: pointer to struct @v4l2_m2m_queue_ctx * @vbuf: the buffer to be removed */ void v4l2_m2m_buf_remove_by_buf(struct v4l2_m2m_queue_ctx *q_ctx, struct vb2_v4l2_buffer *vbuf); /** * v4l2_m2m_src_buf_remove_by_buf() - take off exact source buffer from the list * of ready buffers * * @m2m_ctx: m2m context assigned to the instance given by struct &v4l2_m2m_ctx * @vbuf: the buffer to be removed */ static inline void v4l2_m2m_src_buf_remove_by_buf(struct v4l2_m2m_ctx *m2m_ctx, struct vb2_v4l2_buffer *vbuf) { v4l2_m2m_buf_remove_by_buf(&m2m_ctx->out_q_ctx, vbuf); } /** * v4l2_m2m_dst_buf_remove_by_buf() - take off exact destination buffer from the * list of ready buffers * * @m2m_ctx: m2m context assigned to the instance given by struct &v4l2_m2m_ctx * @vbuf: the buffer to be removed */ static inline void v4l2_m2m_dst_buf_remove_by_buf(struct v4l2_m2m_ctx *m2m_ctx, struct vb2_v4l2_buffer *vbuf) { v4l2_m2m_buf_remove_by_buf(&m2m_ctx->cap_q_ctx, vbuf); } struct vb2_v4l2_buffer * v4l2_m2m_buf_remove_by_idx(struct v4l2_m2m_queue_ctx *q_ctx, unsigned int idx); static inline struct vb2_v4l2_buffer * v4l2_m2m_src_buf_remove_by_idx(struct v4l2_m2m_ctx *m2m_ctx, unsigned int idx) { return v4l2_m2m_buf_remove_by_idx(&m2m_ctx->out_q_ctx, idx); } static inline struct vb2_v4l2_buffer * v4l2_m2m_dst_buf_remove_by_idx(struct v4l2_m2m_ctx *m2m_ctx, unsigned int idx) { return v4l2_m2m_buf_remove_by_idx(&m2m_ctx->cap_q_ctx, idx); } /** * v4l2_m2m_buf_copy_metadata() - copy buffer metadata from * the output buffer to the capture buffer * * @out_vb: the output buffer that is the source of the metadata. * @cap_vb: the capture buffer that will receive the metadata. * @copy_frame_flags: copy the KEY/B/PFRAME flags as well. * * This helper function copies the timestamp, timecode (if the TIMECODE * buffer flag was set), field and the TIMECODE, KEYFRAME, BFRAME, PFRAME * and TSTAMP_SRC_MASK flags from @out_vb to @cap_vb. * * If @copy_frame_flags is false, then the KEYFRAME, BFRAME and PFRAME * flags are not copied. This is typically needed for encoders that * set this bits explicitly. */ void v4l2_m2m_buf_copy_metadata(const struct vb2_v4l2_buffer *out_vb, struct vb2_v4l2_buffer *cap_vb, bool copy_frame_flags); /* v4l2 request helper */ void v4l2_m2m_request_queue(struct media_request *req); /* v4l2 ioctl helpers */ int v4l2_m2m_ioctl_reqbufs(struct file *file, void *priv, struct v4l2_requestbuffers *rb); int v4l2_m2m_ioctl_create_bufs(struct file *file, void *fh, struct v4l2_create_buffers *create); int v4l2_m2m_ioctl_querybuf(struct file *file, void *fh, struct v4l2_buffer *buf); int v4l2_m2m_ioctl_expbuf(struct file *file, void *fh, struct v4l2_exportbuffer *eb); int v4l2_m2m_ioctl_qbuf(struct file *file, void *fh, struct v4l2_buffer *buf); int v4l2_m2m_ioctl_dqbuf(struct file *file, void *fh, struct v4l2_buffer *buf); int v4l2_m2m_ioctl_prepare_buf(struct file *file, void *fh, struct v4l2_buffer *buf); int v4l2_m2m_ioctl_streamon(struct file *file, void *fh, enum v4l2_buf_type type); int v4l2_m2m_ioctl_streamoff(struct file *file, void *fh, enum v4l2_buf_type type); int v4l2_m2m_ioctl_encoder_cmd(struct file *file, void *fh, struct v4l2_encoder_cmd *ec); int v4l2_m2m_ioctl_decoder_cmd(struct file *file, void *fh, struct v4l2_decoder_cmd *dc); int v4l2_m2m_ioctl_try_encoder_cmd(struct file *file, void *fh, struct v4l2_encoder_cmd *ec); int v4l2_m2m_ioctl_try_decoder_cmd(struct file *file, void *fh, struct v4l2_decoder_cmd *dc); int v4l2_m2m_ioctl_stateless_try_decoder_cmd(struct file *file, void *fh, struct v4l2_decoder_cmd *dc); int v4l2_m2m_ioctl_stateless_decoder_cmd(struct file *file, void *priv, struct v4l2_decoder_cmd *dc); int v4l2_m2m_fop_mmap(struct file *file, struct vm_area_struct *vma); __poll_t v4l2_m2m_fop_poll(struct file *file, poll_table *wait); #endif /* _MEDIA_V4L2_MEM2MEM_H */ |
48 48 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 | // SPDX-License-Identifier: GPL-2.0+ #include <linux/kernel.h> #include <linux/minmax.h> #include <drm/drm_blend.h> #include <drm/drm_rect.h> #include <drm/drm_fixed.h> #include "vkms_formats.h" static size_t pixel_offset(const struct vkms_frame_info *frame_info, int x, int y) { return frame_info->offset + (y * frame_info->pitch) + (x * frame_info->cpp); } /* * packed_pixels_addr - Get the pointer to pixel of a given pair of coordinates * * @frame_info: Buffer metadata * @x: The x(width) coordinate of the 2D buffer * @y: The y(Heigth) coordinate of the 2D buffer * * Takes the information stored in the frame_info, a pair of coordinates, and * returns the address of the first color channel. * This function assumes the channels are packed together, i.e. a color channel * comes immediately after another in the memory. And therefore, this function * doesn't work for YUV with chroma subsampling (e.g. YUV420 and NV21). */ static void *packed_pixels_addr(const struct vkms_frame_info *frame_info, int x, int y) { size_t offset = pixel_offset(frame_info, x, y); return (u8 *)frame_info->map[0].vaddr + offset; } static void *get_packed_src_addr(const struct vkms_frame_info *frame_info, int y) { int x_src = frame_info->src.x1 >> 16; int y_src = y - frame_info->rotated.y1 + (frame_info->src.y1 >> 16); return packed_pixels_addr(frame_info, x_src, y_src); } static int get_x_position(const struct vkms_frame_info *frame_info, int limit, int x) { if (frame_info->rotation & (DRM_MODE_REFLECT_X | DRM_MODE_ROTATE_270)) return limit - x - 1; return x; } static void ARGB8888_to_argb_u16(u8 *src_pixels, struct pixel_argb_u16 *out_pixel) { /* * The 257 is the "conversion ratio". This number is obtained by the * (2^16 - 1) / (2^8 - 1) division. Which, in this case, tries to get * the best color value in a pixel format with more possibilities. * A similar idea applies to others RGB color conversions. */ out_pixel->a = (u16)src_pixels[3] * 257; out_pixel->r = (u16)src_pixels[2] * 257; out_pixel->g = (u16)src_pixels[1] * 257; out_pixel->b = (u16)src_pixels[0] * 257; } static void XRGB8888_to_argb_u16(u8 *src_pixels, struct pixel_argb_u16 *out_pixel) { out_pixel->a = (u16)0xffff; out_pixel->r = (u16)src_pixels[2] * 257; out_pixel->g = (u16)src_pixels[1] * 257; out_pixel->b = (u16)src_pixels[0] * 257; } static void ARGB16161616_to_argb_u16(u8 *src_pixels, struct pixel_argb_u16 *out_pixel) { u16 *pixels = (u16 *)src_pixels; out_pixel->a = le16_to_cpu(pixels[3]); out_pixel->r = le16_to_cpu(pixels[2]); out_pixel->g = le16_to_cpu(pixels[1]); out_pixel->b = le16_to_cpu(pixels[0]); } static void XRGB16161616_to_argb_u16(u8 *src_pixels, struct pixel_argb_u16 *out_pixel) { u16 *pixels = (u16 *)src_pixels; out_pixel->a = (u16)0xffff; out_pixel->r = le16_to_cpu(pixels[2]); out_pixel->g = le16_to_cpu(pixels[1]); out_pixel->b = le16_to_cpu(pixels[0]); } static void RGB565_to_argb_u16(u8 *src_pixels, struct pixel_argb_u16 *out_pixel) { u16 *pixels = (u16 *)src_pixels; s64 fp_rb_ratio = drm_fixp_div(drm_int2fixp(65535), drm_int2fixp(31)); s64 fp_g_ratio = drm_fixp_div(drm_int2fixp(65535), drm_int2fixp(63)); u16 rgb_565 = le16_to_cpu(*pixels); s64 fp_r = drm_int2fixp((rgb_565 >> 11) & 0x1f); s64 fp_g = drm_int2fixp((rgb_565 >> 5) & 0x3f); s64 fp_b = drm_int2fixp(rgb_565 & 0x1f); out_pixel->a = (u16)0xffff; out_pixel->r = drm_fixp2int_round(drm_fixp_mul(fp_r, fp_rb_ratio)); out_pixel->g = drm_fixp2int_round(drm_fixp_mul(fp_g, fp_g_ratio)); out_pixel->b = drm_fixp2int_round(drm_fixp_mul(fp_b, fp_rb_ratio)); } /** * vkms_compose_row - compose a single row of a plane * @stage_buffer: output line with the composed pixels * @plane: state of the plane that is being composed * @y: y coordinate of the row * * This function composes a single row of a plane. It gets the source pixels * through the y coordinate (see get_packed_src_addr()) and goes linearly * through the source pixel, reading the pixels and converting it to * ARGB16161616 (see the pixel_read() callback). For rotate-90 and rotate-270, * the source pixels are not traversed linearly. The source pixels are queried * on each iteration in order to traverse the pixels vertically. */ void vkms_compose_row(struct line_buffer *stage_buffer, struct vkms_plane_state *plane, int y) { struct pixel_argb_u16 *out_pixels = stage_buffer->pixels; struct vkms_frame_info *frame_info = plane->frame_info; u8 *src_pixels = get_packed_src_addr(frame_info, y); int limit = min_t(size_t, drm_rect_width(&frame_info->dst), stage_buffer->n_pixels); for (size_t x = 0; x < limit; x++, src_pixels += frame_info->cpp) { int x_pos = get_x_position(frame_info, limit, x); if (drm_rotation_90_or_270(frame_info->rotation)) src_pixels = get_packed_src_addr(frame_info, x + frame_info->rotated.y1) + frame_info->cpp * y; plane->pixel_read(src_pixels, &out_pixels[x_pos]); } } /* * The following functions take an line of argb_u16 pixels from the * src_buffer, convert them to a specific format, and store them in the * destination. * * They are used in the `compose_active_planes` to convert and store a line * from the src_buffer to the writeback buffer. */ static void argb_u16_to_ARGB8888(u8 *dst_pixels, struct pixel_argb_u16 *in_pixel) { /* * This sequence below is important because the format's byte order is * in little-endian. In the case of the ARGB8888 the memory is * organized this way: * * | Addr | = blue channel * | Addr + 1 | = green channel * | Addr + 2 | = Red channel * | Addr + 3 | = Alpha channel */ dst_pixels[3] = DIV_ROUND_CLOSEST(in_pixel->a, 257); dst_pixels[2] = DIV_ROUND_CLOSEST(in_pixel->r, 257); dst_pixels[1] = DIV_ROUND_CLOSEST(in_pixel->g, 257); dst_pixels[0] = DIV_ROUND_CLOSEST(in_pixel->b, 257); } static void argb_u16_to_XRGB8888(u8 *dst_pixels, struct pixel_argb_u16 *in_pixel) { dst_pixels[3] = 0xff; dst_pixels[2] = DIV_ROUND_CLOSEST(in_pixel->r, 257); dst_pixels[1] = DIV_ROUND_CLOSEST(in_pixel->g, 257); dst_pixels[0] = DIV_ROUND_CLOSEST(in_pixel->b, 257); } static void argb_u16_to_ARGB16161616(u8 *dst_pixels, struct pixel_argb_u16 *in_pixel) { u16 *pixels = (u16 *)dst_pixels; pixels[3] = cpu_to_le16(in_pixel->a); pixels[2] = cpu_to_le16(in_pixel->r); pixels[1] = cpu_to_le16(in_pixel->g); pixels[0] = cpu_to_le16(in_pixel->b); } static void argb_u16_to_XRGB16161616(u8 *dst_pixels, struct pixel_argb_u16 *in_pixel) { u16 *pixels = (u16 *)dst_pixels; pixels[3] = 0xffff; pixels[2] = cpu_to_le16(in_pixel->r); pixels[1] = cpu_to_le16(in_pixel->g); pixels[0] = cpu_to_le16(in_pixel->b); } static void argb_u16_to_RGB565(u8 *dst_pixels, struct pixel_argb_u16 *in_pixel) { u16 *pixels = (u16 *)dst_pixels; s64 fp_rb_ratio = drm_fixp_div(drm_int2fixp(65535), drm_int2fixp(31)); s64 fp_g_ratio = drm_fixp_div(drm_int2fixp(65535), drm_int2fixp(63)); s64 fp_r = drm_int2fixp(in_pixel->r); s64 fp_g = drm_int2fixp(in_pixel->g); s64 fp_b = drm_int2fixp(in_pixel->b); u16 r = drm_fixp2int(drm_fixp_div(fp_r, fp_rb_ratio)); u16 g = drm_fixp2int(drm_fixp_div(fp_g, fp_g_ratio)); u16 b = drm_fixp2int(drm_fixp_div(fp_b, fp_rb_ratio)); *pixels = cpu_to_le16(r << 11 | g << 5 | b); } void vkms_writeback_row(struct vkms_writeback_job *wb, const struct line_buffer *src_buffer, int y) { struct vkms_frame_info *frame_info = &wb->wb_frame_info; int x_dst = frame_info->dst.x1; u8 *dst_pixels = packed_pixels_addr(frame_info, x_dst, y); struct pixel_argb_u16 *in_pixels = src_buffer->pixels; int x_limit = min_t(size_t, drm_rect_width(&frame_info->dst), src_buffer->n_pixels); for (size_t x = 0; x < x_limit; x++, dst_pixels += frame_info->cpp) wb->pixel_write(dst_pixels, &in_pixels[x]); } void *get_pixel_conversion_function(u32 format) { switch (format) { case DRM_FORMAT_ARGB8888: return &ARGB8888_to_argb_u16; case DRM_FORMAT_XRGB8888: return &XRGB8888_to_argb_u16; case DRM_FORMAT_ARGB16161616: return &ARGB16161616_to_argb_u16; case DRM_FORMAT_XRGB16161616: return &XRGB16161616_to_argb_u16; case DRM_FORMAT_RGB565: return &RGB565_to_argb_u16; default: return NULL; } } void *get_pixel_write_function(u32 format) { switch (format) { case DRM_FORMAT_ARGB8888: return &argb_u16_to_ARGB8888; case DRM_FORMAT_XRGB8888: return &argb_u16_to_XRGB8888; case DRM_FORMAT_ARGB16161616: return &argb_u16_to_ARGB16161616; case DRM_FORMAT_XRGB16161616: return &argb_u16_to_XRGB16161616; case DRM_FORMAT_RGB565: return &argb_u16_to_RGB565; default: return NULL; } } |
2198 2202 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 | // SPDX-License-Identifier: GPL-2.0 /* * Device physical location support * * Author: Won Chung <wonchung@google.com> */ #include <linux/acpi.h> #include <linux/sysfs.h> #include "physical_location.h" bool dev_add_physical_location(struct device *dev) { struct acpi_pld_info *pld; acpi_status status; if (!has_acpi_companion(dev)) return false; status = acpi_get_physical_device_location(ACPI_HANDLE(dev), &pld); if (ACPI_FAILURE(status)) return false; dev->physical_location = kzalloc(sizeof(*dev->physical_location), GFP_KERNEL); if (!dev->physical_location) { ACPI_FREE(pld); return false; } dev->physical_location->panel = pld->panel; dev->physical_location->vertical_position = pld->vertical_position; dev->physical_location->horizontal_position = pld->horizontal_position; dev->physical_location->dock = pld->dock; dev->physical_location->lid = pld->lid; ACPI_FREE(pld); return true; } static ssize_t panel_show(struct device *dev, struct device_attribute *attr, char *buf) { const char *panel; switch (dev->physical_location->panel) { case DEVICE_PANEL_TOP: panel = "top"; break; case DEVICE_PANEL_BOTTOM: panel = "bottom"; break; case DEVICE_PANEL_LEFT: panel = "left"; break; case DEVICE_PANEL_RIGHT: panel = "right"; break; case DEVICE_PANEL_FRONT: panel = "front"; break; case DEVICE_PANEL_BACK: panel = "back"; break; default: panel = "unknown"; } return sysfs_emit(buf, "%s\n", panel); } static DEVICE_ATTR_RO(panel); static ssize_t vertical_position_show(struct device *dev, struct device_attribute *attr, char *buf) { const char *vertical_position; switch (dev->physical_location->vertical_position) { case DEVICE_VERT_POS_UPPER: vertical_position = "upper"; break; case DEVICE_VERT_POS_CENTER: vertical_position = "center"; break; case DEVICE_VERT_POS_LOWER: vertical_position = "lower"; break; default: vertical_position = "unknown"; } return sysfs_emit(buf, "%s\n", vertical_position); } static DEVICE_ATTR_RO(vertical_position); static ssize_t horizontal_position_show(struct device *dev, struct device_attribute *attr, char *buf) { const char *horizontal_position; switch (dev->physical_location->horizontal_position) { case DEVICE_HORI_POS_LEFT: horizontal_position = "left"; break; case DEVICE_HORI_POS_CENTER: horizontal_position = "center"; break; case DEVICE_HORI_POS_RIGHT: horizontal_position = "right"; break; default: horizontal_position = "unknown"; } return sysfs_emit(buf, "%s\n", horizontal_position); } static DEVICE_ATTR_RO(horizontal_position); static ssize_t dock_show(struct device *dev, struct device_attribute *attr, char *buf) { return sysfs_emit(buf, "%s\n", dev->physical_location->dock ? "yes" : "no"); } static DEVICE_ATTR_RO(dock); static ssize_t lid_show(struct device *dev, struct device_attribute *attr, char *buf) { return sysfs_emit(buf, "%s\n", dev->physical_location->lid ? "yes" : "no"); } static DEVICE_ATTR_RO(lid); static struct attribute *dev_attr_physical_location[] = { &dev_attr_panel.attr, &dev_attr_vertical_position.attr, &dev_attr_horizontal_position.attr, &dev_attr_dock.attr, &dev_attr_lid.attr, NULL, }; const struct attribute_group dev_attr_physical_location_group = { .name = "physical_location", .attrs = dev_attr_physical_location, }; |
18 6 18 9829 4566 4566 8 208 200 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 | // SPDX-License-Identifier: GPL-2.0 #include <linux/err.h> #include <linux/bug.h> #include <linux/atomic.h> #include <linux/errseq.h> #include <linux/log2.h> /* * An errseq_t is a way of recording errors in one place, and allowing any * number of "subscribers" to tell whether it has changed since a previous * point where it was sampled. * * It's implemented as an unsigned 32-bit value. The low order bits are * designated to hold an error code (between 0 and -MAX_ERRNO). The upper bits * are used as a counter. This is done with atomics instead of locking so that * these functions can be called from any context. * * The general idea is for consumers to sample an errseq_t value. That value * can later be used to tell whether any new errors have occurred since that * sampling was done. * * Note that there is a risk of collisions if new errors are being recorded * frequently, since we have so few bits to use as a counter. * * To mitigate this, one bit is used as a flag to tell whether the value has * been sampled since a new value was recorded. That allows us to avoid bumping * the counter if no one has sampled it since the last time an error was * recorded. * * A new errseq_t should always be zeroed out. A errseq_t value of all zeroes * is the special (but common) case where there has never been an error. An all * zero value thus serves as the "epoch" if one wishes to know whether there * has ever been an error set since it was first initialized. */ /* The low bits are designated for error code (max of MAX_ERRNO) */ #define ERRSEQ_SHIFT ilog2(MAX_ERRNO + 1) /* This bit is used as a flag to indicate whether the value has been seen */ #define ERRSEQ_SEEN (1 << ERRSEQ_SHIFT) /* The lowest bit of the counter */ #define ERRSEQ_CTR_INC (1 << (ERRSEQ_SHIFT + 1)) /** * errseq_set - set a errseq_t for later reporting * @eseq: errseq_t field that should be set * @err: error to set (must be between -1 and -MAX_ERRNO) * * This function sets the error in @eseq, and increments the sequence counter * if the last sequence was sampled at some point in the past. * * Any error set will always overwrite an existing error. * * Return: The previous value, primarily for debugging purposes. The * return value should not be used as a previously sampled value in later * calls as it will not have the SEEN flag set. */ errseq_t errseq_set(errseq_t *eseq, int err) { errseq_t cur, old; /* MAX_ERRNO must be able to serve as a mask */ BUILD_BUG_ON_NOT_POWER_OF_2(MAX_ERRNO + 1); /* * Ensure the error code actually fits where we want it to go. If it * doesn't then just throw a warning and don't record anything. We * also don't accept zero here as that would effectively clear a * previous error. */ old = READ_ONCE(*eseq); if (WARN(unlikely(err == 0 || (unsigned int)-err > MAX_ERRNO), "err = %d\n", err)) return old; for (;;) { errseq_t new; /* Clear out error bits and set new error */ new = (old & ~(MAX_ERRNO|ERRSEQ_SEEN)) | -err; /* Only increment if someone has looked at it */ if (old & ERRSEQ_SEEN) new += ERRSEQ_CTR_INC; /* If there would be no change, then call it done */ if (new == old) { cur = new; break; } /* Try to swap the new value into place */ cur = cmpxchg(eseq, old, new); /* * Call it success if we did the swap or someone else beat us * to it for the same value. */ if (likely(cur == old || cur == new)) break; /* Raced with an update, try again */ old = cur; } return cur; } EXPORT_SYMBOL(errseq_set); /** * errseq_sample() - Grab current errseq_t value. * @eseq: Pointer to errseq_t to be sampled. * * This function allows callers to initialise their errseq_t variable. * If the error has been "seen", new callers will not see an old error. * If there is an unseen error in @eseq, the caller of this function will * see it the next time it checks for an error. * * Context: Any context. * Return: The current errseq value. */ errseq_t errseq_sample(errseq_t *eseq) { errseq_t old = READ_ONCE(*eseq); /* If nobody has seen this error yet, then we can be the first. */ if (!(old & ERRSEQ_SEEN)) old = 0; return old; } EXPORT_SYMBOL(errseq_sample); /** * errseq_check() - Has an error occurred since a particular sample point? * @eseq: Pointer to errseq_t value to be checked. * @since: Previously-sampled errseq_t from which to check. * * Grab the value that eseq points to, and see if it has changed @since * the given value was sampled. The @since value is not advanced, so there * is no need to mark the value as seen. * * Return: The latest error set in the errseq_t or 0 if it hasn't changed. */ int errseq_check(errseq_t *eseq, errseq_t since) { errseq_t cur = READ_ONCE(*eseq); if (likely(cur == since)) return 0; return -(cur & MAX_ERRNO); } EXPORT_SYMBOL(errseq_check); /** * errseq_check_and_advance() - Check an errseq_t and advance to current value. * @eseq: Pointer to value being checked and reported. * @since: Pointer to previously-sampled errseq_t to check against and advance. * * Grab the eseq value, and see whether it matches the value that @since * points to. If it does, then just return 0. * * If it doesn't, then the value has changed. Set the "seen" flag, and try to * swap it into place as the new eseq value. Then, set that value as the new * "since" value, and return whatever the error portion is set to. * * Note that no locking is provided here for concurrent updates to the "since" * value. The caller must provide that if necessary. Because of this, callers * may want to do a lockless errseq_check before taking the lock and calling * this. * * Return: Negative errno if one has been stored, or 0 if no new error has * occurred. */ int errseq_check_and_advance(errseq_t *eseq, errseq_t *since) { int err = 0; errseq_t old, new; /* * Most callers will want to use the inline wrapper to check this, * so that the common case of no error is handled without needing * to take the lock that protects the "since" value. */ old = READ_ONCE(*eseq); if (old != *since) { /* * Set the flag and try to swap it into place if it has * changed. * * We don't care about the outcome of the swap here. If the * swap doesn't occur, then it has either been updated by a * writer who is altering the value in some way (updating * counter or resetting the error), or another reader who is * just setting the "seen" flag. Either outcome is OK, and we * can advance "since" and return an error based on what we * have. */ new = old | ERRSEQ_SEEN; if (new != old) cmpxchg(eseq, old, new); *since = new; err = -(new & MAX_ERRNO); } return err; } EXPORT_SYMBOL(errseq_check_and_advance); |
13 1 12 5 1 4 4 1 1 1 1 6 1 1 1 1 2 2 1 1 13 13 1 2 11 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 | // SPDX-License-Identifier: GPL-2.0-or-later /* * ASIX AX8817X based USB 2.0 Ethernet Devices * Copyright (C) 2003-2006 David Hollis <dhollis@davehollis.com> * Copyright (C) 2005 Phil Chang <pchang23@sbcglobal.net> * Copyright (C) 2006 James Painter <jamie.painter@iname.com> * Copyright (c) 2002-2003 TiVo Inc. */ #include "asix.h" #define PHY_MODE_MARVELL 0x0000 #define MII_MARVELL_LED_CTRL 0x0018 #define MII_MARVELL_STATUS 0x001b #define MII_MARVELL_CTRL 0x0014 #define MARVELL_LED_MANUAL 0x0019 #define MARVELL_STATUS_HWCFG 0x0004 #define MARVELL_CTRL_TXDELAY 0x0002 #define MARVELL_CTRL_RXDELAY 0x0080 #define PHY_MODE_RTL8211CL 0x000C #define AX88772A_PHY14H 0x14 #define AX88772A_PHY14H_DEFAULT 0x442C #define AX88772A_PHY15H 0x15 #define AX88772A_PHY15H_DEFAULT 0x03C8 #define AX88772A_PHY16H 0x16 #define AX88772A_PHY16H_DEFAULT 0x4044 struct ax88172_int_data { __le16 res1; u8 link; __le16 res2; u8 status; __le16 res3; } __packed; static void asix_status(struct usbnet *dev, struct urb *urb) { struct ax88172_int_data *event; int link; if (urb->actual_length < 8) return; event = urb->transfer_buffer; link = event->link & 0x01; if (netif_carrier_ok(dev->net) != link) { usbnet_link_change(dev, link, 1); netdev_dbg(dev->net, "Link Status is: %d\n", link); } } static void asix_set_netdev_dev_addr(struct usbnet *dev, u8 *addr) { if (is_valid_ether_addr(addr)) { eth_hw_addr_set(dev->net, addr); } else { netdev_info(dev->net, "invalid hw address, using random\n"); eth_hw_addr_random(dev->net); } } /* Get the PHY Identifier from the PHYSID1 & PHYSID2 MII registers */ static u32 asix_get_phyid(struct usbnet *dev) { int phy_reg; u32 phy_id; int i; /* Poll for the rare case the FW or phy isn't ready yet. */ for (i = 0; i < 100; i++) { phy_reg = asix_mdio_read(dev->net, dev->mii.phy_id, MII_PHYSID1); if (phy_reg < 0) return 0; if (phy_reg != 0 && phy_reg != 0xFFFF) break; mdelay(1); } if (phy_reg <= 0 || phy_reg == 0xFFFF) return 0; phy_id = (phy_reg & 0xffff) << 16; phy_reg = asix_mdio_read(dev->net, dev->mii.phy_id, MII_PHYSID2); if (phy_reg < 0) return 0; phy_id |= (phy_reg & 0xffff); return phy_id; } static u32 asix_get_link(struct net_device *net) { struct usbnet *dev = netdev_priv(net); return mii_link_ok(&dev->mii); } static int asix_ioctl (struct net_device *net, struct ifreq *rq, int cmd) { struct usbnet *dev = netdev_priv(net); return generic_mii_ioctl(&dev->mii, if_mii(rq), cmd, NULL); } /* We need to override some ethtool_ops so we require our own structure so we don't interfere with other usbnet devices that may be connected at the same time. */ static const struct ethtool_ops ax88172_ethtool_ops = { .get_drvinfo = asix_get_drvinfo, .get_link = asix_get_link, .get_msglevel = usbnet_get_msglevel, .set_msglevel = usbnet_set_msglevel, .get_wol = asix_get_wol, .set_wol = asix_set_wol, .get_eeprom_len = asix_get_eeprom_len, .get_eeprom = asix_get_eeprom, .set_eeprom = asix_set_eeprom, .nway_reset = usbnet_nway_reset, .get_link_ksettings = usbnet_get_link_ksettings_mii, .set_link_ksettings = usbnet_set_link_ksettings_mii, }; static void ax88172_set_multicast(struct net_device *net) { struct usbnet *dev = netdev_priv(net); struct asix_data *data = (struct asix_data *)&dev->data; u8 rx_ctl = 0x8c; if (net->flags & IFF_PROMISC) { rx_ctl |= 0x01; } else if (net->flags & IFF_ALLMULTI || netdev_mc_count(net) > AX_MAX_MCAST) { rx_ctl |= 0x02; } else if (netdev_mc_empty(net)) { /* just broadcast and directed */ } else { /* We use the 20 byte dev->data * for our 8 byte filter buffer * to avoid allocating memory that * is tricky to free later */ struct netdev_hw_addr *ha; u32 crc_bits; memset(data->multi_filter, 0, AX_MCAST_FILTER_SIZE); /* Build the multicast hash filter. */ netdev_for_each_mc_addr(ha, net) { crc_bits = ether_crc(ETH_ALEN, ha->addr) >> 26; data->multi_filter[crc_bits >> 3] |= 1 << (crc_bits & 7); } asix_write_cmd_async(dev, AX_CMD_WRITE_MULTI_FILTER, 0, 0, AX_MCAST_FILTER_SIZE, data->multi_filter); rx_ctl |= 0x10; } asix_write_cmd_async(dev, AX_CMD_WRITE_RX_CTL, rx_ctl, 0, 0, NULL); } static int ax88172_link_reset(struct usbnet *dev) { u8 mode; struct ethtool_cmd ecmd = { .cmd = ETHTOOL_GSET }; mii_check_media(&dev->mii, 1, 1); mii_ethtool_gset(&dev->mii, &ecmd); mode = AX88172_MEDIUM_DEFAULT; if (ecmd.duplex != DUPLEX_FULL) mode |= ~AX88172_MEDIUM_FD; netdev_dbg(dev->net, "ax88172_link_reset() speed: %u duplex: %d setting mode to 0x%04x\n", ethtool_cmd_speed(&ecmd), ecmd.duplex, mode); asix_write_medium_mode(dev, mode, 0); return 0; } static const struct net_device_ops ax88172_netdev_ops = { .ndo_open = usbnet_open, .ndo_stop = usbnet_stop, .ndo_start_xmit = usbnet_start_xmit, .ndo_tx_timeout = usbnet_tx_timeout, .ndo_change_mtu = usbnet_change_mtu, .ndo_get_stats64 = dev_get_tstats64, .ndo_set_mac_address = eth_mac_addr, .ndo_validate_addr = eth_validate_addr, .ndo_eth_ioctl = asix_ioctl, .ndo_set_rx_mode = ax88172_set_multicast, }; static void asix_phy_reset(struct usbnet *dev, unsigned int reset_bits) { unsigned int timeout = 5000; asix_mdio_write(dev->net, dev->mii.phy_id, MII_BMCR, reset_bits); /* give phy_id a chance to process reset */ udelay(500); /* See IEEE 802.3 "22.2.4.1.1 Reset": 500ms max */ while (timeout--) { if (asix_mdio_read(dev->net, dev->mii.phy_id, MII_BMCR) & BMCR_RESET) udelay(100); else return; } netdev_err(dev->net, "BMCR_RESET timeout on phy_id %d\n", dev->mii.phy_id); } static int ax88172_bind(struct usbnet *dev, struct usb_interface *intf) { int ret = 0; u8 buf[ETH_ALEN] = {0}; int i; unsigned long gpio_bits = dev->driver_info->data; usbnet_get_endpoints(dev,intf); /* Toggle the GPIOs in a manufacturer/model specific way */ for (i = 2; i >= 0; i--) { ret = asix_write_cmd(dev, AX_CMD_WRITE_GPIOS, (gpio_bits >> (i * 8)) & 0xff, 0, 0, NULL, 0); if (ret < 0) goto out; msleep(5); } ret = asix_write_rx_ctl(dev, 0x80, 0); if (ret < 0) goto out; /* Get the MAC address */ ret = asix_read_cmd(dev, AX88172_CMD_READ_NODE_ID, 0, 0, ETH_ALEN, buf, 0); if (ret < 0) { netdev_dbg(dev->net, "read AX_CMD_READ_NODE_ID failed: %d\n", ret); goto out; } asix_set_netdev_dev_addr(dev, buf); /* Initialize MII structure */ dev->mii.dev = dev->net; dev->mii.mdio_read = asix_mdio_read; dev->mii.mdio_write = asix_mdio_write; dev->mii.phy_id_mask = 0x3f; dev->mii.reg_num_mask = 0x1f; dev->mii.phy_id = asix_read_phy_addr(dev, true); if (dev->mii.phy_id < 0) return dev->mii.phy_id; dev->net->netdev_ops = &ax88172_netdev_ops; dev->net->ethtool_ops = &ax88172_ethtool_ops; dev->net->needed_headroom = 4; /* cf asix_tx_fixup() */ dev->net->needed_tailroom = 4; /* cf asix_tx_fixup() */ asix_phy_reset(dev, BMCR_RESET); asix_mdio_write(dev->net, dev->mii.phy_id, MII_ADVERTISE, ADVERTISE_ALL | ADVERTISE_CSMA | ADVERTISE_PAUSE_CAP); mii_nway_restart(&dev->mii); return 0; out: return ret; } static void ax88772_ethtool_get_strings(struct net_device *netdev, u32 sset, u8 *data) { switch (sset) { case ETH_SS_TEST: net_selftest_get_strings(data); break; } } static int ax88772_ethtool_get_sset_count(struct net_device *ndev, int sset) { switch (sset) { case ETH_SS_TEST: return net_selftest_get_count(); default: return -EOPNOTSUPP; } } static void ax88772_ethtool_get_pauseparam(struct net_device *ndev, struct ethtool_pauseparam *pause) { struct usbnet *dev = netdev_priv(ndev); struct asix_common_private *priv = dev->driver_priv; phylink_ethtool_get_pauseparam(priv->phylink, pause); } static int ax88772_ethtool_set_pauseparam(struct net_device *ndev, struct ethtool_pauseparam *pause) { struct usbnet *dev = netdev_priv(ndev); struct asix_common_private *priv = dev->driver_priv; return phylink_ethtool_set_pauseparam(priv->phylink, pause); } static const struct ethtool_ops ax88772_ethtool_ops = { .get_drvinfo = asix_get_drvinfo, .get_link = usbnet_get_link, .get_msglevel = usbnet_get_msglevel, .set_msglevel = usbnet_set_msglevel, .get_wol = asix_get_wol, .set_wol = asix_set_wol, .get_eeprom_len = asix_get_eeprom_len, .get_eeprom = asix_get_eeprom, .set_eeprom = asix_set_eeprom, .nway_reset = phy_ethtool_nway_reset, .get_link_ksettings = phy_ethtool_get_link_ksettings, .set_link_ksettings = phy_ethtool_set_link_ksettings, .self_test = net_selftest, .get_strings = ax88772_ethtool_get_strings, .get_sset_count = ax88772_ethtool_get_sset_count, .get_pauseparam = ax88772_ethtool_get_pauseparam, .set_pauseparam = ax88772_ethtool_set_pauseparam, }; static int ax88772_reset(struct usbnet *dev) { struct asix_data *data = (struct asix_data *)&dev->data; struct asix_common_private *priv = dev->driver_priv; int ret; /* Rewrite MAC address */ ether_addr_copy(data->mac_addr, dev->net->dev_addr); ret = asix_write_cmd(dev, AX_CMD_WRITE_NODE_ID, 0, 0, ETH_ALEN, data->mac_addr, 0); if (ret < 0) goto out; /* Set RX_CTL to default values with 2k buffer, and enable cactus */ ret = asix_write_rx_ctl(dev, AX_DEFAULT_RX_CTL, 0); if (ret < 0) goto out; ret = asix_write_medium_mode(dev, AX88772_MEDIUM_DEFAULT, 0); if (ret < 0) goto out; phylink_start(priv->phylink); return 0; out: return ret; } static int ax88772_hw_reset(struct usbnet *dev, int in_pm) { struct asix_data *data = (struct asix_data *)&dev->data; struct asix_common_private *priv = dev->driver_priv; u16 rx_ctl; int ret; ret = asix_write_gpio(dev, AX_GPIO_RSE | AX_GPIO_GPO_2 | AX_GPIO_GPO2EN, 5, in_pm); if (ret < 0) goto out; ret = asix_write_cmd(dev, AX_CMD_SW_PHY_SELECT, priv->embd_phy, 0, 0, NULL, in_pm); if (ret < 0) { netdev_dbg(dev->net, "Select PHY #1 failed: %d\n", ret); goto out; } if (priv->embd_phy) { ret = asix_sw_reset(dev, AX_SWRESET_IPPD, in_pm); if (ret < 0) goto out; usleep_range(10000, 11000); ret = asix_sw_reset(dev, AX_SWRESET_CLEAR, in_pm); if (ret < 0) goto out; msleep(60); ret = asix_sw_reset(dev, AX_SWRESET_IPRL | AX_SWRESET_PRL, in_pm); if (ret < 0) goto out; } else { ret = asix_sw_reset(dev, AX_SWRESET_IPPD | AX_SWRESET_PRL, in_pm); if (ret < 0) goto out; } msleep(150); if (in_pm && (!asix_mdio_read_nopm(dev->net, dev->mii.phy_id, MII_PHYSID1))){ ret = -EIO; goto out; } ret = asix_write_rx_ctl(dev, AX_DEFAULT_RX_CTL, in_pm); if (ret < 0) goto out; ret = asix_write_medium_mode(dev, AX88772_MEDIUM_DEFAULT, in_pm); if (ret < 0) goto out; ret = asix_write_cmd(dev, AX_CMD_WRITE_IPG0, AX88772_IPG0_DEFAULT | AX88772_IPG1_DEFAULT, AX88772_IPG2_DEFAULT, 0, NULL, in_pm); if (ret < 0) { netdev_dbg(dev->net, "Write IPG,IPG1,IPG2 failed: %d\n", ret); goto out; } /* Rewrite MAC address */ ether_addr_copy(data->mac_addr, dev->net->dev_addr); ret = asix_write_cmd(dev, AX_CMD_WRITE_NODE_ID, 0, 0, ETH_ALEN, data->mac_addr, in_pm); if (ret < 0) goto out; /* Set RX_CTL to default values with 2k buffer, and enable cactus */ ret = asix_write_rx_ctl(dev, AX_DEFAULT_RX_CTL, in_pm); if (ret < 0) goto out; rx_ctl = asix_read_rx_ctl(dev, in_pm); netdev_dbg(dev->net, "RX_CTL is 0x%04x after all initializations\n", rx_ctl); rx_ctl = asix_read_medium_status(dev, in_pm); netdev_dbg(dev->net, "Medium Status is 0x%04x after all initializations\n", rx_ctl); return 0; out: return ret; } static int ax88772a_hw_reset(struct usbnet *dev, int in_pm) { struct asix_data *data = (struct asix_data *)&dev->data; struct asix_common_private *priv = dev->driver_priv; u16 rx_ctl, phy14h, phy15h, phy16h; int ret; ret = asix_write_gpio(dev, AX_GPIO_RSE, 5, in_pm); if (ret < 0) goto out; ret = asix_write_cmd(dev, AX_CMD_SW_PHY_SELECT, priv->embd_phy | AX_PHYSEL_SSEN, 0, 0, NULL, in_pm); if (ret < 0) { netdev_dbg(dev->net, "Select PHY #1 failed: %d\n", ret); goto out; } usleep_range(10000, 11000); ret = asix_sw_reset(dev, AX_SWRESET_IPPD | AX_SWRESET_IPRL, in_pm); if (ret < 0) goto out; usleep_range(10000, 11000); ret = asix_sw_reset(dev, AX_SWRESET_IPRL, in_pm); if (ret < 0) goto out; msleep(160); ret = asix_sw_reset(dev, AX_SWRESET_CLEAR, in_pm); if (ret < 0) goto out; ret = asix_sw_reset(dev, AX_SWRESET_IPRL, in_pm); if (ret < 0) goto out; msleep(200); if (in_pm && (!asix_mdio_read_nopm(dev->net, dev->mii.phy_id, MII_PHYSID1))) { ret = -1; goto out; } if (priv->chipcode == AX_AX88772B_CHIPCODE) { ret = asix_write_cmd(dev, AX_QCTCTRL, 0x8000, 0x8001, 0, NULL, in_pm); if (ret < 0) { netdev_dbg(dev->net, "Write BQ setting failed: %d\n", ret); goto out; } } else if (priv->chipcode == AX_AX88772A_CHIPCODE) { /* Check if the PHY registers have default settings */ phy14h = asix_mdio_read_nopm(dev->net, dev->mii.phy_id, AX88772A_PHY14H); phy15h = asix_mdio_read_nopm(dev->net, dev->mii.phy_id, AX88772A_PHY15H); phy16h = asix_mdio_read_nopm(dev->net, dev->mii.phy_id, AX88772A_PHY16H); netdev_dbg(dev->net, "772a_hw_reset: MR20=0x%x MR21=0x%x MR22=0x%x\n", phy14h, phy15h, phy16h); /* Restore PHY registers default setting if not */ if (phy14h != AX88772A_PHY14H_DEFAULT) asix_mdio_write_nopm(dev->net, dev->mii.phy_id, AX88772A_PHY14H, AX88772A_PHY14H_DEFAULT); if (phy15h != AX88772A_PHY15H_DEFAULT) asix_mdio_write_nopm(dev->net, dev->mii.phy_id, AX88772A_PHY15H, AX88772A_PHY15H_DEFAULT); if (phy16h != AX88772A_PHY16H_DEFAULT) asix_mdio_write_nopm(dev->net, dev->mii.phy_id, AX88772A_PHY16H, AX88772A_PHY16H_DEFAULT); } ret = asix_write_cmd(dev, AX_CMD_WRITE_IPG0, AX88772_IPG0_DEFAULT | AX88772_IPG1_DEFAULT, AX88772_IPG2_DEFAULT, 0, NULL, in_pm); if (ret < 0) { netdev_dbg(dev->net, "Write IPG,IPG1,IPG2 failed: %d\n", ret); goto out; } /* Rewrite MAC address */ memcpy(data->mac_addr, dev->net->dev_addr, ETH_ALEN); ret = asix_write_cmd(dev, AX_CMD_WRITE_NODE_ID, 0, 0, ETH_ALEN, data->mac_addr, in_pm); if (ret < 0) goto out; /* Set RX_CTL to default values with 2k buffer, and enable cactus */ ret = asix_write_rx_ctl(dev, AX_DEFAULT_RX_CTL, in_pm); if (ret < 0) goto out; ret = asix_write_medium_mode(dev, AX88772_MEDIUM_DEFAULT, in_pm); if (ret < 0) return ret; /* Set RX_CTL to default values with 2k buffer, and enable cactus */ ret = asix_write_rx_ctl(dev, AX_DEFAULT_RX_CTL, in_pm); if (ret < 0) goto out; rx_ctl = asix_read_rx_ctl(dev, in_pm); netdev_dbg(dev->net, "RX_CTL is 0x%04x after all initializations\n", rx_ctl); rx_ctl = asix_read_medium_status(dev, in_pm); netdev_dbg(dev->net, "Medium Status is 0x%04x after all initializations\n", rx_ctl); return 0; out: return ret; } static const struct net_device_ops ax88772_netdev_ops = { .ndo_open = usbnet_open, .ndo_stop = usbnet_stop, .ndo_start_xmit = usbnet_start_xmit, .ndo_tx_timeout = usbnet_tx_timeout, .ndo_change_mtu = usbnet_change_mtu, .ndo_get_stats64 = dev_get_tstats64, .ndo_set_mac_address = asix_set_mac_address, .ndo_validate_addr = eth_validate_addr, .ndo_eth_ioctl = phy_do_ioctl_running, .ndo_set_rx_mode = asix_set_multicast, }; static void ax88772_suspend(struct usbnet *dev) { struct asix_common_private *priv = dev->driver_priv; u16 medium; if (netif_running(dev->net)) { rtnl_lock(); phylink_suspend(priv->phylink, false); rtnl_unlock(); } /* Stop MAC operation */ medium = asix_read_medium_status(dev, 1); medium &= ~AX_MEDIUM_RE; asix_write_medium_mode(dev, medium, 1); netdev_dbg(dev->net, "ax88772_suspend: medium=0x%04x\n", asix_read_medium_status(dev, 1)); } static int asix_suspend(struct usb_interface *intf, pm_message_t message) { struct usbnet *dev = usb_get_intfdata(intf); struct asix_common_private *priv = dev->driver_priv; if (priv && priv->suspend) priv->suspend(dev); return usbnet_suspend(intf, message); } static void ax88772_resume(struct usbnet *dev) { struct asix_common_private *priv = dev->driver_priv; int i; for (i = 0; i < 3; i++) if (!priv->reset(dev, 1)) break; if (netif_running(dev->net)) { rtnl_lock(); phylink_resume(priv->phylink); rtnl_unlock(); } } static int asix_resume(struct usb_interface *intf) { struct usbnet *dev = usb_get_intfdata(intf); struct asix_common_private *priv = dev->driver_priv; if (priv && priv->resume) priv->resume(dev); return usbnet_resume(intf); } static int ax88772_init_mdio(struct usbnet *dev) { struct asix_common_private *priv = dev->driver_priv; int ret; priv->mdio = mdiobus_alloc(); if (!priv->mdio) return -ENOMEM; priv->mdio->priv = dev; priv->mdio->read = &asix_mdio_bus_read; priv->mdio->write = &asix_mdio_bus_write; priv->mdio->name = "Asix MDIO Bus"; /* mii bus name is usb-<usb bus number>-<usb device number> */ snprintf(priv->mdio->id, MII_BUS_ID_SIZE, "usb-%03d:%03d", dev->udev->bus->busnum, dev->udev->devnum); ret = mdiobus_register(priv->mdio); if (ret) { netdev_err(dev->net, "Could not register MDIO bus (err %d)\n", ret); mdiobus_free(priv->mdio); priv->mdio = NULL; } return ret; } static void ax88772_mdio_unregister(struct asix_common_private *priv) { mdiobus_unregister(priv->mdio); mdiobus_free(priv->mdio); } static int ax88772_init_phy(struct usbnet *dev) { struct asix_common_private *priv = dev->driver_priv; int ret; priv->phydev = mdiobus_get_phy(priv->mdio, priv->phy_addr); if (!priv->phydev) { netdev_err(dev->net, "Could not find PHY\n"); return -ENODEV; } ret = phylink_connect_phy(priv->phylink, priv->phydev); if (ret) { netdev_err(dev->net, "Could not connect PHY\n"); return ret; } phy_suspend(priv->phydev); priv->phydev->mac_managed_pm = true; phy_attached_info(priv->phydev); if (priv->embd_phy) return 0; /* In case main PHY is not the embedded PHY and MAC is RMII clock * provider, we need to suspend embedded PHY by keeping PLL enabled * (AX_SWRESET_IPPD == 0). */ priv->phydev_int = mdiobus_get_phy(priv->mdio, AX_EMBD_PHY_ADDR); if (!priv->phydev_int) { rtnl_lock(); phylink_disconnect_phy(priv->phylink); rtnl_unlock(); netdev_err(dev->net, "Could not find internal PHY\n"); return -ENODEV; } priv->phydev_int->mac_managed_pm = true; phy_suspend(priv->phydev_int); return 0; } static void ax88772_mac_config(struct phylink_config *config, unsigned int mode, const struct phylink_link_state *state) { /* Nothing to do */ } static void ax88772_mac_link_down(struct phylink_config *config, unsigned int mode, phy_interface_t interface) { struct usbnet *dev = netdev_priv(to_net_dev(config->dev)); asix_write_medium_mode(dev, 0, 0); usbnet_link_change(dev, false, false); } static void ax88772_mac_link_up(struct phylink_config *config, struct phy_device *phy, unsigned int mode, phy_interface_t interface, int speed, int duplex, bool tx_pause, bool rx_pause) { struct usbnet *dev = netdev_priv(to_net_dev(config->dev)); u16 m = AX_MEDIUM_AC | AX_MEDIUM_RE; m |= duplex ? AX_MEDIUM_FD : 0; switch (speed) { case SPEED_100: m |= AX_MEDIUM_PS; break; case SPEED_10: break; default: return; } if (tx_pause) m |= AX_MEDIUM_TFC; if (rx_pause) m |= AX_MEDIUM_RFC; asix_write_medium_mode(dev, m, 0); usbnet_link_change(dev, true, false); } static const struct phylink_mac_ops ax88772_phylink_mac_ops = { .mac_config = ax88772_mac_config, .mac_link_down = ax88772_mac_link_down, .mac_link_up = ax88772_mac_link_up, }; static int ax88772_phylink_setup(struct usbnet *dev) { struct asix_common_private *priv = dev->driver_priv; phy_interface_t phy_if_mode; struct phylink *phylink; priv->phylink_config.dev = &dev->net->dev; priv->phylink_config.type = PHYLINK_NETDEV; priv->phylink_config.mac_capabilities = MAC_SYM_PAUSE | MAC_ASYM_PAUSE | MAC_10 | MAC_100; __set_bit(PHY_INTERFACE_MODE_INTERNAL, priv->phylink_config.supported_interfaces); __set_bit(PHY_INTERFACE_MODE_RMII, priv->phylink_config.supported_interfaces); if (priv->embd_phy) phy_if_mode = PHY_INTERFACE_MODE_INTERNAL; else phy_if_mode = PHY_INTERFACE_MODE_RMII; phylink = phylink_create(&priv->phylink_config, dev->net->dev.fwnode, phy_if_mode, &ax88772_phylink_mac_ops); if (IS_ERR(phylink)) return PTR_ERR(phylink); priv->phylink = phylink; return 0; } static int ax88772_bind(struct usbnet *dev, struct usb_interface *intf) { struct asix_common_private *priv; u8 buf[ETH_ALEN] = {0}; int ret, i; priv = devm_kzalloc(&dev->udev->dev, sizeof(*priv), GFP_KERNEL); if (!priv) return -ENOMEM; dev->driver_priv = priv; usbnet_get_endpoints(dev, intf); /* Maybe the boot loader passed the MAC address via device tree */ if (!eth_platform_get_mac_address(&dev->udev->dev, buf)) { netif_dbg(dev, ifup, dev->net, "MAC address read from device tree"); } else { /* Try getting the MAC address from EEPROM */ if (dev->driver_info->data & FLAG_EEPROM_MAC) { for (i = 0; i < (ETH_ALEN >> 1); i++) { ret = asix_read_cmd(dev, AX_CMD_READ_EEPROM, 0x04 + i, 0, 2, buf + i * 2, 0); if (ret < 0) break; } } else { ret = asix_read_cmd(dev, AX_CMD_READ_NODE_ID, 0, 0, ETH_ALEN, buf, 0); } if (ret < 0) { netdev_dbg(dev->net, "Failed to read MAC address: %d\n", ret); return ret; } } asix_set_netdev_dev_addr(dev, buf); dev->net->netdev_ops = &ax88772_netdev_ops; dev->net->ethtool_ops = &ax88772_ethtool_ops; dev->net->needed_headroom = 4; /* cf asix_tx_fixup() */ dev->net->needed_tailroom = 4; /* cf asix_tx_fixup() */ ret = asix_read_phy_addr(dev, true); if (ret < 0) return ret; priv->phy_addr = ret; priv->embd_phy = ((priv->phy_addr & 0x1f) == AX_EMBD_PHY_ADDR); ret = asix_read_cmd(dev, AX_CMD_STATMNGSTS_REG, 0, 0, 1, &priv->chipcode, 0); if (ret < 0) { netdev_dbg(dev->net, "Failed to read STATMNGSTS_REG: %d\n", ret); return ret; } priv->chipcode &= AX_CHIPCODE_MASK; priv->resume = ax88772_resume; priv->suspend = ax88772_suspend; if (priv->chipcode == AX_AX88772_CHIPCODE) priv->reset = ax88772_hw_reset; else priv->reset = ax88772a_hw_reset; ret = priv->reset(dev, 0); if (ret < 0) { netdev_dbg(dev->net, "Failed to reset AX88772: %d\n", ret); return ret; } /* Asix framing packs multiple eth frames into a 2K usb bulk transfer */ if (dev->driver_info->flags & FLAG_FRAMING_AX) { /* hard_mtu is still the default - the device does not support jumbo eth frames */ dev->rx_urb_size = 2048; } priv->presvd_phy_bmcr = 0; priv->presvd_phy_advertise = 0; ret = ax88772_init_mdio(dev); if (ret) goto mdio_err; ret = ax88772_phylink_setup(dev); if (ret) goto phylink_err; ret = ax88772_init_phy(dev); if (ret) goto initphy_err; return 0; initphy_err: phylink_destroy(priv->phylink); phylink_err: ax88772_mdio_unregister(priv); mdio_err: return ret; } static int ax88772_stop(struct usbnet *dev) { struct asix_common_private *priv = dev->driver_priv; phylink_stop(priv->phylink); return 0; } static void ax88772_unbind(struct usbnet *dev, struct usb_interface *intf) { struct asix_common_private *priv = dev->driver_priv; rtnl_lock(); phylink_disconnect_phy(priv->phylink); rtnl_unlock(); phylink_destroy(priv->phylink); ax88772_mdio_unregister(priv); asix_rx_fixup_common_free(dev->driver_priv); } static void ax88178_unbind(struct usbnet *dev, struct usb_interface *intf) { asix_rx_fixup_common_free(dev->driver_priv); kfree(dev->driver_priv); } static const struct ethtool_ops ax88178_ethtool_ops = { .get_drvinfo = asix_get_drvinfo, .get_link = asix_get_link, .get_msglevel = usbnet_get_msglevel, .set_msglevel = usbnet_set_msglevel, .get_wol = asix_get_wol, .set_wol = asix_set_wol, .get_eeprom_len = asix_get_eeprom_len, .get_eeprom = asix_get_eeprom, .set_eeprom = asix_set_eeprom, .nway_reset = usbnet_nway_reset, .get_link_ksettings = usbnet_get_link_ksettings_mii, .set_link_ksettings = usbnet_set_link_ksettings_mii, }; static int marvell_phy_init(struct usbnet *dev) { struct asix_data *data = (struct asix_data *)&dev->data; u16 reg; netdev_dbg(dev->net, "marvell_phy_init()\n"); reg = asix_mdio_read(dev->net, dev->mii.phy_id, MII_MARVELL_STATUS); netdev_dbg(dev->net, "MII_MARVELL_STATUS = 0x%04x\n", reg); asix_mdio_write(dev->net, dev->mii.phy_id, MII_MARVELL_CTRL, MARVELL_CTRL_RXDELAY | MARVELL_CTRL_TXDELAY); if (data->ledmode) { reg = asix_mdio_read(dev->net, dev->mii.phy_id, MII_MARVELL_LED_CTRL); netdev_dbg(dev->net, "MII_MARVELL_LED_CTRL (1) = 0x%04x\n", reg); reg &= 0xf8ff; reg |= (1 + 0x0100); asix_mdio_write(dev->net, dev->mii.phy_id, MII_MARVELL_LED_CTRL, reg); reg = asix_mdio_read(dev->net, dev->mii.phy_id, MII_MARVELL_LED_CTRL); netdev_dbg(dev->net, "MII_MARVELL_LED_CTRL (2) = 0x%04x\n", reg); } return 0; } static int rtl8211cl_phy_init(struct usbnet *dev) { struct asix_data *data = (struct asix_data *)&dev->data; netdev_dbg(dev->net, "rtl8211cl_phy_init()\n"); asix_mdio_write (dev->net, dev->mii.phy_id, 0x1f, 0x0005); asix_mdio_write (dev->net, dev->mii.phy_id, 0x0c, 0); asix_mdio_write (dev->net, dev->mii.phy_id, 0x01, asix_mdio_read (dev->net, dev->mii.phy_id, 0x01) | 0x0080); asix_mdio_write (dev->net, dev->mii.phy_id, 0x1f, 0); if (data->ledmode == 12) { asix_mdio_write (dev->net, dev->mii.phy_id, 0x1f, 0x0002); asix_mdio_write (dev->net, dev->mii.phy_id, 0x1a, 0x00cb); asix_mdio_write (dev->net, dev->mii.phy_id, 0x1f, 0); } return 0; } static int marvell_led_status(struct usbnet *dev, u16 speed) { u16 reg = asix_mdio_read(dev->net, dev->mii.phy_id, MARVELL_LED_MANUAL); netdev_dbg(dev->net, "marvell_led_status() read 0x%04x\n", reg); /* Clear out the center LED bits - 0x03F0 */ reg &= 0xfc0f; switch (speed) { case SPEED_1000: reg |= 0x03e0; break; case SPEED_100: reg |= 0x03b0; break; default: reg |= 0x02f0; } netdev_dbg(dev->net, "marvell_led_status() writing 0x%04x\n", reg); asix_mdio_write(dev->net, dev->mii.phy_id, MARVELL_LED_MANUAL, reg); return 0; } static int ax88178_reset(struct usbnet *dev) { struct asix_data *data = (struct asix_data *)&dev->data; int ret; __le16 eeprom; u8 status; int gpio0 = 0; u32 phyid; ret = asix_read_cmd(dev, AX_CMD_READ_GPIOS, 0, 0, 1, &status, 0); if (ret < 0) { netdev_dbg(dev->net, "Failed to read GPIOS: %d\n", ret); return ret; } netdev_dbg(dev->net, "GPIO Status: 0x%04x\n", status); asix_write_cmd(dev, AX_CMD_WRITE_ENABLE, 0, 0, 0, NULL, 0); ret = asix_read_cmd(dev, AX_CMD_READ_EEPROM, 0x0017, 0, 2, &eeprom, 0); if (ret < 0) { netdev_dbg(dev->net, "Failed to read EEPROM: %d\n", ret); return ret; } asix_write_cmd(dev, AX_CMD_WRITE_DISABLE, 0, 0, 0, NULL, 0); netdev_dbg(dev->net, "EEPROM index 0x17 is 0x%04x\n", eeprom); if (eeprom == cpu_to_le16(0xffff)) { data->phymode = PHY_MODE_MARVELL; data->ledmode = 0; gpio0 = 1; } else { data->phymode = le16_to_cpu(eeprom) & 0x7F; data->ledmode = le16_to_cpu(eeprom) >> 8; gpio0 = (le16_to_cpu(eeprom) & 0x80) ? 0 : 1; } netdev_dbg(dev->net, "GPIO0: %d, PhyMode: %d\n", gpio0, data->phymode); /* Power up external GigaPHY through AX88178 GPIO pin */ asix_write_gpio(dev, AX_GPIO_RSE | AX_GPIO_GPO_1 | AX_GPIO_GPO1EN, 40, 0); if ((le16_to_cpu(eeprom) >> 8) != 1) { asix_write_gpio(dev, 0x003c, 30, 0); asix_write_gpio(dev, 0x001c, 300, 0); asix_write_gpio(dev, 0x003c, 30, 0); } else { netdev_dbg(dev->net, "gpio phymode == 1 path\n"); asix_write_gpio(dev, AX_GPIO_GPO1EN, 30, 0); asix_write_gpio(dev, AX_GPIO_GPO1EN | AX_GPIO_GPO_1, 30, 0); } /* Read PHYID register *AFTER* powering up PHY */ phyid = asix_get_phyid(dev); netdev_dbg(dev->net, "PHYID=0x%08x\n", phyid); /* Set AX88178 to enable MII/GMII/RGMII interface for external PHY */ asix_write_cmd(dev, AX_CMD_SW_PHY_SELECT, 0, 0, 0, NULL, 0); asix_sw_reset(dev, 0, 0); msleep(150); asix_sw_reset(dev, AX_SWRESET_PRL | AX_SWRESET_IPPD, 0); msleep(150); asix_write_rx_ctl(dev, 0, 0); if (data->phymode == PHY_MODE_MARVELL) { marvell_phy_init(dev); msleep(60); } else if (data->phymode == PHY_MODE_RTL8211CL) rtl8211cl_phy_init(dev); asix_phy_reset(dev, BMCR_RESET | BMCR_ANENABLE); asix_mdio_write(dev->net, dev->mii.phy_id, MII_ADVERTISE, ADVERTISE_ALL | ADVERTISE_CSMA | ADVERTISE_PAUSE_CAP); asix_mdio_write(dev->net, dev->mii.phy_id, MII_CTRL1000, ADVERTISE_1000FULL); asix_write_medium_mode(dev, AX88178_MEDIUM_DEFAULT, 0); mii_nway_restart(&dev->mii); /* Rewrite MAC address */ memcpy(data->mac_addr, dev->net->dev_addr, ETH_ALEN); ret = asix_write_cmd(dev, AX_CMD_WRITE_NODE_ID, 0, 0, ETH_ALEN, data->mac_addr, 0); if (ret < 0) return ret; ret = asix_write_rx_ctl(dev, AX_DEFAULT_RX_CTL, 0); if (ret < 0) return ret; return 0; } static int ax88178_link_reset(struct usbnet *dev) { u16 mode; struct ethtool_cmd ecmd = { .cmd = ETHTOOL_GSET }; struct asix_data *data = (struct asix_data *)&dev->data; u32 speed; netdev_dbg(dev->net, "ax88178_link_reset()\n"); mii_check_media(&dev->mii, 1, 1); mii_ethtool_gset(&dev->mii, &ecmd); mode = AX88178_MEDIUM_DEFAULT; speed = ethtool_cmd_speed(&ecmd); if (speed == SPEED_1000) mode |= AX_MEDIUM_GM; else if (speed == SPEED_100) mode |= AX_MEDIUM_PS; else mode &= ~(AX_MEDIUM_PS | AX_MEDIUM_GM); mode |= AX_MEDIUM_ENCK; if (ecmd.duplex == DUPLEX_FULL) mode |= AX_MEDIUM_FD; else mode &= ~AX_MEDIUM_FD; netdev_dbg(dev->net, "ax88178_link_reset() speed: %u duplex: %d setting mode to 0x%04x\n", speed, ecmd.duplex, mode); asix_write_medium_mode(dev, mode, 0); if (data->phymode == PHY_MODE_MARVELL && data->ledmode) marvell_led_status(dev, speed); return 0; } static void ax88178_set_mfb(struct usbnet *dev) { u16 mfb = AX_RX_CTL_MFB_16384; u16 rxctl; u16 medium; int old_rx_urb_size = dev->rx_urb_size; if (dev->hard_mtu < 2048) { dev->rx_urb_size = 2048; mfb = AX_RX_CTL_MFB_2048; } else if (dev->hard_mtu < 4096) { dev->rx_urb_size = 4096; mfb = AX_RX_CTL_MFB_4096; } else if (dev->hard_mtu < 8192) { dev->rx_urb_size = 8192; mfb = AX_RX_CTL_MFB_8192; } else if (dev->hard_mtu < 16384) { dev->rx_urb_size = 16384; mfb = AX_RX_CTL_MFB_16384; } rxctl = asix_read_rx_ctl(dev, 0); asix_write_rx_ctl(dev, (rxctl & ~AX_RX_CTL_MFB_16384) | mfb, 0); medium = asix_read_medium_status(dev, 0); if (dev->net->mtu > 1500) medium |= AX_MEDIUM_JFE; else medium &= ~AX_MEDIUM_JFE; asix_write_medium_mode(dev, medium, 0); if (dev->rx_urb_size > old_rx_urb_size) usbnet_unlink_rx_urbs(dev); } static int ax88178_change_mtu(struct net_device *net, int new_mtu) { struct usbnet *dev = netdev_priv(net); int ll_mtu = new_mtu + net->hard_header_len + 4; netdev_dbg(dev->net, "ax88178_change_mtu() new_mtu=%d\n", new_mtu); if ((ll_mtu % dev->maxpacket) == 0) return -EDOM; net->mtu = new_mtu; dev->hard_mtu = net->mtu + net->hard_header_len; ax88178_set_mfb(dev); /* max qlen depend on hard_mtu and rx_urb_size */ usbnet_update_max_qlen(dev); return 0; } static const struct net_device_ops ax88178_netdev_ops = { .ndo_open = usbnet_open, .ndo_stop = usbnet_stop, .ndo_start_xmit = usbnet_start_xmit, .ndo_tx_timeout = usbnet_tx_timeout, .ndo_get_stats64 = dev_get_tstats64, .ndo_set_mac_address = asix_set_mac_address, .ndo_validate_addr = eth_validate_addr, .ndo_set_rx_mode = asix_set_multicast, .ndo_eth_ioctl = asix_ioctl, .ndo_change_mtu = ax88178_change_mtu, }; static int ax88178_bind(struct usbnet *dev, struct usb_interface *intf) { int ret; u8 buf[ETH_ALEN] = {0}; usbnet_get_endpoints(dev,intf); /* Get the MAC address */ ret = asix_read_cmd(dev, AX_CMD_READ_NODE_ID, 0, 0, ETH_ALEN, buf, 0); if (ret < 0) { netdev_dbg(dev->net, "Failed to read MAC address: %d\n", ret); return ret; } asix_set_netdev_dev_addr(dev, buf); /* Initialize MII structure */ dev->mii.dev = dev->net; dev->mii.mdio_read = asix_mdio_read; dev->mii.mdio_write = asix_mdio_write; dev->mii.phy_id_mask = 0x1f; dev->mii.reg_num_mask = 0xff; dev->mii.supports_gmii = 1; dev->mii.phy_id = asix_read_phy_addr(dev, true); if (dev->mii.phy_id < 0) return dev->mii.phy_id; dev->net->netdev_ops = &ax88178_netdev_ops; dev->net->ethtool_ops = &ax88178_ethtool_ops; dev->net->max_mtu = 16384 - (dev->net->hard_header_len + 4); /* Blink LEDS so users know driver saw dongle */ asix_sw_reset(dev, 0, 0); msleep(150); asix_sw_reset(dev, AX_SWRESET_PRL | AX_SWRESET_IPPD, 0); msleep(150); /* Asix framing packs multiple eth frames into a 2K usb bulk transfer */ if (dev->driver_info->flags & FLAG_FRAMING_AX) { /* hard_mtu is still the default - the device does not support jumbo eth frames */ dev->rx_urb_size = 2048; } dev->driver_priv = kzalloc(sizeof(struct asix_common_private), GFP_KERNEL); if (!dev->driver_priv) return -ENOMEM; return 0; } static const struct driver_info ax8817x_info = { .description = "ASIX AX8817x USB 2.0 Ethernet", .bind = ax88172_bind, .status = asix_status, .link_reset = ax88172_link_reset, .reset = ax88172_link_reset, .flags = FLAG_ETHER | FLAG_LINK_INTR, .data = 0x00130103, }; static const struct driver_info dlink_dub_e100_info = { .description = "DLink DUB-E100 USB Ethernet", .bind = ax88172_bind, .status = asix_status, .link_reset = ax88172_link_reset, .reset = ax88172_link_reset, .flags = FLAG_ETHER | FLAG_LINK_INTR, .data = 0x009f9d9f, }; static const struct driver_info netgear_fa120_info = { .description = "Netgear FA-120 USB Ethernet", .bind = ax88172_bind, .status = asix_status, .link_reset = ax88172_link_reset, .reset = ax88172_link_reset, .flags = FLAG_ETHER | FLAG_LINK_INTR, .data = 0x00130103, }; static const struct driver_info hawking_uf200_info = { .description = "Hawking UF200 USB Ethernet", .bind = ax88172_bind, .status = asix_status, .link_reset = ax88172_link_reset, .reset = ax88172_link_reset, .flags = FLAG_ETHER | FLAG_LINK_INTR, .data = 0x001f1d1f, }; static const struct driver_info ax88772_info = { .description = "ASIX AX88772 USB 2.0 Ethernet", .bind = ax88772_bind, .unbind = ax88772_unbind, .status = asix_status, .reset = ax88772_reset, .stop = ax88772_stop, .flags = FLAG_ETHER | FLAG_FRAMING_AX | FLAG_LINK_INTR | FLAG_MULTI_PACKET, .rx_fixup = asix_rx_fixup_common, .tx_fixup = asix_tx_fixup, }; static const struct driver_info ax88772b_info = { .description = "ASIX AX88772B USB 2.0 Ethernet", .bind = ax88772_bind, .unbind = ax88772_unbind, .status = asix_status, .reset = ax88772_reset, .stop = ax88772_stop, .flags = FLAG_ETHER | FLAG_FRAMING_AX | FLAG_LINK_INTR | FLAG_MULTI_PACKET, .rx_fixup = asix_rx_fixup_common, .tx_fixup = asix_tx_fixup, .data = FLAG_EEPROM_MAC, }; static const struct driver_info lxausb_t1l_info = { .description = "Linux Automation GmbH USB 10Base-T1L", .bind = ax88772_bind, .unbind = ax88772_unbind, .status = asix_status, .reset = ax88772_reset, .stop = ax88772_stop, .flags = FLAG_ETHER | FLAG_FRAMING_AX | FLAG_LINK_INTR | FLAG_MULTI_PACKET, .rx_fixup = asix_rx_fixup_common, .tx_fixup = asix_tx_fixup, .data = FLAG_EEPROM_MAC, }; static const struct driver_info ax88178_info = { .description = "ASIX AX88178 USB 2.0 Ethernet", .bind = ax88178_bind, .unbind = ax88178_unbind, .status = asix_status, .link_reset = ax88178_link_reset, .reset = ax88178_reset, .flags = FLAG_ETHER | FLAG_FRAMING_AX | FLAG_LINK_INTR | FLAG_MULTI_PACKET, .rx_fixup = asix_rx_fixup_common, .tx_fixup = asix_tx_fixup, }; /* * USBLINK 20F9 "USB 2.0 LAN" USB ethernet adapter, typically found in * no-name packaging. * USB device strings are: * 1: Manufacturer: USBLINK * 2: Product: HG20F9 USB2.0 * 3: Serial: 000003 * Appears to be compatible with Asix 88772B. */ static const struct driver_info hg20f9_info = { .description = "HG20F9 USB 2.0 Ethernet", .bind = ax88772_bind, .unbind = ax88772_unbind, .status = asix_status, .reset = ax88772_reset, .flags = FLAG_ETHER | FLAG_FRAMING_AX | FLAG_LINK_INTR | FLAG_MULTI_PACKET, .rx_fixup = asix_rx_fixup_common, .tx_fixup = asix_tx_fixup, .data = FLAG_EEPROM_MAC, }; static const struct usb_device_id products [] = { { // Linksys USB200M USB_DEVICE (0x077b, 0x2226), .driver_info = (unsigned long) &ax8817x_info, }, { // Netgear FA120 USB_DEVICE (0x0846, 0x1040), .driver_info = (unsigned long) &netgear_fa120_info, }, { // DLink DUB-E100 USB_DEVICE (0x2001, 0x1a00), .driver_info = (unsigned long) &dlink_dub_e100_info, }, { // Intellinet, ST Lab USB Ethernet USB_DEVICE (0x0b95, 0x1720), .driver_info = (unsigned long) &ax8817x_info, }, { // Hawking UF200, TrendNet TU2-ET100 USB_DEVICE (0x07b8, 0x420a), .driver_info = (unsigned long) &hawking_uf200_info, }, { // Billionton Systems, USB2AR USB_DEVICE (0x08dd, 0x90ff), .driver_info = (unsigned long) &ax8817x_info, }, { // Billionton Systems, GUSB2AM-1G-B USB_DEVICE(0x08dd, 0x0114), .driver_info = (unsigned long) &ax88178_info, }, { // ATEN UC210T USB_DEVICE (0x0557, 0x2009), .driver_info = (unsigned long) &ax8817x_info, }, { // Buffalo LUA-U2-KTX USB_DEVICE (0x0411, 0x003d), .driver_info = (unsigned long) &ax8817x_info, }, { // Buffalo LUA-U2-GT 10/100/1000 USB_DEVICE (0x0411, 0x006e), .driver_info = (unsigned long) &ax88178_info, }, { // Sitecom LN-029 "USB 2.0 10/100 Ethernet adapter" USB_DEVICE (0x6189, 0x182d), .driver_info = (unsigned long) &ax8817x_info, }, { // Sitecom LN-031 "USB 2.0 10/100/1000 Ethernet adapter" USB_DEVICE (0x0df6, 0x0056), .driver_info = (unsigned long) &ax88178_info, }, { // Sitecom LN-028 "USB 2.0 10/100/1000 Ethernet adapter" USB_DEVICE (0x0df6, 0x061c), .driver_info = (unsigned long) &ax88178_info, }, { // corega FEther USB2-TX USB_DEVICE (0x07aa, 0x0017), .driver_info = (unsigned long) &ax8817x_info, }, { // Surecom EP-1427X-2 USB_DEVICE (0x1189, 0x0893), .driver_info = (unsigned long) &ax8817x_info, }, { // goodway corp usb gwusb2e USB_DEVICE (0x1631, 0x6200), .driver_info = (unsigned long) &ax8817x_info, }, { // JVC MP-PRX1 Port Replicator USB_DEVICE (0x04f1, 0x3008), .driver_info = (unsigned long) &ax8817x_info, }, { // Lenovo U2L100P 10/100 USB_DEVICE (0x17ef, 0x7203), .driver_info = (unsigned long)&ax88772b_info, }, { // ASIX AX88772B 10/100 USB_DEVICE (0x0b95, 0x772b), .driver_info = (unsigned long) &ax88772b_info, }, { // ASIX AX88772 10/100 USB_DEVICE (0x0b95, 0x7720), .driver_info = (unsigned long) &ax88772_info, }, { // ASIX AX88178 10/100/1000 USB_DEVICE (0x0b95, 0x1780), .driver_info = (unsigned long) &ax88178_info, }, { // Logitec LAN-GTJ/U2A USB_DEVICE (0x0789, 0x0160), .driver_info = (unsigned long) &ax88178_info, }, { // Linksys USB200M Rev 2 USB_DEVICE (0x13b1, 0x0018), .driver_info = (unsigned long) &ax88772_info, }, { // 0Q0 cable ethernet USB_DEVICE (0x1557, 0x7720), .driver_info = (unsigned long) &ax88772_info, }, { // DLink DUB-E100 H/W Ver B1 USB_DEVICE (0x07d1, 0x3c05), .driver_info = (unsigned long) &ax88772_info, }, { // DLink DUB-E100 H/W Ver B1 Alternate USB_DEVICE (0x2001, 0x3c05), .driver_info = (unsigned long) &ax88772_info, }, { // DLink DUB-E100 H/W Ver C1 USB_DEVICE (0x2001, 0x1a02), .driver_info = (unsigned long) &ax88772_info, }, { // Linksys USB1000 USB_DEVICE (0x1737, 0x0039), .driver_info = (unsigned long) &ax88178_info, }, { // IO-DATA ETG-US2 USB_DEVICE (0x04bb, 0x0930), .driver_info = (unsigned long) &ax88178_info, }, { // Belkin F5D5055 USB_DEVICE(0x050d, 0x5055), .driver_info = (unsigned long) &ax88178_info, }, { // Apple USB Ethernet Adapter USB_DEVICE(0x05ac, 0x1402), .driver_info = (unsigned long) &ax88772_info, }, { // Cables-to-Go USB Ethernet Adapter USB_DEVICE(0x0b95, 0x772a), .driver_info = (unsigned long) &ax88772_info, }, { // ABOCOM for pci USB_DEVICE(0x14ea, 0xab11), .driver_info = (unsigned long) &ax88178_info, }, { // ASIX 88772a USB_DEVICE(0x0db0, 0xa877), .driver_info = (unsigned long) &ax88772_info, }, { // Asus USB Ethernet Adapter USB_DEVICE (0x0b95, 0x7e2b), .driver_info = (unsigned long)&ax88772b_info, }, { /* ASIX 88172a demo board */ USB_DEVICE(0x0b95, 0x172a), .driver_info = (unsigned long) &ax88172a_info, }, { /* * USBLINK HG20F9 "USB 2.0 LAN" * Appears to have gazumped Linksys's manufacturer ID but * doesn't (yet) conflict with any known Linksys product. */ USB_DEVICE(0x066b, 0x20f9), .driver_info = (unsigned long) &hg20f9_info, }, { // Linux Automation GmbH USB 10Base-T1L USB_DEVICE(0x33f7, 0x0004), .driver_info = (unsigned long) &lxausb_t1l_info, }, { }, // END }; MODULE_DEVICE_TABLE(usb, products); static struct usb_driver asix_driver = { .name = DRIVER_NAME, .id_table = products, .probe = usbnet_probe, .suspend = asix_suspend, .resume = asix_resume, .reset_resume = asix_resume, .disconnect = usbnet_disconnect, .supports_autosuspend = 1, .disable_hub_initiated_lpm = 1, }; module_usb_driver(asix_driver); MODULE_AUTHOR("David Hollis"); MODULE_VERSION(DRIVER_VERSION); MODULE_DESCRIPTION("ASIX AX8817X based USB 2.0 Ethernet Devices"); MODULE_LICENSE("GPL"); |
15 14 13 4 4 4 4 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 | // SPDX-License-Identifier: GPL-2.0 /* * USB Serial Converter Bus specific functions * * Copyright (C) 2002 Greg Kroah-Hartman (greg@kroah.com) */ #include <linux/kernel.h> #include <linux/errno.h> #include <linux/tty.h> #include <linux/slab.h> #include <linux/module.h> #include <linux/usb.h> #include <linux/usb/serial.h> static int usb_serial_device_match(struct device *dev, struct device_driver *drv) { const struct usb_serial_port *port = to_usb_serial_port(dev); struct usb_serial_driver *driver = to_usb_serial_driver(drv); /* * drivers are already assigned to ports in serial_probe so it's * a simple check here. */ if (driver == port->serial->type) return 1; return 0; } static int usb_serial_device_probe(struct device *dev) { struct usb_serial_port *port = to_usb_serial_port(dev); struct usb_serial_driver *driver; struct device *tty_dev; int retval = 0; int minor; /* make sure suspend/resume doesn't race against port_probe */ retval = usb_autopm_get_interface(port->serial->interface); if (retval) return retval; driver = port->serial->type; if (driver->port_probe) { retval = driver->port_probe(port); if (retval) goto err_autopm_put; } minor = port->minor; tty_dev = tty_port_register_device(&port->port, usb_serial_tty_driver, minor, dev); if (IS_ERR(tty_dev)) { retval = PTR_ERR(tty_dev); goto err_port_remove; } usb_autopm_put_interface(port->serial->interface); dev_info(&port->serial->dev->dev, "%s converter now attached to ttyUSB%d\n", driver->description, minor); return 0; err_port_remove: if (driver->port_remove) driver->port_remove(port); err_autopm_put: usb_autopm_put_interface(port->serial->interface); return retval; } static void usb_serial_device_remove(struct device *dev) { struct usb_serial_port *port = to_usb_serial_port(dev); struct usb_serial_driver *driver; int minor; int autopm_err; /* * Make sure suspend/resume doesn't race against port_remove. * * Note that no further runtime PM callbacks will be made if * autopm_get fails. */ autopm_err = usb_autopm_get_interface(port->serial->interface); minor = port->minor; tty_unregister_device(usb_serial_tty_driver, minor); driver = port->serial->type; if (driver->port_remove) driver->port_remove(port); dev_info(dev, "%s converter now disconnected from ttyUSB%d\n", driver->description, minor); if (!autopm_err) usb_autopm_put_interface(port->serial->interface); } static ssize_t new_id_store(struct device_driver *driver, const char *buf, size_t count) { struct usb_serial_driver *usb_drv = to_usb_serial_driver(driver); ssize_t retval = usb_store_new_id(&usb_drv->dynids, usb_drv->id_table, driver, buf, count); if (retval >= 0 && usb_drv->usb_driver != NULL) retval = usb_store_new_id(&usb_drv->usb_driver->dynids, usb_drv->usb_driver->id_table, &usb_drv->usb_driver->driver, buf, count); return retval; } static ssize_t new_id_show(struct device_driver *driver, char *buf) { struct usb_serial_driver *usb_drv = to_usb_serial_driver(driver); return usb_show_dynids(&usb_drv->dynids, buf); } static DRIVER_ATTR_RW(new_id); static struct attribute *usb_serial_drv_attrs[] = { &driver_attr_new_id.attr, NULL, }; ATTRIBUTE_GROUPS(usb_serial_drv); static void free_dynids(struct usb_serial_driver *drv) { struct usb_dynid *dynid, *n; spin_lock(&drv->dynids.lock); list_for_each_entry_safe(dynid, n, &drv->dynids.list, node) { list_del(&dynid->node); kfree(dynid); } spin_unlock(&drv->dynids.lock); } const struct bus_type usb_serial_bus_type = { .name = "usb-serial", .match = usb_serial_device_match, .probe = usb_serial_device_probe, .remove = usb_serial_device_remove, .drv_groups = usb_serial_drv_groups, }; int usb_serial_bus_register(struct usb_serial_driver *driver) { int retval; driver->driver.bus = &usb_serial_bus_type; spin_lock_init(&driver->dynids.lock); INIT_LIST_HEAD(&driver->dynids.list); retval = driver_register(&driver->driver); return retval; } void usb_serial_bus_deregister(struct usb_serial_driver *driver) { free_dynids(driver); driver_unregister(&driver->driver); } |
2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_DMA_MAPPING_H #define _LINUX_DMA_MAPPING_H #include <linux/cache.h> #include <linux/sizes.h> #include <linux/string.h> #include <linux/device.h> #include <linux/err.h> #include <linux/dma-direction.h> #include <linux/scatterlist.h> #include <linux/bug.h> #include <linux/mem_encrypt.h> /** * List of possible attributes associated with a DMA mapping. The semantics * of each attribute should be defined in Documentation/core-api/dma-attributes.rst. */ /* * DMA_ATTR_WEAK_ORDERING: Specifies that reads and writes to the mapping * may be weakly ordered, that is that reads and writes may pass each other. */ #define DMA_ATTR_WEAK_ORDERING (1UL << 1) /* * DMA_ATTR_WRITE_COMBINE: Specifies that writes to the mapping may be * buffered to improve performance. */ #define DMA_ATTR_WRITE_COMBINE (1UL << 2) /* * DMA_ATTR_NO_KERNEL_MAPPING: Lets the platform to avoid creating a kernel * virtual mapping for the allocated buffer. */ #define DMA_ATTR_NO_KERNEL_MAPPING (1UL << 4) /* * DMA_ATTR_SKIP_CPU_SYNC: Allows platform code to skip synchronization of * the CPU cache for the given buffer assuming that it has been already * transferred to 'device' domain. */ #define DMA_ATTR_SKIP_CPU_SYNC (1UL << 5) /* * DMA_ATTR_FORCE_CONTIGUOUS: Forces contiguous allocation of the buffer * in physical memory. */ #define DMA_ATTR_FORCE_CONTIGUOUS (1UL << 6) /* * DMA_ATTR_ALLOC_SINGLE_PAGES: This is a hint to the DMA-mapping subsystem * that it's probably not worth the time to try to allocate memory to in a way * that gives better TLB efficiency. */ #define DMA_ATTR_ALLOC_SINGLE_PAGES (1UL << 7) /* * DMA_ATTR_NO_WARN: This tells the DMA-mapping subsystem to suppress * allocation failure reports (similarly to __GFP_NOWARN). */ #define DMA_ATTR_NO_WARN (1UL << 8) /* * DMA_ATTR_PRIVILEGED: used to indicate that the buffer is fully * accessible at an elevated privilege level (and ideally inaccessible or * at least read-only at lesser-privileged levels). */ #define DMA_ATTR_PRIVILEGED (1UL << 9) /* * A dma_addr_t can hold any valid DMA or bus address for the platform. It can * be given to a device to use as a DMA source or target. It is specific to a * given device and there may be a translation between the CPU physical address * space and the bus address space. * * DMA_MAPPING_ERROR is the magic error code if a mapping failed. It should not * be used directly in drivers, but checked for using dma_mapping_error() * instead. */ #define DMA_MAPPING_ERROR (~(dma_addr_t)0) #define DMA_BIT_MASK(n) (((n) == 64) ? ~0ULL : ((1ULL<<(n))-1)) #ifdef CONFIG_DMA_API_DEBUG void debug_dma_mapping_error(struct device *dev, dma_addr_t dma_addr); void debug_dma_map_single(struct device *dev, const void *addr, unsigned long len); #else static inline void debug_dma_mapping_error(struct device *dev, dma_addr_t dma_addr) { } static inline void debug_dma_map_single(struct device *dev, const void *addr, unsigned long len) { } #endif /* CONFIG_DMA_API_DEBUG */ #ifdef CONFIG_HAS_DMA static inline int dma_mapping_error(struct device *dev, dma_addr_t dma_addr) { debug_dma_mapping_error(dev, dma_addr); if (unlikely(dma_addr == DMA_MAPPING_ERROR)) return -ENOMEM; return 0; } dma_addr_t dma_map_page_attrs(struct device *dev, struct page *page, size_t offset, size_t size, enum dma_data_direction dir, unsigned long attrs); void dma_unmap_page_attrs(struct device *dev, dma_addr_t addr, size_t size, enum dma_data_direction dir, unsigned long attrs); unsigned int dma_map_sg_attrs(struct device *dev, struct scatterlist *sg, int nents, enum dma_data_direction dir, unsigned long attrs); void dma_unmap_sg_attrs(struct device *dev, struct scatterlist *sg, int nents, enum dma_data_direction dir, unsigned long attrs); int dma_map_sgtable(struct device *dev, struct sg_table *sgt, enum dma_data_direction dir, unsigned long attrs); dma_addr_t dma_map_resource(struct device *dev, phys_addr_t phys_addr, size_t size, enum dma_data_direction dir, unsigned long attrs); void dma_unmap_resource(struct device *dev, dma_addr_t addr, size_t size, enum dma_data_direction dir, unsigned long attrs); void dma_sync_single_for_cpu(struct device *dev, dma_addr_t addr, size_t size, enum dma_data_direction dir); void dma_sync_single_for_device(struct device *dev, dma_addr_t addr, size_t size, enum dma_data_direction dir); void dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg, int nelems, enum dma_data_direction dir); void dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg, int nelems, enum dma_data_direction dir); void *dma_alloc_attrs(struct device *dev, size_t size, dma_addr_t *dma_handle, gfp_t flag, unsigned long attrs); void dma_free_attrs(struct device *dev, size_t size, void *cpu_addr, dma_addr_t dma_handle, unsigned long attrs); void *dmam_alloc_attrs(struct device *dev, size_t size, dma_addr_t *dma_handle, gfp_t gfp, unsigned long attrs); void dmam_free_coherent(struct device *dev, size_t size, void *vaddr, dma_addr_t dma_handle); int dma_get_sgtable_attrs(struct device *dev, struct sg_table *sgt, void *cpu_addr, dma_addr_t dma_addr, size_t size, unsigned long attrs); int dma_mmap_attrs(struct device *dev, struct vm_area_struct *vma, void *cpu_addr, dma_addr_t dma_addr, size_t size, unsigned long attrs); bool dma_can_mmap(struct device *dev); bool dma_pci_p2pdma_supported(struct device *dev); int dma_set_mask(struct device *dev, u64 mask); int dma_set_coherent_mask(struct device *dev, u64 mask); u64 dma_get_required_mask(struct device *dev); bool dma_addressing_limited(struct device *dev); size_t dma_max_mapping_size(struct device *dev); size_t dma_opt_mapping_size(struct device *dev); bool dma_need_sync(struct device *dev, dma_addr_t dma_addr); unsigned long dma_get_merge_boundary(struct device *dev); struct sg_table *dma_alloc_noncontiguous(struct device *dev, size_t size, enum dma_data_direction dir, gfp_t gfp, unsigned long attrs); void dma_free_noncontiguous(struct device *dev, size_t size, struct sg_table *sgt, enum dma_data_direction dir); void *dma_vmap_noncontiguous(struct device *dev, size_t size, struct sg_table *sgt); void dma_vunmap_noncontiguous(struct device *dev, void *vaddr); int dma_mmap_noncontiguous(struct device *dev, struct vm_area_struct *vma, size_t size, struct sg_table *sgt); #else /* CONFIG_HAS_DMA */ static inline dma_addr_t dma_map_page_attrs(struct device *dev, struct page *page, size_t offset, size_t size, enum dma_data_direction dir, unsigned long attrs) { return DMA_MAPPING_ERROR; } static inline void dma_unmap_page_attrs(struct device *dev, dma_addr_t addr, size_t size, enum dma_data_direction dir, unsigned long attrs) { } static inline unsigned int dma_map_sg_attrs(struct device *dev, struct scatterlist *sg, int nents, enum dma_data_direction dir, unsigned long attrs) { return 0; } static inline void dma_unmap_sg_attrs(struct device *dev, struct scatterlist *sg, int nents, enum dma_data_direction dir, unsigned long attrs) { } static inline int dma_map_sgtable(struct device *dev, struct sg_table *sgt, enum dma_data_direction dir, unsigned long attrs) { return -EOPNOTSUPP; } static inline dma_addr_t dma_map_resource(struct device *dev, phys_addr_t phys_addr, size_t size, enum dma_data_direction dir, unsigned long attrs) { return DMA_MAPPING_ERROR; } static inline void dma_unmap_resource(struct device *dev, dma_addr_t addr, size_t size, enum dma_data_direction dir, unsigned long attrs) { } static inline void dma_sync_single_for_cpu(struct device *dev, dma_addr_t addr, size_t size, enum dma_data_direction dir) { } static inline void dma_sync_single_for_device(struct device *dev, dma_addr_t addr, size_t size, enum dma_data_direction dir) { } static inline void dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg, int nelems, enum dma_data_direction dir) { } static inline void dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg, int nelems, enum dma_data_direction dir) { } static inline int dma_mapping_error(struct device *dev, dma_addr_t dma_addr) { return -ENOMEM; } static inline void *dma_alloc_attrs(struct device *dev, size_t size, dma_addr_t *dma_handle, gfp_t flag, unsigned long attrs) { return NULL; } static void dma_free_attrs(struct device *dev, size_t size, void *cpu_addr, dma_addr_t dma_handle, unsigned long attrs) { } static inline void *dmam_alloc_attrs(struct device *dev, size_t size, dma_addr_t *dma_handle, gfp_t gfp, unsigned long attrs) { return NULL; } static inline void dmam_free_coherent(struct device *dev, size_t size, void *vaddr, dma_addr_t dma_handle) { } static inline int dma_get_sgtable_attrs(struct device *dev, struct sg_table *sgt, void *cpu_addr, dma_addr_t dma_addr, size_t size, unsigned long attrs) { return -ENXIO; } static inline int dma_mmap_attrs(struct device *dev, struct vm_area_struct *vma, void *cpu_addr, dma_addr_t dma_addr, size_t size, unsigned long attrs) { return -ENXIO; } static inline bool dma_can_mmap(struct device *dev) { return false; } static inline bool dma_pci_p2pdma_supported(struct device *dev) { return false; } static inline int dma_set_mask(struct device *dev, u64 mask) { return -EIO; } static inline int dma_set_coherent_mask(struct device *dev, u64 mask) { return -EIO; } static inline u64 dma_get_required_mask(struct device *dev) { return 0; } static inline bool dma_addressing_limited(struct device *dev) { return false; } static inline size_t dma_max_mapping_size(struct device *dev) { return 0; } static inline size_t dma_opt_mapping_size(struct device *dev) { return 0; } static inline bool dma_need_sync(struct device *dev, dma_addr_t dma_addr) { return false; } static inline unsigned long dma_get_merge_boundary(struct device *dev) { return 0; } static inline struct sg_table *dma_alloc_noncontiguous(struct device *dev, size_t size, enum dma_data_direction dir, gfp_t gfp, unsigned long attrs) { return NULL; } static inline void dma_free_noncontiguous(struct device *dev, size_t size, struct sg_table *sgt, enum dma_data_direction dir) { } static inline void *dma_vmap_noncontiguous(struct device *dev, size_t size, struct sg_table *sgt) { return NULL; } static inline void dma_vunmap_noncontiguous(struct device *dev, void *vaddr) { } static inline int dma_mmap_noncontiguous(struct device *dev, struct vm_area_struct *vma, size_t size, struct sg_table *sgt) { return -EINVAL; } #endif /* CONFIG_HAS_DMA */ struct page *dma_alloc_pages(struct device *dev, size_t size, dma_addr_t *dma_handle, enum dma_data_direction dir, gfp_t gfp); void dma_free_pages(struct device *dev, size_t size, struct page *page, dma_addr_t dma_handle, enum dma_data_direction dir); int dma_mmap_pages(struct device *dev, struct vm_area_struct *vma, size_t size, struct page *page); static inline void *dma_alloc_noncoherent(struct device *dev, size_t size, dma_addr_t *dma_handle, enum dma_data_direction dir, gfp_t gfp) { struct page *page = dma_alloc_pages(dev, size, dma_handle, dir, gfp); return page ? page_address(page) : NULL; } static inline void dma_free_noncoherent(struct device *dev, size_t size, void *vaddr, dma_addr_t dma_handle, enum dma_data_direction dir) { dma_free_pages(dev, size, virt_to_page(vaddr), dma_handle, dir); } static inline dma_addr_t dma_map_single_attrs(struct device *dev, void *ptr, size_t size, enum dma_data_direction dir, unsigned long attrs) { /* DMA must never operate on areas that might be remapped. */ if (dev_WARN_ONCE(dev, is_vmalloc_addr(ptr), "rejecting DMA map of vmalloc memory\n")) return DMA_MAPPING_ERROR; debug_dma_map_single(dev, ptr, size); return dma_map_page_attrs(dev, virt_to_page(ptr), offset_in_page(ptr), size, dir, attrs); } static inline void dma_unmap_single_attrs(struct device *dev, dma_addr_t addr, size_t size, enum dma_data_direction dir, unsigned long attrs) { return dma_unmap_page_attrs(dev, addr, size, dir, attrs); } static inline void dma_sync_single_range_for_cpu(struct device *dev, dma_addr_t addr, unsigned long offset, size_t size, enum dma_data_direction dir) { return dma_sync_single_for_cpu(dev, addr + offset, size, dir); } static inline void dma_sync_single_range_for_device(struct device *dev, dma_addr_t addr, unsigned long offset, size_t size, enum dma_data_direction dir) { return dma_sync_single_for_device(dev, addr + offset, size, dir); } /** * dma_unmap_sgtable - Unmap the given buffer for DMA * @dev: The device for which to perform the DMA operation * @sgt: The sg_table object describing the buffer * @dir: DMA direction * @attrs: Optional DMA attributes for the unmap operation * * Unmaps a buffer described by a scatterlist stored in the given sg_table * object for the @dir DMA operation by the @dev device. After this function * the ownership of the buffer is transferred back to the CPU domain. */ static inline void dma_unmap_sgtable(struct device *dev, struct sg_table *sgt, enum dma_data_direction dir, unsigned long attrs) { dma_unmap_sg_attrs(dev, sgt->sgl, sgt->orig_nents, dir, attrs); } /** * dma_sync_sgtable_for_cpu - Synchronize the given buffer for CPU access * @dev: The device for which to perform the DMA operation * @sgt: The sg_table object describing the buffer * @dir: DMA direction * * Performs the needed cache synchronization and moves the ownership of the * buffer back to the CPU domain, so it is safe to perform any access to it * by the CPU. Before doing any further DMA operations, one has to transfer * the ownership of the buffer back to the DMA domain by calling the * dma_sync_sgtable_for_device(). */ static inline void dma_sync_sgtable_for_cpu(struct device *dev, struct sg_table *sgt, enum dma_data_direction dir) { dma_sync_sg_for_cpu(dev, sgt->sgl, sgt->orig_nents, dir); } /** * dma_sync_sgtable_for_device - Synchronize the given buffer for DMA * @dev: The device for which to perform the DMA operation * @sgt: The sg_table object describing the buffer * @dir: DMA direction * * Performs the needed cache synchronization and moves the ownership of the * buffer back to the DMA domain, so it is safe to perform the DMA operation. * Once finished, one has to call dma_sync_sgtable_for_cpu() or * dma_unmap_sgtable(). */ static inline void dma_sync_sgtable_for_device(struct device *dev, struct sg_table *sgt, enum dma_data_direction dir) { dma_sync_sg_for_device(dev, sgt->sgl, sgt->orig_nents, dir); } #define dma_map_single(d, a, s, r) dma_map_single_attrs(d, a, s, r, 0) #define dma_unmap_single(d, a, s, r) dma_unmap_single_attrs(d, a, s, r, 0) #define dma_map_sg(d, s, n, r) dma_map_sg_attrs(d, s, n, r, 0) #define dma_unmap_sg(d, s, n, r) dma_unmap_sg_attrs(d, s, n, r, 0) #define dma_map_page(d, p, o, s, r) dma_map_page_attrs(d, p, o, s, r, 0) #define dma_unmap_page(d, a, s, r) dma_unmap_page_attrs(d, a, s, r, 0) #define dma_get_sgtable(d, t, v, h, s) dma_get_sgtable_attrs(d, t, v, h, s, 0) #define dma_mmap_coherent(d, v, c, h, s) dma_mmap_attrs(d, v, c, h, s, 0) bool dma_coherent_ok(struct device *dev, phys_addr_t phys, size_t size); static inline void *dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle, gfp_t gfp) { return dma_alloc_attrs(dev, size, dma_handle, gfp, (gfp & __GFP_NOWARN) ? DMA_ATTR_NO_WARN : 0); } static inline void dma_free_coherent(struct device *dev, size_t size, void *cpu_addr, dma_addr_t dma_handle) { return dma_free_attrs(dev, size, cpu_addr, dma_handle, 0); } static inline u64 dma_get_mask(struct device *dev) { if (dev->dma_mask && *dev->dma_mask) return *dev->dma_mask; return DMA_BIT_MASK(32); } /* * Set both the DMA mask and the coherent DMA mask to the same thing. * Note that we don't check the return value from dma_set_coherent_mask() * as the DMA API guarantees that the coherent DMA mask can be set to * the same or smaller than the streaming DMA mask. */ static inline int dma_set_mask_and_coherent(struct device *dev, u64 mask) { int rc = dma_set_mask(dev, mask); if (rc == 0) dma_set_coherent_mask(dev, mask); return rc; } /* * Similar to the above, except it deals with the case where the device * does not have dev->dma_mask appropriately setup. */ static inline int dma_coerce_mask_and_coherent(struct device *dev, u64 mask) { dev->dma_mask = &dev->coherent_dma_mask; return dma_set_mask_and_coherent(dev, mask); } static inline unsigned int dma_get_max_seg_size(struct device *dev) { if (dev->dma_parms && dev->dma_parms->max_segment_size) return dev->dma_parms->max_segment_size; return SZ_64K; } static inline int dma_set_max_seg_size(struct device *dev, unsigned int size) { if (dev->dma_parms) { dev->dma_parms->max_segment_size = size; return 0; } return -EIO; } static inline unsigned long dma_get_seg_boundary(struct device *dev) { if (dev->dma_parms && dev->dma_parms->segment_boundary_mask) return dev->dma_parms->segment_boundary_mask; return ULONG_MAX; } /** * dma_get_seg_boundary_nr_pages - return the segment boundary in "page" units * @dev: device to guery the boundary for * @page_shift: ilog() of the IOMMU page size * * Return the segment boundary in IOMMU page units (which may be different from * the CPU page size) for the passed in device. * * If @dev is NULL a boundary of U32_MAX is assumed, this case is just for * non-DMA API callers. */ static inline unsigned long dma_get_seg_boundary_nr_pages(struct device *dev, unsigned int page_shift) { if (!dev) return (U32_MAX >> page_shift) + 1; return (dma_get_seg_boundary(dev) >> page_shift) + 1; } static inline int dma_set_seg_boundary(struct device *dev, unsigned long mask) { if (dev->dma_parms) { dev->dma_parms->segment_boundary_mask = mask; return 0; } return -EIO; } static inline unsigned int dma_get_min_align_mask(struct device *dev) { if (dev->dma_parms) return dev->dma_parms->min_align_mask; return 0; } static inline int dma_set_min_align_mask(struct device *dev, unsigned int min_align_mask) { if (WARN_ON_ONCE(!dev->dma_parms)) return -EIO; dev->dma_parms->min_align_mask = min_align_mask; return 0; } #ifndef dma_get_cache_alignment static inline int dma_get_cache_alignment(void) { #ifdef ARCH_HAS_DMA_MINALIGN return ARCH_DMA_MINALIGN; #endif return 1; } #endif static inline void *dmam_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle, gfp_t gfp) { return dmam_alloc_attrs(dev, size, dma_handle, gfp, (gfp & __GFP_NOWARN) ? DMA_ATTR_NO_WARN : 0); } static inline void *dma_alloc_wc(struct device *dev, size_t size, dma_addr_t *dma_addr, gfp_t gfp) { unsigned long attrs = DMA_ATTR_WRITE_COMBINE; if (gfp & __GFP_NOWARN) attrs |= DMA_ATTR_NO_WARN; return dma_alloc_attrs(dev, size, dma_addr, gfp, attrs); } static inline void dma_free_wc(struct device *dev, size_t size, void *cpu_addr, dma_addr_t dma_addr) { return dma_free_attrs(dev, size, cpu_addr, dma_addr, DMA_ATTR_WRITE_COMBINE); } static inline int dma_mmap_wc(struct device *dev, struct vm_area_struct *vma, void *cpu_addr, dma_addr_t dma_addr, size_t size) { return dma_mmap_attrs(dev, vma, cpu_addr, dma_addr, size, DMA_ATTR_WRITE_COMBINE); } #ifdef CONFIG_NEED_DMA_MAP_STATE #define DEFINE_DMA_UNMAP_ADDR(ADDR_NAME) dma_addr_t ADDR_NAME #define DEFINE_DMA_UNMAP_LEN(LEN_NAME) __u32 LEN_NAME #define dma_unmap_addr(PTR, ADDR_NAME) ((PTR)->ADDR_NAME) #define dma_unmap_addr_set(PTR, ADDR_NAME, VAL) (((PTR)->ADDR_NAME) = (VAL)) #define dma_unmap_len(PTR, LEN_NAME) ((PTR)->LEN_NAME) #define dma_unmap_len_set(PTR, LEN_NAME, VAL) (((PTR)->LEN_NAME) = (VAL)) #else #define DEFINE_DMA_UNMAP_ADDR(ADDR_NAME) #define DEFINE_DMA_UNMAP_LEN(LEN_NAME) #define dma_unmap_addr(PTR, ADDR_NAME) (0) #define dma_unmap_addr_set(PTR, ADDR_NAME, VAL) do { } while (0) #define dma_unmap_len(PTR, LEN_NAME) (0) #define dma_unmap_len_set(PTR, LEN_NAME, VAL) do { } while (0) #endif #endif /* _LINUX_DMA_MAPPING_H */ |
7 1 1 1 4 4 4 2 1 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 | // SPDX-License-Identifier: GPL-2.0-only /* * ebt_ip * * Authors: * Bart De Schuymer <bdschuym@pandora.be> * * April, 2002 * * Changes: * added ip-sport and ip-dport * Innominate Security Technologies AG <mhopf@innominate.com> * September, 2002 */ #include <linux/ip.h> #include <net/ip.h> #include <linux/in.h> #include <linux/module.h> #include <linux/netfilter/x_tables.h> #include <linux/netfilter_bridge/ebtables.h> #include <linux/netfilter_bridge/ebt_ip.h> union pkthdr { struct { __be16 src; __be16 dst; } tcpudphdr; struct { u8 type; u8 code; } icmphdr; struct { u8 type; } igmphdr; }; static bool ebt_ip_mt(const struct sk_buff *skb, struct xt_action_param *par) { const struct ebt_ip_info *info = par->matchinfo; const struct iphdr *ih; struct iphdr _iph; const union pkthdr *pptr; union pkthdr _pkthdr; ih = skb_header_pointer(skb, 0, sizeof(_iph), &_iph); if (ih == NULL) return false; if ((info->bitmask & EBT_IP_TOS) && NF_INVF(info, EBT_IP_TOS, info->tos != ih->tos)) return false; if ((info->bitmask & EBT_IP_SOURCE) && NF_INVF(info, EBT_IP_SOURCE, (ih->saddr & info->smsk) != info->saddr)) return false; if ((info->bitmask & EBT_IP_DEST) && NF_INVF(info, EBT_IP_DEST, (ih->daddr & info->dmsk) != info->daddr)) return false; if (info->bitmask & EBT_IP_PROTO) { if (NF_INVF(info, EBT_IP_PROTO, info->protocol != ih->protocol)) return false; if (!(info->bitmask & (EBT_IP_DPORT | EBT_IP_SPORT | EBT_IP_ICMP | EBT_IP_IGMP))) return true; if (ntohs(ih->frag_off) & IP_OFFSET) return false; /* min icmp/igmp headersize is 4, so sizeof(_pkthdr) is ok. */ pptr = skb_header_pointer(skb, ih->ihl*4, sizeof(_pkthdr), &_pkthdr); if (pptr == NULL) return false; if (info->bitmask & EBT_IP_DPORT) { u32 dst = ntohs(pptr->tcpudphdr.dst); if (NF_INVF(info, EBT_IP_DPORT, dst < info->dport[0] || dst > info->dport[1])) return false; } if (info->bitmask & EBT_IP_SPORT) { u32 src = ntohs(pptr->tcpudphdr.src); if (NF_INVF(info, EBT_IP_SPORT, src < info->sport[0] || src > info->sport[1])) return false; } if ((info->bitmask & EBT_IP_ICMP) && NF_INVF(info, EBT_IP_ICMP, pptr->icmphdr.type < info->icmp_type[0] || pptr->icmphdr.type > info->icmp_type[1] || pptr->icmphdr.code < info->icmp_code[0] || pptr->icmphdr.code > info->icmp_code[1])) return false; if ((info->bitmask & EBT_IP_IGMP) && NF_INVF(info, EBT_IP_IGMP, pptr->igmphdr.type < info->igmp_type[0] || pptr->igmphdr.type > info->igmp_type[1])) return false; } return true; } static int ebt_ip_mt_check(const struct xt_mtchk_param *par) { const struct ebt_ip_info *info = par->matchinfo; const struct ebt_entry *e = par->entryinfo; if (e->ethproto != htons(ETH_P_IP) || e->invflags & EBT_IPROTO) return -EINVAL; if (info->bitmask & ~EBT_IP_MASK || info->invflags & ~EBT_IP_MASK) return -EINVAL; if (info->bitmask & (EBT_IP_DPORT | EBT_IP_SPORT)) { if (info->invflags & EBT_IP_PROTO) return -EINVAL; if (info->protocol != IPPROTO_TCP && info->protocol != IPPROTO_UDP && info->protocol != IPPROTO_UDPLITE && info->protocol != IPPROTO_SCTP && info->protocol != IPPROTO_DCCP) return -EINVAL; } if (info->bitmask & EBT_IP_DPORT && info->dport[0] > info->dport[1]) return -EINVAL; if (info->bitmask & EBT_IP_SPORT && info->sport[0] > info->sport[1]) return -EINVAL; if (info->bitmask & EBT_IP_ICMP) { if ((info->invflags & EBT_IP_PROTO) || info->protocol != IPPROTO_ICMP) return -EINVAL; if (info->icmp_type[0] > info->icmp_type[1] || info->icmp_code[0] > info->icmp_code[1]) return -EINVAL; } if (info->bitmask & EBT_IP_IGMP) { if ((info->invflags & EBT_IP_PROTO) || info->protocol != IPPROTO_IGMP) return -EINVAL; if (info->igmp_type[0] > info->igmp_type[1]) return -EINVAL; } return 0; } static struct xt_match ebt_ip_mt_reg __read_mostly = { .name = "ip", .revision = 0, .family = NFPROTO_BRIDGE, .match = ebt_ip_mt, .checkentry = ebt_ip_mt_check, .matchsize = sizeof(struct ebt_ip_info), .me = THIS_MODULE, }; static int __init ebt_ip_init(void) { return xt_register_match(&ebt_ip_mt_reg); } static void __exit ebt_ip_fini(void) { xt_unregister_match(&ebt_ip_mt_reg); } module_init(ebt_ip_init); module_exit(ebt_ip_fini); MODULE_DESCRIPTION("Ebtables: IPv4 protocol packet match"); MODULE_LICENSE("GPL"); |
5 475 500 244 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_TTY_H #define _LINUX_TTY_H #include <linux/fs.h> #include <linux/major.h> #include <linux/termios.h> #include <linux/workqueue.h> #include <linux/tty_buffer.h> #include <linux/tty_driver.h> #include <linux/tty_ldisc.h> #include <linux/tty_port.h> #include <linux/mutex.h> #include <linux/tty_flags.h> #include <uapi/linux/tty.h> #include <linux/rwsem.h> #include <linux/llist.h> /* * (Note: the *_driver.minor_start values 1, 64, 128, 192 are * hardcoded at present.) */ #define NR_UNIX98_PTY_DEFAULT 4096 /* Default maximum for Unix98 ptys */ #define NR_UNIX98_PTY_RESERVE 1024 /* Default reserve for main devpts */ #define NR_UNIX98_PTY_MAX (1 << MINORBITS) /* Absolute limit */ /* * This character is the same as _POSIX_VDISABLE: it cannot be used as * a c_cc[] character, but indicates that a particular special character * isn't in use (eg VINTR has no character etc) */ #define __DISABLED_CHAR '\0' #define INTR_CHAR(tty) ((tty)->termios.c_cc[VINTR]) #define QUIT_CHAR(tty) ((tty)->termios.c_cc[VQUIT]) #define ERASE_CHAR(tty) ((tty)->termios.c_cc[VERASE]) #define KILL_CHAR(tty) ((tty)->termios.c_cc[VKILL]) #define EOF_CHAR(tty) ((tty)->termios.c_cc[VEOF]) #define TIME_CHAR(tty) ((tty)->termios.c_cc[VTIME]) #define MIN_CHAR(tty) ((tty)->termios.c_cc[VMIN]) #define SWTC_CHAR(tty) ((tty)->termios.c_cc[VSWTC]) #define START_CHAR(tty) ((tty)->termios.c_cc[VSTART]) #define STOP_CHAR(tty) ((tty)->termios.c_cc[VSTOP]) #define SUSP_CHAR(tty) ((tty)->termios.c_cc[VSUSP]) #define EOL_CHAR(tty) ((tty)->termios.c_cc[VEOL]) #define REPRINT_CHAR(tty) ((tty)->termios.c_cc[VREPRINT]) #define DISCARD_CHAR(tty) ((tty)->termios.c_cc[VDISCARD]) #define WERASE_CHAR(tty) ((tty)->termios.c_cc[VWERASE]) #define LNEXT_CHAR(tty) ((tty)->termios.c_cc[VLNEXT]) #define EOL2_CHAR(tty) ((tty)->termios.c_cc[VEOL2]) #define _I_FLAG(tty, f) ((tty)->termios.c_iflag & (f)) #define _O_FLAG(tty, f) ((tty)->termios.c_oflag & (f)) #define _C_FLAG(tty, f) ((tty)->termios.c_cflag & (f)) #define _L_FLAG(tty, f) ((tty)->termios.c_lflag & (f)) #define I_IGNBRK(tty) _I_FLAG((tty), IGNBRK) #define I_BRKINT(tty) _I_FLAG((tty), BRKINT) #define I_IGNPAR(tty) _I_FLAG((tty), IGNPAR) #define I_PARMRK(tty) _I_FLAG((tty), PARMRK) #define I_INPCK(tty) _I_FLAG((tty), INPCK) #define I_ISTRIP(tty) _I_FLAG((tty), ISTRIP) #define I_INLCR(tty) _I_FLAG((tty), INLCR) #define I_IGNCR(tty) _I_FLAG((tty), IGNCR) #define I_ICRNL(tty) _I_FLAG((tty), ICRNL) #define I_IUCLC(tty) _I_FLAG((tty), IUCLC) #define I_IXON(tty) _I_FLAG((tty), IXON) #define I_IXANY(tty) _I_FLAG((tty), IXANY) #define I_IXOFF(tty) _I_FLAG((tty), IXOFF) #define I_IMAXBEL(tty) _I_FLAG((tty), IMAXBEL) #define I_IUTF8(tty) _I_FLAG((tty), IUTF8) #define O_OPOST(tty) _O_FLAG((tty), OPOST) #define O_OLCUC(tty) _O_FLAG((tty), OLCUC) #define O_ONLCR(tty) _O_FLAG((tty), ONLCR) #define O_OCRNL(tty) _O_FLAG((tty), OCRNL) #define O_ONOCR(tty) _O_FLAG((tty), ONOCR) #define O_ONLRET(tty) _O_FLAG((tty), ONLRET) #define O_OFILL(tty) _O_FLAG((tty), OFILL) #define O_OFDEL(tty) _O_FLAG((tty), OFDEL) #define O_NLDLY(tty) _O_FLAG((tty), NLDLY) #define O_CRDLY(tty) _O_FLAG((tty), CRDLY) #define O_TABDLY(tty) _O_FLAG((tty), TABDLY) #define O_BSDLY(tty) _O_FLAG((tty), BSDLY) #define O_VTDLY(tty) _O_FLAG((tty), VTDLY) #define O_FFDLY(tty) _O_FLAG((tty), FFDLY) #define C_BAUD(tty) _C_FLAG((tty), CBAUD) #define C_CSIZE(tty) _C_FLAG((tty), CSIZE) #define C_CSTOPB(tty) _C_FLAG((tty), CSTOPB) #define C_CREAD(tty) _C_FLAG((tty), CREAD) #define C_PARENB(tty) _C_FLAG((tty), PARENB) #define C_PARODD(tty) _C_FLAG((tty), PARODD) #define C_HUPCL(tty) _C_FLAG((tty), HUPCL) #define C_CLOCAL(tty) _C_FLAG((tty), CLOCAL) #define C_CIBAUD(tty) _C_FLAG((tty), CIBAUD) #define C_CRTSCTS(tty) _C_FLAG((tty), CRTSCTS) #define C_CMSPAR(tty) _C_FLAG((tty), CMSPAR) #define L_ISIG(tty) _L_FLAG((tty), ISIG) #define L_ICANON(tty) _L_FLAG((tty), ICANON) #define L_XCASE(tty) _L_FLAG((tty), XCASE) #define L_ECHO(tty) _L_FLAG((tty), ECHO) #define L_ECHOE(tty) _L_FLAG((tty), ECHOE) #define L_ECHOK(tty) _L_FLAG((tty), ECHOK) #define L_ECHONL(tty) _L_FLAG((tty), ECHONL) #define L_NOFLSH(tty) _L_FLAG((tty), NOFLSH) #define L_TOSTOP(tty) _L_FLAG((tty), TOSTOP) #define L_ECHOCTL(tty) _L_FLAG((tty), ECHOCTL) #define L_ECHOPRT(tty) _L_FLAG((tty), ECHOPRT) #define L_ECHOKE(tty) _L_FLAG((tty), ECHOKE) #define L_FLUSHO(tty) _L_FLAG((tty), FLUSHO) #define L_PENDIN(tty) _L_FLAG((tty), PENDIN) #define L_IEXTEN(tty) _L_FLAG((tty), IEXTEN) #define L_EXTPROC(tty) _L_FLAG((tty), EXTPROC) struct device; struct signal_struct; struct tty_operations; /** * struct tty_struct - state associated with a tty while open * * @kref: reference counting by tty_kref_get() and tty_kref_put(), reaching zero * frees the structure * @dev: class device or %NULL (e.g. ptys, serdev) * @driver: &struct tty_driver operating this tty * @ops: &struct tty_operations of @driver for this tty (open, close, etc.) * @index: index of this tty (e.g. to construct @name like tty12) * @ldisc_sem: protects line discipline changes (@ldisc) -- lock tty not pty * @ldisc: the current line discipline for this tty (n_tty by default) * @atomic_write_lock: protects against concurrent writers, i.e. locks * @write_cnt, @write_buf and similar * @legacy_mutex: leftover from history (BKL -> BTM -> @legacy_mutex), * protecting several operations on this tty * @throttle_mutex: protects against concurrent tty_throttle_safe() and * tty_unthrottle_safe() (but not tty_unthrottle()) * @termios_rwsem: protects @termios and @termios_locked * @winsize_mutex: protects @winsize * @termios: termios for the current tty, copied from/to @driver.termios * @termios_locked: locked termios (by %TIOCGLCKTRMIOS and %TIOCSLCKTRMIOS * ioctls) * @name: name of the tty constructed by tty_line_name() (e.g. ttyS3) * @flags: bitwise OR of %TTY_THROTTLED, %TTY_IO_ERROR, ... * @count: count of open processes, reaching zero cancels all the work for * this tty and drops a @kref too (but does not free this tty) * @winsize: size of the terminal "window" (cf. @winsize_mutex) * @flow: flow settings grouped together, see also @flow.unused * @flow.lock: lock for @flow members * @flow.stopped: tty stopped/started by stop_tty()/start_tty() * @flow.tco_stopped: tty stopped/started by %TCOOFF/%TCOON ioctls (it has * precedence over @flow.stopped) * @flow.unused: alignment for Alpha, so that no members other than @flow.* are * modified by the same 64b word store. The @flow's __aligned is * there for the very same reason. * @ctrl: control settings grouped together, see also @ctrl.unused * @ctrl.lock: lock for @ctrl members * @ctrl.pgrp: process group of this tty (setpgrp(2)) * @ctrl.session: session of this tty (setsid(2)). Writes are protected by both * @ctrl.lock and @legacy_mutex, readers must use at least one of * them. * @ctrl.pktstatus: packet mode status (bitwise OR of %TIOCPKT_ constants) * @ctrl.packet: packet mode enabled * @ctrl.unused: alignment for Alpha, see @flow.unused for explanation * @hw_stopped: not controlled by the tty layer, under @driver's control for CTS * handling * @receive_room: bytes permitted to feed to @ldisc without any being lost * @flow_change: controls behavior of throttling, see tty_throttle_safe() and * tty_unthrottle_safe() * @link: link to another pty (master -> slave and vice versa) * @fasync: state for %O_ASYNC (for %SIGIO); managed by fasync_helper() * @write_wait: concurrent writers are waiting in this queue until they are * allowed to write * @read_wait: readers wait for data in this queue * @hangup_work: normally a work to perform a hangup (do_tty_hangup()); while * freeing the tty, (re)used to release_one_tty() * @disc_data: pointer to @ldisc's private data (e.g. to &struct n_tty_data) * @driver_data: pointer to @driver's private data (e.g. &struct uart_state) * @files_lock: protects @tty_files list * @tty_files: list of (re)openers of this tty (i.e. linked &struct * tty_file_private) * @closing: when set during close, n_tty processes only START & STOP chars * @write_buf: temporary buffer used during tty_write() to copy user data to * @write_cnt: count of bytes written in tty_write() to @write_buf * @SAK_work: if the tty has a pending do_SAK, it is queued here * @port: persistent storage for this device (i.e. &struct tty_port) * * All of the state associated with a tty while the tty is open. Persistent * storage for tty devices is referenced here as @port and is documented in * &struct tty_port. */ struct tty_struct { struct kref kref; int index; struct device *dev; struct tty_driver *driver; struct tty_port *port; const struct tty_operations *ops; struct tty_ldisc *ldisc; struct ld_semaphore ldisc_sem; struct mutex atomic_write_lock; struct mutex legacy_mutex; struct mutex throttle_mutex; struct rw_semaphore termios_rwsem; struct mutex winsize_mutex; struct ktermios termios, termios_locked; char name[64]; unsigned long flags; int count; unsigned int receive_room; struct winsize winsize; struct { spinlock_t lock; bool stopped; bool tco_stopped; unsigned long unused[0]; } __aligned(sizeof(unsigned long)) flow; struct { struct pid *pgrp; struct pid *session; spinlock_t lock; unsigned char pktstatus; bool packet; unsigned long unused[0]; } __aligned(sizeof(unsigned long)) ctrl; bool hw_stopped; bool closing; int flow_change; struct tty_struct *link; struct fasync_struct *fasync; wait_queue_head_t write_wait; wait_queue_head_t read_wait; struct work_struct hangup_work; void *disc_data; void *driver_data; spinlock_t files_lock; int write_cnt; u8 *write_buf; struct list_head tty_files; #define N_TTY_BUF_SIZE 4096 struct work_struct SAK_work; } __randomize_layout; /* Each of a tty's open files has private_data pointing to tty_file_private */ struct tty_file_private { struct tty_struct *tty; struct file *file; struct list_head list; }; /** * DOC: TTY Struct Flags * * These bits are used in the :c:member:`tty_struct.flags` field. * * So that interrupts won't be able to mess up the queues, * copy_to_cooked must be atomic with respect to itself, as must * tty->write. Thus, you must use the inline functions set_bit() and * clear_bit() to make things atomic. * * TTY_THROTTLED * Driver input is throttled. The ldisc should call * :c:member:`tty_driver.unthrottle()` in order to resume reception when * it is ready to process more data (at threshold min). * * TTY_IO_ERROR * If set, causes all subsequent userspace read/write calls on the tty to * fail, returning -%EIO. (May be no ldisc too.) * * TTY_OTHER_CLOSED * Device is a pty and the other side has closed. * * TTY_EXCLUSIVE * Exclusive open mode (a single opener). * * TTY_DO_WRITE_WAKEUP * If set, causes the driver to call the * :c:member:`tty_ldisc_ops.write_wakeup()` method in order to resume * transmission when it can accept more data to transmit. * * TTY_LDISC_OPEN * Indicates that a line discipline is open. For debugging purposes only. * * TTY_PTY_LOCK * A flag private to pty code to implement %TIOCSPTLCK/%TIOCGPTLCK logic. * * TTY_NO_WRITE_SPLIT * Prevent driver from splitting up writes into smaller chunks (preserve * write boundaries to driver). * * TTY_HUPPED * The TTY was hung up. This is set post :c:member:`tty_driver.hangup()`. * * TTY_HUPPING * The TTY is in the process of hanging up to abort potential readers. * * TTY_LDISC_CHANGING * Line discipline for this TTY is being changed. I/O should not block * when this is set. Use tty_io_nonblock() to check. * * TTY_LDISC_HALTED * Line discipline for this TTY was stopped. No work should be queued to * this ldisc. */ #define TTY_THROTTLED 0 #define TTY_IO_ERROR 1 #define TTY_OTHER_CLOSED 2 #define TTY_EXCLUSIVE 3 #define TTY_DO_WRITE_WAKEUP 5 #define TTY_LDISC_OPEN 11 #define TTY_PTY_LOCK 16 #define TTY_NO_WRITE_SPLIT 17 #define TTY_HUPPED 18 #define TTY_HUPPING 19 #define TTY_LDISC_CHANGING 20 #define TTY_LDISC_HALTED 22 static inline bool tty_io_nonblock(struct tty_struct *tty, struct file *file) { return file->f_flags & O_NONBLOCK || test_bit(TTY_LDISC_CHANGING, &tty->flags); } static inline bool tty_io_error(struct tty_struct *tty) { return test_bit(TTY_IO_ERROR, &tty->flags); } static inline bool tty_throttled(struct tty_struct *tty) { return test_bit(TTY_THROTTLED, &tty->flags); } #ifdef CONFIG_TTY void tty_kref_put(struct tty_struct *tty); struct pid *tty_get_pgrp(struct tty_struct *tty); void tty_vhangup_self(void); void disassociate_ctty(int priv); dev_t tty_devnum(struct tty_struct *tty); void proc_clear_tty(struct task_struct *p); struct tty_struct *get_current_tty(void); /* tty_io.c */ int __init tty_init(void); const char *tty_name(const struct tty_struct *tty); struct tty_struct *tty_kopen_exclusive(dev_t device); struct tty_struct *tty_kopen_shared(dev_t device); void tty_kclose(struct tty_struct *tty); int tty_dev_name_to_number(const char *name, dev_t *number); #else static inline void tty_kref_put(struct tty_struct *tty) { } static inline struct pid *tty_get_pgrp(struct tty_struct *tty) { return NULL; } static inline void tty_vhangup_self(void) { } static inline void disassociate_ctty(int priv) { } static inline dev_t tty_devnum(struct tty_struct *tty) { return 0; } static inline void proc_clear_tty(struct task_struct *p) { } static inline struct tty_struct *get_current_tty(void) { return NULL; } /* tty_io.c */ static inline int __init tty_init(void) { return 0; } static inline const char *tty_name(const struct tty_struct *tty) { return "(none)"; } static inline struct tty_struct *tty_kopen_exclusive(dev_t device) { return ERR_PTR(-ENODEV); } static inline void tty_kclose(struct tty_struct *tty) { } static inline int tty_dev_name_to_number(const char *name, dev_t *number) { return -ENOTSUPP; } #endif extern struct ktermios tty_std_termios; int vcs_init(void); extern const struct class tty_class; /** * tty_kref_get - get a tty reference * @tty: tty device * * Returns: a new reference to a tty object * * Locking: The caller must hold sufficient locks/counts to ensure that their * existing reference cannot go away. */ static inline struct tty_struct *tty_kref_get(struct tty_struct *tty) { if (tty) kref_get(&tty->kref); return tty; } const char *tty_driver_name(const struct tty_struct *tty); void tty_wait_until_sent(struct tty_struct *tty, long timeout); void stop_tty(struct tty_struct *tty); void start_tty(struct tty_struct *tty); void tty_write_message(struct tty_struct *tty, char *msg); int tty_send_xchar(struct tty_struct *tty, u8 ch); int tty_put_char(struct tty_struct *tty, u8 c); unsigned int tty_chars_in_buffer(struct tty_struct *tty); unsigned int tty_write_room(struct tty_struct *tty); void tty_driver_flush_buffer(struct tty_struct *tty); void tty_unthrottle(struct tty_struct *tty); bool tty_throttle_safe(struct tty_struct *tty); bool tty_unthrottle_safe(struct tty_struct *tty); int tty_do_resize(struct tty_struct *tty, struct winsize *ws); int tty_get_icount(struct tty_struct *tty, struct serial_icounter_struct *icount); int tty_get_tiocm(struct tty_struct *tty); int is_current_pgrp_orphaned(void); void tty_hangup(struct tty_struct *tty); void tty_vhangup(struct tty_struct *tty); int tty_hung_up_p(struct file *filp); void do_SAK(struct tty_struct *tty); void __do_SAK(struct tty_struct *tty); void no_tty(void); speed_t tty_termios_baud_rate(const struct ktermios *termios); void tty_termios_encode_baud_rate(struct ktermios *termios, speed_t ibaud, speed_t obaud); void tty_encode_baud_rate(struct tty_struct *tty, speed_t ibaud, speed_t obaud); /** * tty_get_baud_rate - get tty bit rates * @tty: tty to query * * Returns: the baud rate as an integer for this terminal * * Locking: The termios lock must be held by the caller. */ static inline speed_t tty_get_baud_rate(const struct tty_struct *tty) { return tty_termios_baud_rate(&tty->termios); } unsigned char tty_get_char_size(unsigned int cflag); unsigned char tty_get_frame_size(unsigned int cflag); void tty_termios_copy_hw(struct ktermios *new, const struct ktermios *old); bool tty_termios_hw_change(const struct ktermios *a, const struct ktermios *b); int tty_set_termios(struct tty_struct *tty, struct ktermios *kt); void tty_wakeup(struct tty_struct *tty); int tty_mode_ioctl(struct tty_struct *tty, unsigned int cmd, unsigned long arg); int tty_perform_flush(struct tty_struct *tty, unsigned long arg); struct tty_struct *tty_init_dev(struct tty_driver *driver, int idx); void tty_release_struct(struct tty_struct *tty, int idx); void tty_init_termios(struct tty_struct *tty); void tty_save_termios(struct tty_struct *tty); int tty_standard_install(struct tty_driver *driver, struct tty_struct *tty); extern struct mutex tty_mutex; /* n_tty.c */ void n_tty_inherit_ops(struct tty_ldisc_ops *ops); #ifdef CONFIG_TTY void __init n_tty_init(void); #else static inline void n_tty_init(void) { } #endif /* tty_audit.c */ #ifdef CONFIG_AUDIT void tty_audit_exit(void); void tty_audit_fork(struct signal_struct *sig); int tty_audit_push(void); #else static inline void tty_audit_exit(void) { } static inline void tty_audit_fork(struct signal_struct *sig) { } static inline int tty_audit_push(void) { return 0; } #endif /* tty_ioctl.c */ int n_tty_ioctl_helper(struct tty_struct *tty, unsigned int cmd, unsigned long arg); /* vt.c */ int vt_ioctl(struct tty_struct *tty, unsigned int cmd, unsigned long arg); long vt_compat_ioctl(struct tty_struct *tty, unsigned int cmd, unsigned long arg); /* tty_mutex.c */ /* functions for preparation of BKL removal */ void tty_lock(struct tty_struct *tty); int tty_lock_interruptible(struct tty_struct *tty); void tty_unlock(struct tty_struct *tty); void tty_lock_slave(struct tty_struct *tty); void tty_unlock_slave(struct tty_struct *tty); void tty_set_lock_subclass(struct tty_struct *tty); #endif |
30 30 13 17 6 5 29 31 5 5 25 25 20 20 34 7 12 6 10 20 13 8 32 3 2 2 20 3 9 8 12 12 7 1 5 1 52 7 45 45 2 7 39 21 5 1 15 1 14 8 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 | // SPDX-License-Identifier: GPL-2.0-only /* * fs/eventfd.c * * Copyright (C) 2007 Davide Libenzi <davidel@xmailserver.org> * */ #include <linux/file.h> #include <linux/poll.h> #include <linux/init.h> #include <linux/fs.h> #include <linux/sched/signal.h> #include <linux/kernel.h> #include <linux/slab.h> #include <linux/list.h> #include <linux/spinlock.h> #include <linux/anon_inodes.h> #include <linux/syscalls.h> #include <linux/export.h> #include <linux/kref.h> #include <linux/eventfd.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/idr.h> #include <linux/uio.h> static DEFINE_IDA(eventfd_ida); struct eventfd_ctx { struct kref kref; wait_queue_head_t wqh; /* * Every time that a write(2) is performed on an eventfd, the * value of the __u64 being written is added to "count" and a * wakeup is performed on "wqh". If EFD_SEMAPHORE flag was not * specified, a read(2) will return the "count" value to userspace, * and will reset "count" to zero. The kernel side eventfd_signal() * also, adds to the "count" counter and issue a wakeup. */ __u64 count; unsigned int flags; int id; }; /** * eventfd_signal_mask - Increment the event counter * @ctx: [in] Pointer to the eventfd context. * @mask: [in] poll mask * * This function is supposed to be called by the kernel in paths that do not * allow sleeping. In this function we allow the counter to reach the ULLONG_MAX * value, and we signal this as overflow condition by returning a EPOLLERR * to poll(2). */ void eventfd_signal_mask(struct eventfd_ctx *ctx, __poll_t mask) { unsigned long flags; /* * Deadlock or stack overflow issues can happen if we recurse here * through waitqueue wakeup handlers. If the caller users potentially * nested waitqueues with custom wakeup handlers, then it should * check eventfd_signal_allowed() before calling this function. If * it returns false, the eventfd_signal() call should be deferred to a * safe context. */ if (WARN_ON_ONCE(current->in_eventfd)) return; spin_lock_irqsave(&ctx->wqh.lock, flags); current->in_eventfd = 1; if (ctx->count < ULLONG_MAX) ctx->count++; if (waitqueue_active(&ctx->wqh)) wake_up_locked_poll(&ctx->wqh, EPOLLIN | mask); current->in_eventfd = 0; spin_unlock_irqrestore(&ctx->wqh.lock, flags); } EXPORT_SYMBOL_GPL(eventfd_signal_mask); static void eventfd_free_ctx(struct eventfd_ctx *ctx) { if (ctx->id >= 0) ida_free(&eventfd_ida, ctx->id); kfree(ctx); } static void eventfd_free(struct kref *kref) { struct eventfd_ctx *ctx = container_of(kref, struct eventfd_ctx, kref); eventfd_free_ctx(ctx); } /** * eventfd_ctx_put - Releases a reference to the internal eventfd context. * @ctx: [in] Pointer to eventfd context. * * The eventfd context reference must have been previously acquired either * with eventfd_ctx_fdget() or eventfd_ctx_fileget(). */ void eventfd_ctx_put(struct eventfd_ctx *ctx) { kref_put(&ctx->kref, eventfd_free); } EXPORT_SYMBOL_GPL(eventfd_ctx_put); static int eventfd_release(struct inode *inode, struct file *file) { struct eventfd_ctx *ctx = file->private_data; wake_up_poll(&ctx->wqh, EPOLLHUP); eventfd_ctx_put(ctx); return 0; } static __poll_t eventfd_poll(struct file *file, poll_table *wait) { struct eventfd_ctx *ctx = file->private_data; __poll_t events = 0; u64 count; poll_wait(file, &ctx->wqh, wait); /* * All writes to ctx->count occur within ctx->wqh.lock. This read * can be done outside ctx->wqh.lock because we know that poll_wait * takes that lock (through add_wait_queue) if our caller will sleep. * * The read _can_ therefore seep into add_wait_queue's critical * section, but cannot move above it! add_wait_queue's spin_lock acts * as an acquire barrier and ensures that the read be ordered properly * against the writes. The following CAN happen and is safe: * * poll write * ----------------- ------------ * lock ctx->wqh.lock (in poll_wait) * count = ctx->count * __add_wait_queue * unlock ctx->wqh.lock * lock ctx->qwh.lock * ctx->count += n * if (waitqueue_active) * wake_up_locked_poll * unlock ctx->qwh.lock * eventfd_poll returns 0 * * but the following, which would miss a wakeup, cannot happen: * * poll write * ----------------- ------------ * count = ctx->count (INVALID!) * lock ctx->qwh.lock * ctx->count += n * **waitqueue_active is false** * **no wake_up_locked_poll!** * unlock ctx->qwh.lock * lock ctx->wqh.lock (in poll_wait) * __add_wait_queue * unlock ctx->wqh.lock * eventfd_poll returns 0 */ count = READ_ONCE(ctx->count); if (count > 0) events |= EPOLLIN; if (count == ULLONG_MAX) events |= EPOLLERR; if (ULLONG_MAX - 1 > count) events |= EPOLLOUT; return events; } void eventfd_ctx_do_read(struct eventfd_ctx *ctx, __u64 *cnt) { lockdep_assert_held(&ctx->wqh.lock); *cnt = ((ctx->flags & EFD_SEMAPHORE) && ctx->count) ? 1 : ctx->count; ctx->count -= *cnt; } EXPORT_SYMBOL_GPL(eventfd_ctx_do_read); /** * eventfd_ctx_remove_wait_queue - Read the current counter and removes wait queue. * @ctx: [in] Pointer to eventfd context. * @wait: [in] Wait queue to be removed. * @cnt: [out] Pointer to the 64-bit counter value. * * Returns %0 if successful, or the following error codes: * * -EAGAIN : The operation would have blocked. * * This is used to atomically remove a wait queue entry from the eventfd wait * queue head, and read/reset the counter value. */ int eventfd_ctx_remove_wait_queue(struct eventfd_ctx *ctx, wait_queue_entry_t *wait, __u64 *cnt) { unsigned long flags; spin_lock_irqsave(&ctx->wqh.lock, flags); eventfd_ctx_do_read(ctx, cnt); __remove_wait_queue(&ctx->wqh, wait); if (*cnt != 0 && waitqueue_active(&ctx->wqh)) wake_up_locked_poll(&ctx->wqh, EPOLLOUT); spin_unlock_irqrestore(&ctx->wqh.lock, flags); return *cnt != 0 ? 0 : -EAGAIN; } EXPORT_SYMBOL_GPL(eventfd_ctx_remove_wait_queue); static ssize_t eventfd_read(struct kiocb *iocb, struct iov_iter *to) { struct file *file = iocb->ki_filp; struct eventfd_ctx *ctx = file->private_data; __u64 ucnt = 0; if (iov_iter_count(to) < sizeof(ucnt)) return -EINVAL; spin_lock_irq(&ctx->wqh.lock); if (!ctx->count) { if ((file->f_flags & O_NONBLOCK) || (iocb->ki_flags & IOCB_NOWAIT)) { spin_unlock_irq(&ctx->wqh.lock); return -EAGAIN; } if (wait_event_interruptible_locked_irq(ctx->wqh, ctx->count)) { spin_unlock_irq(&ctx->wqh.lock); return -ERESTARTSYS; } } eventfd_ctx_do_read(ctx, &ucnt); current->in_eventfd = 1; if (waitqueue_active(&ctx->wqh)) wake_up_locked_poll(&ctx->wqh, EPOLLOUT); current->in_eventfd = 0; spin_unlock_irq(&ctx->wqh.lock); if (unlikely(copy_to_iter(&ucnt, sizeof(ucnt), to) != sizeof(ucnt))) return -EFAULT; return sizeof(ucnt); } static ssize_t eventfd_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { struct eventfd_ctx *ctx = file->private_data; ssize_t res; __u64 ucnt; if (count < sizeof(ucnt)) return -EINVAL; if (copy_from_user(&ucnt, buf, sizeof(ucnt))) return -EFAULT; if (ucnt == ULLONG_MAX) return -EINVAL; spin_lock_irq(&ctx->wqh.lock); res = -EAGAIN; if (ULLONG_MAX - ctx->count > ucnt) res = sizeof(ucnt); else if (!(file->f_flags & O_NONBLOCK)) { res = wait_event_interruptible_locked_irq(ctx->wqh, ULLONG_MAX - ctx->count > ucnt); if (!res) res = sizeof(ucnt); } if (likely(res > 0)) { ctx->count += ucnt; current->in_eventfd = 1; if (waitqueue_active(&ctx->wqh)) wake_up_locked_poll(&ctx->wqh, EPOLLIN); current->in_eventfd = 0; } spin_unlock_irq(&ctx->wqh.lock); return res; } #ifdef CONFIG_PROC_FS static void eventfd_show_fdinfo(struct seq_file *m, struct file *f) { struct eventfd_ctx *ctx = f->private_data; spin_lock_irq(&ctx->wqh.lock); seq_printf(m, "eventfd-count: %16llx\n", (unsigned long long)ctx->count); spin_unlock_irq(&ctx->wqh.lock); seq_printf(m, "eventfd-id: %d\n", ctx->id); seq_printf(m, "eventfd-semaphore: %d\n", !!(ctx->flags & EFD_SEMAPHORE)); } #endif static const struct file_operations eventfd_fops = { #ifdef CONFIG_PROC_FS .show_fdinfo = eventfd_show_fdinfo, #endif .release = eventfd_release, .poll = eventfd_poll, .read_iter = eventfd_read, .write = eventfd_write, .llseek = noop_llseek, }; /** * eventfd_fget - Acquire a reference of an eventfd file descriptor. * @fd: [in] Eventfd file descriptor. * * Returns a pointer to the eventfd file structure in case of success, or the * following error pointer: * * -EBADF : Invalid @fd file descriptor. * -EINVAL : The @fd file descriptor is not an eventfd file. */ struct file *eventfd_fget(int fd) { struct file *file; file = fget(fd); if (!file) return ERR_PTR(-EBADF); if (file->f_op != &eventfd_fops) { fput(file); return ERR_PTR(-EINVAL); } return file; } EXPORT_SYMBOL_GPL(eventfd_fget); /** * eventfd_ctx_fdget - Acquires a reference to the internal eventfd context. * @fd: [in] Eventfd file descriptor. * * Returns a pointer to the internal eventfd context, otherwise the error * pointers returned by the following functions: * * eventfd_fget */ struct eventfd_ctx *eventfd_ctx_fdget(int fd) { struct eventfd_ctx *ctx; struct fd f = fdget(fd); if (!f.file) return ERR_PTR(-EBADF); ctx = eventfd_ctx_fileget(f.file); fdput(f); return ctx; } EXPORT_SYMBOL_GPL(eventfd_ctx_fdget); /** * eventfd_ctx_fileget - Acquires a reference to the internal eventfd context. * @file: [in] Eventfd file pointer. * * Returns a pointer to the internal eventfd context, otherwise the error * pointer: * * -EINVAL : The @fd file descriptor is not an eventfd file. */ struct eventfd_ctx *eventfd_ctx_fileget(struct file *file) { struct eventfd_ctx *ctx; if (file->f_op != &eventfd_fops) return ERR_PTR(-EINVAL); ctx = file->private_data; kref_get(&ctx->kref); return ctx; } EXPORT_SYMBOL_GPL(eventfd_ctx_fileget); static int do_eventfd(unsigned int count, int flags) { struct eventfd_ctx *ctx; struct file *file; int fd; /* Check the EFD_* constants for consistency. */ BUILD_BUG_ON(EFD_CLOEXEC != O_CLOEXEC); BUILD_BUG_ON(EFD_NONBLOCK != O_NONBLOCK); if (flags & ~EFD_FLAGS_SET) return -EINVAL; ctx = kmalloc(sizeof(*ctx), GFP_KERNEL); if (!ctx) return -ENOMEM; kref_init(&ctx->kref); init_waitqueue_head(&ctx->wqh); ctx->count = count; ctx->flags = flags; ctx->id = ida_alloc(&eventfd_ida, GFP_KERNEL); flags &= EFD_SHARED_FCNTL_FLAGS; flags |= O_RDWR; fd = get_unused_fd_flags(flags); if (fd < 0) goto err; file = anon_inode_getfile("[eventfd]", &eventfd_fops, ctx, flags); if (IS_ERR(file)) { put_unused_fd(fd); fd = PTR_ERR(file); goto err; } file->f_mode |= FMODE_NOWAIT; fd_install(fd, file); return fd; err: eventfd_free_ctx(ctx); return fd; } SYSCALL_DEFINE2(eventfd2, unsigned int, count, int, flags) { return do_eventfd(count, flags); } SYSCALL_DEFINE1(eventfd, unsigned int, count) { return do_eventfd(count, 0); } |
149 16 28 46 46 5 5 20 20 20 20 20 20 16 6 8 12 20 20 20 24 18 20 18 18 17 6 8 8 19 1 36 30 25 25 1 6 1 37 36 12 24 24 151 121 28 2 149 149 148 1 42 5 3 38 48 38 139 10 575 606 576 76 72 4 56 38 38 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 | // SPDX-License-Identifier: GPL-2.0-or-later /* Request a key from userspace * * Copyright (C) 2004-2007 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) * * See Documentation/security/keys/request-key.rst */ #include <linux/export.h> #include <linux/sched.h> #include <linux/kmod.h> #include <linux/err.h> #include <linux/keyctl.h> #include <linux/slab.h> #include <net/net_namespace.h> #include "internal.h" #include <keys/request_key_auth-type.h> #define key_negative_timeout 60 /* default timeout on a negative key's existence */ static struct key *check_cached_key(struct keyring_search_context *ctx) { #ifdef CONFIG_KEYS_REQUEST_CACHE struct key *key = current->cached_requested_key; if (key && ctx->match_data.cmp(key, &ctx->match_data) && !(key->flags & ((1 << KEY_FLAG_INVALIDATED) | (1 << KEY_FLAG_REVOKED)))) return key_get(key); #endif return NULL; } static void cache_requested_key(struct key *key) { #ifdef CONFIG_KEYS_REQUEST_CACHE struct task_struct *t = current; /* Do not cache key if it is a kernel thread */ if (!(t->flags & PF_KTHREAD)) { key_put(t->cached_requested_key); t->cached_requested_key = key_get(key); set_tsk_thread_flag(t, TIF_NOTIFY_RESUME); } #endif } /** * complete_request_key - Complete the construction of a key. * @authkey: The authorisation key. * @error: The success or failute of the construction. * * Complete the attempt to construct a key. The key will be negated * if an error is indicated. The authorisation key will be revoked * unconditionally. */ void complete_request_key(struct key *authkey, int error) { struct request_key_auth *rka = get_request_key_auth(authkey); struct key *key = rka->target_key; kenter("%d{%d},%d", authkey->serial, key->serial, error); if (error < 0) key_negate_and_link(key, key_negative_timeout, NULL, authkey); else key_revoke(authkey); } EXPORT_SYMBOL(complete_request_key); /* * Initialise a usermode helper that is going to have a specific session * keyring. * * This is called in context of freshly forked kthread before kernel_execve(), * so we can simply install the desired session_keyring at this point. */ static int umh_keys_init(struct subprocess_info *info, struct cred *cred) { struct key *keyring = info->data; return install_session_keyring_to_cred(cred, keyring); } /* * Clean up a usermode helper with session keyring. */ static void umh_keys_cleanup(struct subprocess_info *info) { struct key *keyring = info->data; key_put(keyring); } /* * Call a usermode helper with a specific session keyring. */ static int call_usermodehelper_keys(const char *path, char **argv, char **envp, struct key *session_keyring, int wait) { struct subprocess_info *info; info = call_usermodehelper_setup(path, argv, envp, GFP_KERNEL, umh_keys_init, umh_keys_cleanup, session_keyring); if (!info) return -ENOMEM; key_get(session_keyring); return call_usermodehelper_exec(info, wait); } /* * Request userspace finish the construction of a key * - execute "/sbin/request-key <op> <key> <uid> <gid> <keyring> <keyring> <keyring>" */ static int call_sbin_request_key(struct key *authkey, void *aux) { static char const request_key[] = "/sbin/request-key"; struct request_key_auth *rka = get_request_key_auth(authkey); const struct cred *cred = current_cred(); key_serial_t prkey, sskey; struct key *key = rka->target_key, *keyring, *session, *user_session; char *argv[9], *envp[3], uid_str[12], gid_str[12]; char key_str[12], keyring_str[3][12]; char desc[20]; int ret, i; kenter("{%d},{%d},%s", key->serial, authkey->serial, rka->op); ret = look_up_user_keyrings(NULL, &user_session); if (ret < 0) goto error_us; /* allocate a new session keyring */ sprintf(desc, "_req.%u", key->serial); cred = get_current_cred(); keyring = keyring_alloc(desc, cred->fsuid, cred->fsgid, cred, KEY_POS_ALL | KEY_USR_VIEW | KEY_USR_READ, KEY_ALLOC_QUOTA_OVERRUN, NULL, NULL); put_cred(cred); if (IS_ERR(keyring)) { ret = PTR_ERR(keyring); goto error_alloc; } /* attach the auth key to the session keyring */ ret = key_link(keyring, authkey); if (ret < 0) goto error_link; /* record the UID and GID */ sprintf(uid_str, "%d", from_kuid(&init_user_ns, cred->fsuid)); sprintf(gid_str, "%d", from_kgid(&init_user_ns, cred->fsgid)); /* we say which key is under construction */ sprintf(key_str, "%d", key->serial); /* we specify the process's default keyrings */ sprintf(keyring_str[0], "%d", cred->thread_keyring ? cred->thread_keyring->serial : 0); prkey = 0; if (cred->process_keyring) prkey = cred->process_keyring->serial; sprintf(keyring_str[1], "%d", prkey); session = cred->session_keyring; if (!session) session = user_session; sskey = session->serial; sprintf(keyring_str[2], "%d", sskey); /* set up a minimal environment */ i = 0; envp[i++] = "HOME=/"; envp[i++] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin"; envp[i] = NULL; /* set up the argument list */ i = 0; argv[i++] = (char *)request_key; argv[i++] = (char *)rka->op; argv[i++] = key_str; argv[i++] = uid_str; argv[i++] = gid_str; argv[i++] = keyring_str[0]; argv[i++] = keyring_str[1]; argv[i++] = keyring_str[2]; argv[i] = NULL; /* do it */ ret = call_usermodehelper_keys(request_key, argv, envp, keyring, UMH_WAIT_PROC); kdebug("usermode -> 0x%x", ret); if (ret >= 0) { /* ret is the exit/wait code */ if (test_bit(KEY_FLAG_USER_CONSTRUCT, &key->flags) || key_validate(key) < 0) ret = -ENOKEY; else /* ignore any errors from userspace if the key was * instantiated */ ret = 0; } error_link: key_put(keyring); error_alloc: key_put(user_session); error_us: complete_request_key(authkey, ret); kleave(" = %d", ret); return ret; } /* * Call out to userspace for key construction. * * Program failure is ignored in favour of key status. */ static int construct_key(struct key *key, const void *callout_info, size_t callout_len, void *aux, struct key *dest_keyring) { request_key_actor_t actor; struct key *authkey; int ret; kenter("%d,%p,%zu,%p", key->serial, callout_info, callout_len, aux); /* allocate an authorisation key */ authkey = request_key_auth_new(key, "create", callout_info, callout_len, dest_keyring); if (IS_ERR(authkey)) return PTR_ERR(authkey); /* Make the call */ actor = call_sbin_request_key; if (key->type->request_key) actor = key->type->request_key; ret = actor(authkey, aux); /* check that the actor called complete_request_key() prior to * returning an error */ WARN_ON(ret < 0 && !test_bit(KEY_FLAG_INVALIDATED, &authkey->flags)); key_put(authkey); kleave(" = %d", ret); return ret; } /* * Get the appropriate destination keyring for the request. * * The keyring selected is returned with an extra reference upon it which the * caller must release. */ static int construct_get_dest_keyring(struct key **_dest_keyring) { struct request_key_auth *rka; const struct cred *cred = current_cred(); struct key *dest_keyring = *_dest_keyring, *authkey; int ret; kenter("%p", dest_keyring); /* find the appropriate keyring */ if (dest_keyring) { /* the caller supplied one */ key_get(dest_keyring); } else { bool do_perm_check = true; /* use a default keyring; falling through the cases until we * find one that we actually have */ switch (cred->jit_keyring) { case KEY_REQKEY_DEFL_DEFAULT: case KEY_REQKEY_DEFL_REQUESTOR_KEYRING: if (cred->request_key_auth) { authkey = cred->request_key_auth; down_read(&authkey->sem); rka = get_request_key_auth(authkey); if (!test_bit(KEY_FLAG_REVOKED, &authkey->flags)) dest_keyring = key_get(rka->dest_keyring); up_read(&authkey->sem); if (dest_keyring) { do_perm_check = false; break; } } fallthrough; case KEY_REQKEY_DEFL_THREAD_KEYRING: dest_keyring = key_get(cred->thread_keyring); if (dest_keyring) break; fallthrough; case KEY_REQKEY_DEFL_PROCESS_KEYRING: dest_keyring = key_get(cred->process_keyring); if (dest_keyring) break; fallthrough; case KEY_REQKEY_DEFL_SESSION_KEYRING: dest_keyring = key_get(cred->session_keyring); if (dest_keyring) break; fallthrough; case KEY_REQKEY_DEFL_USER_SESSION_KEYRING: ret = look_up_user_keyrings(NULL, &dest_keyring); if (ret < 0) return ret; break; case KEY_REQKEY_DEFL_USER_KEYRING: ret = look_up_user_keyrings(&dest_keyring, NULL); if (ret < 0) return ret; break; case KEY_REQKEY_DEFL_GROUP_KEYRING: default: BUG(); } /* * Require Write permission on the keyring. This is essential * because the default keyring may be the session keyring, and * joining a keyring only requires Search permission. * * However, this check is skipped for the "requestor keyring" so * that /sbin/request-key can itself use request_key() to add * keys to the original requestor's destination keyring. */ if (dest_keyring && do_perm_check) { ret = key_permission(make_key_ref(dest_keyring, 1), KEY_NEED_WRITE); if (ret) { key_put(dest_keyring); return ret; } } } *_dest_keyring = dest_keyring; kleave(" [dk %d]", key_serial(dest_keyring)); return 0; } /* * Allocate a new key in under-construction state and attempt to link it in to * the requested keyring. * * May return a key that's already under construction instead if there was a * race between two thread calling request_key(). */ static int construct_alloc_key(struct keyring_search_context *ctx, struct key *dest_keyring, unsigned long flags, struct key_user *user, struct key **_key) { struct assoc_array_edit *edit = NULL; struct key *key; key_perm_t perm; key_ref_t key_ref; int ret; kenter("%s,%s,,,", ctx->index_key.type->name, ctx->index_key.description); *_key = NULL; mutex_lock(&user->cons_lock); perm = KEY_POS_VIEW | KEY_POS_SEARCH | KEY_POS_LINK | KEY_POS_SETATTR; perm |= KEY_USR_VIEW; if (ctx->index_key.type->read) perm |= KEY_POS_READ; if (ctx->index_key.type == &key_type_keyring || ctx->index_key.type->update) perm |= KEY_POS_WRITE; key = key_alloc(ctx->index_key.type, ctx->index_key.description, ctx->cred->fsuid, ctx->cred->fsgid, ctx->cred, perm, flags, NULL); if (IS_ERR(key)) goto alloc_failed; set_bit(KEY_FLAG_USER_CONSTRUCT, &key->flags); if (dest_keyring) { ret = __key_link_lock(dest_keyring, &key->index_key); if (ret < 0) goto link_lock_failed; } /* * Attach the key to the destination keyring under lock, but we do need * to do another check just in case someone beat us to it whilst we * waited for locks. * * The caller might specify a comparison function which looks for keys * that do not exactly match but are still equivalent from the caller's * perspective. The __key_link_begin() operation must be done only after * an actual key is determined. */ mutex_lock(&key_construction_mutex); rcu_read_lock(); key_ref = search_process_keyrings_rcu(ctx); rcu_read_unlock(); if (!IS_ERR(key_ref)) goto key_already_present; if (dest_keyring) { ret = __key_link_begin(dest_keyring, &key->index_key, &edit); if (ret < 0) goto link_alloc_failed; __key_link(dest_keyring, key, &edit); } mutex_unlock(&key_construction_mutex); if (dest_keyring) __key_link_end(dest_keyring, &key->index_key, edit); mutex_unlock(&user->cons_lock); *_key = key; kleave(" = 0 [%d]", key_serial(key)); return 0; /* the key is now present - we tell the caller that we found it by * returning -EINPROGRESS */ key_already_present: key_put(key); mutex_unlock(&key_construction_mutex); key = key_ref_to_ptr(key_ref); if (dest_keyring) { ret = __key_link_begin(dest_keyring, &key->index_key, &edit); if (ret < 0) goto link_alloc_failed_unlocked; ret = __key_link_check_live_key(dest_keyring, key); if (ret == 0) __key_link(dest_keyring, key, &edit); __key_link_end(dest_keyring, &key->index_key, edit); if (ret < 0) goto link_check_failed; } mutex_unlock(&user->cons_lock); *_key = key; kleave(" = -EINPROGRESS [%d]", key_serial(key)); return -EINPROGRESS; link_check_failed: mutex_unlock(&user->cons_lock); key_put(key); kleave(" = %d [linkcheck]", ret); return ret; link_alloc_failed: mutex_unlock(&key_construction_mutex); link_alloc_failed_unlocked: __key_link_end(dest_keyring, &key->index_key, edit); link_lock_failed: mutex_unlock(&user->cons_lock); key_put(key); kleave(" = %d [prelink]", ret); return ret; alloc_failed: mutex_unlock(&user->cons_lock); kleave(" = %ld", PTR_ERR(key)); return PTR_ERR(key); } /* * Commence key construction. */ static struct key *construct_key_and_link(struct keyring_search_context *ctx, const char *callout_info, size_t callout_len, void *aux, struct key *dest_keyring, unsigned long flags) { struct key_user *user; struct key *key; int ret; kenter(""); if (ctx->index_key.type == &key_type_keyring) return ERR_PTR(-EPERM); ret = construct_get_dest_keyring(&dest_keyring); if (ret) goto error; user = key_user_lookup(current_fsuid()); if (!user) { ret = -ENOMEM; goto error_put_dest_keyring; } ret = construct_alloc_key(ctx, dest_keyring, flags, user, &key); key_user_put(user); if (ret == 0) { ret = construct_key(key, callout_info, callout_len, aux, dest_keyring); if (ret < 0) { kdebug("cons failed"); goto construction_failed; } } else if (ret == -EINPROGRESS) { ret = 0; } else { goto error_put_dest_keyring; } key_put(dest_keyring); kleave(" = key %d", key_serial(key)); return key; construction_failed: key_negate_and_link(key, key_negative_timeout, NULL, NULL); key_put(key); error_put_dest_keyring: key_put(dest_keyring); error: kleave(" = %d", ret); return ERR_PTR(ret); } /** * request_key_and_link - Request a key and cache it in a keyring. * @type: The type of key we want. * @description: The searchable description of the key. * @domain_tag: The domain in which the key operates. * @callout_info: The data to pass to the instantiation upcall (or NULL). * @callout_len: The length of callout_info. * @aux: Auxiliary data for the upcall. * @dest_keyring: Where to cache the key. * @flags: Flags to key_alloc(). * * A key matching the specified criteria (type, description, domain_tag) is * searched for in the process's keyrings and returned with its usage count * incremented if found. Otherwise, if callout_info is not NULL, a key will be * allocated and some service (probably in userspace) will be asked to * instantiate it. * * If successfully found or created, the key will be linked to the destination * keyring if one is provided. * * Returns a pointer to the key if successful; -EACCES, -ENOKEY, -EKEYREVOKED * or -EKEYEXPIRED if an inaccessible, negative, revoked or expired key was * found; -ENOKEY if no key was found and no @callout_info was given; -EDQUOT * if insufficient key quota was available to create a new key; or -ENOMEM if * insufficient memory was available. * * If the returned key was created, then it may still be under construction, * and wait_for_key_construction() should be used to wait for that to complete. */ struct key *request_key_and_link(struct key_type *type, const char *description, struct key_tag *domain_tag, const void *callout_info, size_t callout_len, void *aux, struct key *dest_keyring, unsigned long flags) { struct keyring_search_context ctx = { .index_key.type = type, .index_key.domain_tag = domain_tag, .index_key.description = description, .index_key.desc_len = strlen(description), .cred = current_cred(), .match_data.cmp = key_default_cmp, .match_data.raw_data = description, .match_data.lookup_type = KEYRING_SEARCH_LOOKUP_DIRECT, .flags = (KEYRING_SEARCH_DO_STATE_CHECK | KEYRING_SEARCH_SKIP_EXPIRED | KEYRING_SEARCH_RECURSE), }; struct key *key; key_ref_t key_ref; int ret; kenter("%s,%s,%p,%zu,%p,%p,%lx", ctx.index_key.type->name, ctx.index_key.description, callout_info, callout_len, aux, dest_keyring, flags); if (type->match_preparse) { ret = type->match_preparse(&ctx.match_data); if (ret < 0) { key = ERR_PTR(ret); goto error; } } key = check_cached_key(&ctx); if (key) goto error_free; /* search all the process keyrings for a key */ rcu_read_lock(); key_ref = search_process_keyrings_rcu(&ctx); rcu_read_unlock(); if (!IS_ERR(key_ref)) { if (dest_keyring) { ret = key_task_permission(key_ref, current_cred(), KEY_NEED_LINK); if (ret < 0) { key_ref_put(key_ref); key = ERR_PTR(ret); goto error_free; } } key = key_ref_to_ptr(key_ref); if (dest_keyring) { ret = key_link(dest_keyring, key); if (ret < 0) { key_put(key); key = ERR_PTR(ret); goto error_free; } } /* Only cache the key on immediate success */ cache_requested_key(key); } else if (PTR_ERR(key_ref) != -EAGAIN) { key = ERR_CAST(key_ref); } else { /* the search failed, but the keyrings were searchable, so we * should consult userspace if we can */ key = ERR_PTR(-ENOKEY); if (!callout_info) goto error_free; key = construct_key_and_link(&ctx, callout_info, callout_len, aux, dest_keyring, flags); } error_free: if (type->match_free) type->match_free(&ctx.match_data); error: kleave(" = %p", key); return key; } /** * wait_for_key_construction - Wait for construction of a key to complete * @key: The key being waited for. * @intr: Whether to wait interruptibly. * * Wait for a key to finish being constructed. * * Returns 0 if successful; -ERESTARTSYS if the wait was interrupted; -ENOKEY * if the key was negated; or -EKEYREVOKED or -EKEYEXPIRED if the key was * revoked or expired. */ int wait_for_key_construction(struct key *key, bool intr) { int ret; ret = wait_on_bit(&key->flags, KEY_FLAG_USER_CONSTRUCT, intr ? TASK_INTERRUPTIBLE : TASK_UNINTERRUPTIBLE); if (ret) return -ERESTARTSYS; ret = key_read_state(key); if (ret < 0) return ret; return key_validate(key); } EXPORT_SYMBOL(wait_for_key_construction); /** * request_key_tag - Request a key and wait for construction * @type: Type of key. * @description: The searchable description of the key. * @domain_tag: The domain in which the key operates. * @callout_info: The data to pass to the instantiation upcall (or NULL). * * As for request_key_and_link() except that it does not add the returned key * to a keyring if found, new keys are always allocated in the user's quota, * the callout_info must be a NUL-terminated string and no auxiliary data can * be passed. * * Furthermore, it then works as wait_for_key_construction() to wait for the * completion of keys undergoing construction with a non-interruptible wait. */ struct key *request_key_tag(struct key_type *type, const char *description, struct key_tag *domain_tag, const char *callout_info) { struct key *key; size_t callout_len = 0; int ret; if (callout_info) callout_len = strlen(callout_info); key = request_key_and_link(type, description, domain_tag, callout_info, callout_len, NULL, NULL, KEY_ALLOC_IN_QUOTA); if (!IS_ERR(key)) { ret = wait_for_key_construction(key, false); if (ret < 0) { key_put(key); return ERR_PTR(ret); } } return key; } EXPORT_SYMBOL(request_key_tag); /** * request_key_with_auxdata - Request a key with auxiliary data for the upcaller * @type: The type of key we want. * @description: The searchable description of the key. * @domain_tag: The domain in which the key operates. * @callout_info: The data to pass to the instantiation upcall (or NULL). * @callout_len: The length of callout_info. * @aux: Auxiliary data for the upcall. * * As for request_key_and_link() except that it does not add the returned key * to a keyring if found and new keys are always allocated in the user's quota. * * Furthermore, it then works as wait_for_key_construction() to wait for the * completion of keys undergoing construction with a non-interruptible wait. */ struct key *request_key_with_auxdata(struct key_type *type, const char *description, struct key_tag *domain_tag, const void *callout_info, size_t callout_len, void *aux) { struct key *key; int ret; key = request_key_and_link(type, description, domain_tag, callout_info, callout_len, aux, NULL, KEY_ALLOC_IN_QUOTA); if (!IS_ERR(key)) { ret = wait_for_key_construction(key, false); if (ret < 0) { key_put(key); return ERR_PTR(ret); } } return key; } EXPORT_SYMBOL(request_key_with_auxdata); /** * request_key_rcu - Request key from RCU-read-locked context * @type: The type of key we want. * @description: The name of the key we want. * @domain_tag: The domain in which the key operates. * * Request a key from a context that we may not sleep in (such as RCU-mode * pathwalk). Keys under construction are ignored. * * Return a pointer to the found key if successful, -ENOKEY if we couldn't find * a key or some other error if the key found was unsuitable or inaccessible. */ struct key *request_key_rcu(struct key_type *type, const char *description, struct key_tag *domain_tag) { struct keyring_search_context ctx = { .index_key.type = type, .index_key.domain_tag = domain_tag, .index_key.description = description, .index_key.desc_len = strlen(description), .cred = current_cred(), .match_data.cmp = key_default_cmp, .match_data.raw_data = description, .match_data.lookup_type = KEYRING_SEARCH_LOOKUP_DIRECT, .flags = (KEYRING_SEARCH_DO_STATE_CHECK | KEYRING_SEARCH_SKIP_EXPIRED), }; struct key *key; key_ref_t key_ref; kenter("%s,%s", type->name, description); key = check_cached_key(&ctx); if (key) return key; /* search all the process keyrings for a key */ key_ref = search_process_keyrings_rcu(&ctx); if (IS_ERR(key_ref)) { key = ERR_CAST(key_ref); if (PTR_ERR(key_ref) == -EAGAIN) key = ERR_PTR(-ENOKEY); } else { key = key_ref_to_ptr(key_ref); cache_requested_key(key); } kleave(" = %p", key); return key; } EXPORT_SYMBOL(request_key_rcu); |
1 2 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 | /* * Copyright (c) 2010-2011 Atheros Communications Inc. * * Permission to use, copy, modify, and/or distribute this software for any * purpose with or without fee is hereby granted, provided that the above * copyright notice and this permission notice appear in all copies. * * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ #include "htc.h" static const char *wmi_cmd_to_name(enum wmi_cmd_id wmi_cmd) { switch (wmi_cmd) { case WMI_ECHO_CMDID: return "WMI_ECHO_CMDID"; case WMI_ACCESS_MEMORY_CMDID: return "WMI_ACCESS_MEMORY_CMDID"; case WMI_GET_FW_VERSION: return "WMI_GET_FW_VERSION"; case WMI_DISABLE_INTR_CMDID: return "WMI_DISABLE_INTR_CMDID"; case WMI_ENABLE_INTR_CMDID: return "WMI_ENABLE_INTR_CMDID"; case WMI_ATH_INIT_CMDID: return "WMI_ATH_INIT_CMDID"; case WMI_ABORT_TXQ_CMDID: return "WMI_ABORT_TXQ_CMDID"; case WMI_STOP_TX_DMA_CMDID: return "WMI_STOP_TX_DMA_CMDID"; case WMI_ABORT_TX_DMA_CMDID: return "WMI_ABORT_TX_DMA_CMDID"; case WMI_DRAIN_TXQ_CMDID: return "WMI_DRAIN_TXQ_CMDID"; case WMI_DRAIN_TXQ_ALL_CMDID: return "WMI_DRAIN_TXQ_ALL_CMDID"; case WMI_START_RECV_CMDID: return "WMI_START_RECV_CMDID"; case WMI_STOP_RECV_CMDID: return "WMI_STOP_RECV_CMDID"; case WMI_FLUSH_RECV_CMDID: return "WMI_FLUSH_RECV_CMDID"; case WMI_SET_MODE_CMDID: return "WMI_SET_MODE_CMDID"; case WMI_NODE_CREATE_CMDID: return "WMI_NODE_CREATE_CMDID"; case WMI_NODE_REMOVE_CMDID: return "WMI_NODE_REMOVE_CMDID"; case WMI_VAP_REMOVE_CMDID: return "WMI_VAP_REMOVE_CMDID"; case WMI_VAP_CREATE_CMDID: return "WMI_VAP_CREATE_CMDID"; case WMI_REG_READ_CMDID: return "WMI_REG_READ_CMDID"; case WMI_REG_WRITE_CMDID: return "WMI_REG_WRITE_CMDID"; case WMI_REG_RMW_CMDID: return "WMI_REG_RMW_CMDID"; case WMI_RC_STATE_CHANGE_CMDID: return "WMI_RC_STATE_CHANGE_CMDID"; case WMI_RC_RATE_UPDATE_CMDID: return "WMI_RC_RATE_UPDATE_CMDID"; case WMI_TARGET_IC_UPDATE_CMDID: return "WMI_TARGET_IC_UPDATE_CMDID"; case WMI_TX_AGGR_ENABLE_CMDID: return "WMI_TX_AGGR_ENABLE_CMDID"; case WMI_TGT_DETACH_CMDID: return "WMI_TGT_DETACH_CMDID"; case WMI_NODE_UPDATE_CMDID: return "WMI_NODE_UPDATE_CMDID"; case WMI_INT_STATS_CMDID: return "WMI_INT_STATS_CMDID"; case WMI_TX_STATS_CMDID: return "WMI_TX_STATS_CMDID"; case WMI_RX_STATS_CMDID: return "WMI_RX_STATS_CMDID"; case WMI_BITRATE_MASK_CMDID: return "WMI_BITRATE_MASK_CMDID"; } return "Bogus"; } struct wmi *ath9k_init_wmi(struct ath9k_htc_priv *priv) { struct wmi *wmi; wmi = kzalloc(sizeof(struct wmi), GFP_KERNEL); if (!wmi) return NULL; wmi->drv_priv = priv; wmi->stopped = false; skb_queue_head_init(&wmi->wmi_event_queue); spin_lock_init(&wmi->wmi_lock); spin_lock_init(&wmi->event_lock); mutex_init(&wmi->op_mutex); mutex_init(&wmi->multi_write_mutex); mutex_init(&wmi->multi_rmw_mutex); init_completion(&wmi->cmd_wait); INIT_LIST_HEAD(&wmi->pending_tx_events); tasklet_setup(&wmi->wmi_event_tasklet, ath9k_wmi_event_tasklet); return wmi; } void ath9k_stop_wmi(struct ath9k_htc_priv *priv) { struct wmi *wmi = priv->wmi; mutex_lock(&wmi->op_mutex); wmi->stopped = true; mutex_unlock(&wmi->op_mutex); } void ath9k_destroy_wmi(struct ath9k_htc_priv *priv) { kfree(priv->wmi); } void ath9k_wmi_event_drain(struct ath9k_htc_priv *priv) { unsigned long flags; tasklet_kill(&priv->wmi->wmi_event_tasklet); spin_lock_irqsave(&priv->wmi->wmi_lock, flags); __skb_queue_purge(&priv->wmi->wmi_event_queue); spin_unlock_irqrestore(&priv->wmi->wmi_lock, flags); } void ath9k_wmi_event_tasklet(struct tasklet_struct *t) { struct wmi *wmi = from_tasklet(wmi, t, wmi_event_tasklet); struct ath9k_htc_priv *priv = wmi->drv_priv; struct wmi_cmd_hdr *hdr; void *wmi_event; struct wmi_event_swba *swba; struct sk_buff *skb = NULL; unsigned long flags; u16 cmd_id; do { spin_lock_irqsave(&wmi->wmi_lock, flags); skb = __skb_dequeue(&wmi->wmi_event_queue); if (!skb) { spin_unlock_irqrestore(&wmi->wmi_lock, flags); return; } spin_unlock_irqrestore(&wmi->wmi_lock, flags); hdr = (struct wmi_cmd_hdr *) skb->data; cmd_id = be16_to_cpu(hdr->command_id); wmi_event = skb_pull(skb, sizeof(struct wmi_cmd_hdr)); switch (cmd_id) { case WMI_SWBA_EVENTID: swba = wmi_event; ath9k_htc_swba(priv, swba); break; case WMI_FATAL_EVENTID: ieee80211_queue_work(wmi->drv_priv->hw, &wmi->drv_priv->fatal_work); break; case WMI_TXSTATUS_EVENTID: /* Check if ath9k_tx_init() completed. */ if (!data_race(priv->tx.initialized)) break; spin_lock_bh(&priv->tx.tx_lock); if (priv->tx.flags & ATH9K_HTC_OP_TX_DRAIN) { spin_unlock_bh(&priv->tx.tx_lock); break; } spin_unlock_bh(&priv->tx.tx_lock); ath9k_htc_txstatus(priv, wmi_event); break; default: break; } kfree_skb(skb); } while (1); } void ath9k_fatal_work(struct work_struct *work) { struct ath9k_htc_priv *priv = container_of(work, struct ath9k_htc_priv, fatal_work); struct ath_common *common = ath9k_hw_common(priv->ah); ath_dbg(common, FATAL, "FATAL Event received, resetting device\n"); ath9k_htc_reset(priv); } static void ath9k_wmi_rsp_callback(struct wmi *wmi, struct sk_buff *skb) { skb_pull(skb, sizeof(struct wmi_cmd_hdr)); if (wmi->cmd_rsp_buf != NULL && wmi->cmd_rsp_len != 0) memcpy(wmi->cmd_rsp_buf, skb->data, wmi->cmd_rsp_len); complete(&wmi->cmd_wait); } static void ath9k_wmi_ctrl_rx(void *priv, struct sk_buff *skb, enum htc_endpoint_id epid) { struct wmi *wmi = priv; struct wmi_cmd_hdr *hdr; unsigned long flags; u16 cmd_id; if (unlikely(wmi->stopped)) goto free_skb; /* Validate the obtained SKB. */ if (unlikely(skb->len < sizeof(struct wmi_cmd_hdr))) goto free_skb; hdr = (struct wmi_cmd_hdr *) skb->data; cmd_id = be16_to_cpu(hdr->command_id); if (cmd_id & 0x1000) { spin_lock_irqsave(&wmi->wmi_lock, flags); __skb_queue_tail(&wmi->wmi_event_queue, skb); spin_unlock_irqrestore(&wmi->wmi_lock, flags); tasklet_schedule(&wmi->wmi_event_tasklet); return; } /* Check if there has been a timeout. */ spin_lock_irqsave(&wmi->wmi_lock, flags); if (be16_to_cpu(hdr->seq_no) != wmi->last_seq_id) { spin_unlock_irqrestore(&wmi->wmi_lock, flags); goto free_skb; } /* WMI command response */ ath9k_wmi_rsp_callback(wmi, skb); spin_unlock_irqrestore(&wmi->wmi_lock, flags); free_skb: kfree_skb(skb); } static void ath9k_wmi_ctrl_tx(void *priv, struct sk_buff *skb, enum htc_endpoint_id epid, bool txok) { kfree_skb(skb); } int ath9k_wmi_connect(struct htc_target *htc, struct wmi *wmi, enum htc_endpoint_id *wmi_ctrl_epid) { struct htc_service_connreq connect; int ret; wmi->htc = htc; memset(&connect, 0, sizeof(connect)); connect.ep_callbacks.priv = wmi; connect.ep_callbacks.tx = ath9k_wmi_ctrl_tx; connect.ep_callbacks.rx = ath9k_wmi_ctrl_rx; connect.service_id = WMI_CONTROL_SVC; ret = htc_connect_service(htc, &connect, &wmi->ctrl_epid); if (ret) return ret; *wmi_ctrl_epid = wmi->ctrl_epid; return 0; } static int ath9k_wmi_cmd_issue(struct wmi *wmi, struct sk_buff *skb, enum wmi_cmd_id cmd, u16 len, u8 *rsp_buf, u32 rsp_len) { struct wmi_cmd_hdr *hdr; unsigned long flags; hdr = skb_push(skb, sizeof(struct wmi_cmd_hdr)); hdr->command_id = cpu_to_be16(cmd); hdr->seq_no = cpu_to_be16(++wmi->tx_seq_id); spin_lock_irqsave(&wmi->wmi_lock, flags); /* record the rsp buffer and length */ wmi->cmd_rsp_buf = rsp_buf; wmi->cmd_rsp_len = rsp_len; wmi->last_seq_id = wmi->tx_seq_id; spin_unlock_irqrestore(&wmi->wmi_lock, flags); return htc_send_epid(wmi->htc, skb, wmi->ctrl_epid); } int ath9k_wmi_cmd(struct wmi *wmi, enum wmi_cmd_id cmd_id, u8 *cmd_buf, u32 cmd_len, u8 *rsp_buf, u32 rsp_len, u32 timeout) { struct ath_hw *ah = wmi->drv_priv->ah; struct ath_common *common = ath9k_hw_common(ah); u16 headroom = sizeof(struct htc_frame_hdr) + sizeof(struct wmi_cmd_hdr); unsigned long time_left, flags; struct sk_buff *skb; int ret = 0; if (ah->ah_flags & AH_UNPLUGGED) return 0; skb = alloc_skb(headroom + cmd_len, GFP_ATOMIC); if (!skb) return -ENOMEM; skb_reserve(skb, headroom); if (cmd_len != 0 && cmd_buf != NULL) { skb_put_data(skb, cmd_buf, cmd_len); } mutex_lock(&wmi->op_mutex); /* check if wmi stopped flag is set */ if (unlikely(wmi->stopped)) { ret = -EPROTO; goto out; } ret = ath9k_wmi_cmd_issue(wmi, skb, cmd_id, cmd_len, rsp_buf, rsp_len); if (ret) goto out; time_left = wait_for_completion_timeout(&wmi->cmd_wait, timeout); if (!time_left) { ath_dbg(common, WMI, "Timeout waiting for WMI command: %s\n", wmi_cmd_to_name(cmd_id)); spin_lock_irqsave(&wmi->wmi_lock, flags); wmi->last_seq_id = 0; spin_unlock_irqrestore(&wmi->wmi_lock, flags); mutex_unlock(&wmi->op_mutex); return -ETIMEDOUT; } mutex_unlock(&wmi->op_mutex); return 0; out: ath_dbg(common, WMI, "WMI failure for: %s\n", wmi_cmd_to_name(cmd_id)); mutex_unlock(&wmi->op_mutex); kfree_skb(skb); return ret; } |
7 7 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 | // SPDX-License-Identifier: GPL-2.0-only /* * Copyright 2008 by Karsten Keil <kkeil@novell.com> */ #include <linux/slab.h> #include <linux/types.h> #include <linux/stddef.h> #include <linux/module.h> #include <linux/spinlock.h> #include <linux/mISDNif.h> #include "core.h" static u_int debug; MODULE_AUTHOR("Karsten Keil"); MODULE_LICENSE("GPL"); module_param(debug, uint, S_IRUGO | S_IWUSR); static u64 device_ids; #define MAX_DEVICE_ID 63 static LIST_HEAD(Bprotocols); static DEFINE_RWLOCK(bp_lock); static void mISDN_dev_release(struct device *dev) { /* nothing to do: the device is part of its parent's data structure */ } static ssize_t id_show(struct device *dev, struct device_attribute *attr, char *buf) { struct mISDNdevice *mdev = dev_to_mISDN(dev); if (!mdev) return -ENODEV; return sprintf(buf, "%d\n", mdev->id); } static DEVICE_ATTR_RO(id); static ssize_t nrbchan_show(struct device *dev, struct device_attribute *attr, char *buf) { struct mISDNdevice *mdev = dev_to_mISDN(dev); if (!mdev) return -ENODEV; return sprintf(buf, "%d\n", mdev->nrbchan); } static DEVICE_ATTR_RO(nrbchan); static ssize_t d_protocols_show(struct device *dev, struct device_attribute *attr, char *buf) { struct mISDNdevice *mdev = dev_to_mISDN(dev); if (!mdev) return -ENODEV; return sprintf(buf, "%d\n", mdev->Dprotocols); } static DEVICE_ATTR_RO(d_protocols); static ssize_t b_protocols_show(struct device *dev, struct device_attribute *attr, char *buf) { struct mISDNdevice *mdev = dev_to_mISDN(dev); if (!mdev) return -ENODEV; return sprintf(buf, "%d\n", mdev->Bprotocols | get_all_Bprotocols()); } static DEVICE_ATTR_RO(b_protocols); static ssize_t protocol_show(struct device *dev, struct device_attribute *attr, char *buf) { struct mISDNdevice *mdev = dev_to_mISDN(dev); if (!mdev) return -ENODEV; return sprintf(buf, "%d\n", mdev->D.protocol); } static DEVICE_ATTR_RO(protocol); static ssize_t name_show(struct device *dev, struct device_attribute *attr, char *buf) { strcpy(buf, dev_name(dev)); return strlen(buf); } static DEVICE_ATTR_RO(name); #if 0 /* hangs */ static ssize_t name_set(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { int err = 0; char *out = kmalloc(count + 1, GFP_KERNEL); if (!out) return -ENOMEM; memcpy(out, buf, count); if (count && out[count - 1] == '\n') out[--count] = 0; if (count) err = device_rename(dev, out); kfree(out); return (err < 0) ? err : count; } static DEVICE_ATTR_RW(name); #endif static ssize_t channelmap_show(struct device *dev, struct device_attribute *attr, char *buf) { struct mISDNdevice *mdev = dev_to_mISDN(dev); char *bp = buf; int i; for (i = 0; i <= mdev->nrbchan; i++) *bp++ = test_channelmap(i, mdev->channelmap) ? '1' : '0'; return bp - buf; } static DEVICE_ATTR_RO(channelmap); static struct attribute *mISDN_attrs[] = { &dev_attr_id.attr, &dev_attr_d_protocols.attr, &dev_attr_b_protocols.attr, &dev_attr_protocol.attr, &dev_attr_channelmap.attr, &dev_attr_nrbchan.attr, &dev_attr_name.attr, NULL, }; ATTRIBUTE_GROUPS(mISDN); static int mISDN_uevent(const struct device *dev, struct kobj_uevent_env *env) { const struct mISDNdevice *mdev = dev_to_mISDN(dev); if (!mdev) return 0; if (add_uevent_var(env, "nchans=%d", mdev->nrbchan)) return -ENOMEM; return 0; } static struct class mISDN_class = { .name = "mISDN", .dev_uevent = mISDN_uevent, .dev_groups = mISDN_groups, .dev_release = mISDN_dev_release, }; static int _get_mdevice(struct device *dev, const void *id) { struct mISDNdevice *mdev = dev_to_mISDN(dev); if (!mdev) return 0; if (mdev->id != *(const u_int *)id) return 0; return 1; } struct mISDNdevice *get_mdevice(u_int id) { return dev_to_mISDN(class_find_device(&mISDN_class, NULL, &id, _get_mdevice)); } static int _get_mdevice_count(struct device *dev, void *cnt) { *(int *)cnt += 1; return 0; } int get_mdevice_count(void) { int cnt = 0; class_for_each_device(&mISDN_class, NULL, &cnt, _get_mdevice_count); return cnt; } static int get_free_devid(void) { u_int i; for (i = 0; i <= MAX_DEVICE_ID; i++) if (!test_and_set_bit(i, (u_long *)&device_ids)) break; if (i > MAX_DEVICE_ID) return -EBUSY; return i; } int mISDN_register_device(struct mISDNdevice *dev, struct device *parent, char *name) { int err; err = get_free_devid(); if (err < 0) return err; dev->id = err; device_initialize(&dev->dev); if (name && name[0]) dev_set_name(&dev->dev, "%s", name); else dev_set_name(&dev->dev, "mISDN%d", dev->id); if (debug & DEBUG_CORE) printk(KERN_DEBUG "mISDN_register %s %d\n", dev_name(&dev->dev), dev->id); dev->dev.class = &mISDN_class; err = create_stack(dev); if (err) goto error1; dev->dev.platform_data = dev; dev->dev.parent = parent; dev_set_drvdata(&dev->dev, dev); err = device_add(&dev->dev); if (err) goto error3; return 0; error3: delete_stack(dev); error1: put_device(&dev->dev); return err; } EXPORT_SYMBOL(mISDN_register_device); void mISDN_unregister_device(struct mISDNdevice *dev) { if (debug & DEBUG_CORE) printk(KERN_DEBUG "mISDN_unregister %s %d\n", dev_name(&dev->dev), dev->id); /* sysfs_remove_link(&dev->dev.kobj, "device"); */ device_del(&dev->dev); dev_set_drvdata(&dev->dev, NULL); test_and_clear_bit(dev->id, (u_long *)&device_ids); delete_stack(dev); put_device(&dev->dev); } EXPORT_SYMBOL(mISDN_unregister_device); u_int get_all_Bprotocols(void) { struct Bprotocol *bp; u_int m = 0; read_lock(&bp_lock); list_for_each_entry(bp, &Bprotocols, list) m |= bp->Bprotocols; read_unlock(&bp_lock); return m; } struct Bprotocol * get_Bprotocol4mask(u_int m) { struct Bprotocol *bp; read_lock(&bp_lock); list_for_each_entry(bp, &Bprotocols, list) if (bp->Bprotocols & m) { read_unlock(&bp_lock); return bp; } read_unlock(&bp_lock); return NULL; } struct Bprotocol * get_Bprotocol4id(u_int id) { u_int m; if (id < ISDN_P_B_START || id > 63) { printk(KERN_WARNING "%s id not in range %d\n", __func__, id); return NULL; } m = 1 << (id & ISDN_P_B_MASK); return get_Bprotocol4mask(m); } int mISDN_register_Bprotocol(struct Bprotocol *bp) { u_long flags; struct Bprotocol *old; if (debug & DEBUG_CORE) printk(KERN_DEBUG "%s: %s/%x\n", __func__, bp->name, bp->Bprotocols); old = get_Bprotocol4mask(bp->Bprotocols); if (old) { printk(KERN_WARNING "register duplicate protocol old %s/%x new %s/%x\n", old->name, old->Bprotocols, bp->name, bp->Bprotocols); return -EBUSY; } write_lock_irqsave(&bp_lock, flags); list_add_tail(&bp->list, &Bprotocols); write_unlock_irqrestore(&bp_lock, flags); return 0; } EXPORT_SYMBOL(mISDN_register_Bprotocol); void mISDN_unregister_Bprotocol(struct Bprotocol *bp) { u_long flags; if (debug & DEBUG_CORE) printk(KERN_DEBUG "%s: %s/%x\n", __func__, bp->name, bp->Bprotocols); write_lock_irqsave(&bp_lock, flags); list_del(&bp->list); write_unlock_irqrestore(&bp_lock, flags); } EXPORT_SYMBOL(mISDN_unregister_Bprotocol); static const char *msg_no_channel = "<no channel>"; static const char *msg_no_stack = "<no stack>"; static const char *msg_no_stackdev = "<no stack device>"; const char *mISDNDevName4ch(struct mISDNchannel *ch) { if (!ch) return msg_no_channel; if (!ch->st) return msg_no_stack; if (!ch->st->dev) return msg_no_stackdev; return dev_name(&ch->st->dev->dev); }; EXPORT_SYMBOL(mISDNDevName4ch); static int mISDNInit(void) { int err; printk(KERN_INFO "Modular ISDN core version %d.%d.%d\n", MISDN_MAJOR_VERSION, MISDN_MINOR_VERSION, MISDN_RELEASE); mISDN_init_clock(&debug); mISDN_initstack(&debug); err = class_register(&mISDN_class); if (err) goto error1; err = mISDN_inittimer(&debug); if (err) goto error2; err = Isdnl1_Init(&debug); if (err) goto error3; err = Isdnl2_Init(&debug); if (err) goto error4; err = misdn_sock_init(&debug); if (err) goto error5; return 0; error5: Isdnl2_cleanup(); error4: Isdnl1_cleanup(); error3: mISDN_timer_cleanup(); error2: class_unregister(&mISDN_class); error1: return err; } static void mISDN_cleanup(void) { misdn_sock_cleanup(); Isdnl2_cleanup(); Isdnl1_cleanup(); mISDN_timer_cleanup(); class_unregister(&mISDN_class); printk(KERN_DEBUG "mISDNcore unloaded\n"); } module_init(mISDNInit); module_exit(mISDN_cleanup); |
221 53 177 223 6 84 247 6 221 223 222 256 204 5 43 216 13 2 2 77 247 74 43 41 290 1 7 19 263 2 263 58 33 202 95 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 | // SPDX-License-Identifier: GPL-2.0-or-later /* * linux/mm/process_vm_access.c * * Copyright (C) 2010-2011 Christopher Yeoh <cyeoh@au1.ibm.com>, IBM Corp. */ #include <linux/compat.h> #include <linux/mm.h> #include <linux/uio.h> #include <linux/sched.h> #include <linux/sched/mm.h> #include <linux/highmem.h> #include <linux/ptrace.h> #include <linux/slab.h> #include <linux/syscalls.h> /** * process_vm_rw_pages - read/write pages from task specified * @pages: array of pointers to pages we want to copy * @offset: offset in page to start copying from/to * @len: number of bytes to copy * @iter: where to copy to/from locally * @vm_write: 0 means copy from, 1 means copy to * Returns 0 on success, error code otherwise */ static int process_vm_rw_pages(struct page **pages, unsigned offset, size_t len, struct iov_iter *iter, int vm_write) { /* Do the copy for each page */ while (len && iov_iter_count(iter)) { struct page *page = *pages++; size_t copy = PAGE_SIZE - offset; size_t copied; if (copy > len) copy = len; if (vm_write) copied = copy_page_from_iter(page, offset, copy, iter); else copied = copy_page_to_iter(page, offset, copy, iter); len -= copied; if (copied < copy && iov_iter_count(iter)) return -EFAULT; offset = 0; } return 0; } /* Maximum number of pages kmalloc'd to hold struct page's during copy */ #define PVM_MAX_KMALLOC_PAGES 2 /* Maximum number of pages that can be stored at a time */ #define PVM_MAX_USER_PAGES (PVM_MAX_KMALLOC_PAGES * PAGE_SIZE / sizeof(struct page *)) /** * process_vm_rw_single_vec - read/write pages from task specified * @addr: start memory address of target process * @len: size of area to copy to/from * @iter: where to copy to/from locally * @process_pages: struct pages area that can store at least * nr_pages_to_copy struct page pointers * @mm: mm for task * @task: task to read/write from * @vm_write: 0 means copy from, 1 means copy to * Returns 0 on success or on failure error code */ static int process_vm_rw_single_vec(unsigned long addr, unsigned long len, struct iov_iter *iter, struct page **process_pages, struct mm_struct *mm, struct task_struct *task, int vm_write) { unsigned long pa = addr & PAGE_MASK; unsigned long start_offset = addr - pa; unsigned long nr_pages; ssize_t rc = 0; unsigned int flags = 0; /* Work out address and page range required */ if (len == 0) return 0; nr_pages = (addr + len - 1) / PAGE_SIZE - addr / PAGE_SIZE + 1; if (vm_write) flags |= FOLL_WRITE; while (!rc && nr_pages && iov_iter_count(iter)) { int pinned_pages = min_t(unsigned long, nr_pages, PVM_MAX_USER_PAGES); int locked = 1; size_t bytes; /* * Get the pages we're interested in. We must * access remotely because task/mm might not * current/current->mm */ mmap_read_lock(mm); pinned_pages = pin_user_pages_remote(mm, pa, pinned_pages, flags, process_pages, &locked); if (locked) mmap_read_unlock(mm); if (pinned_pages <= 0) return -EFAULT; bytes = pinned_pages * PAGE_SIZE - start_offset; if (bytes > len) bytes = len; rc = process_vm_rw_pages(process_pages, start_offset, bytes, iter, vm_write); len -= bytes; start_offset = 0; nr_pages -= pinned_pages; pa += pinned_pages * PAGE_SIZE; /* If vm_write is set, the pages need to be made dirty: */ unpin_user_pages_dirty_lock(process_pages, pinned_pages, vm_write); } return rc; } /* Maximum number of entries for process pages array which lives on stack */ #define PVM_MAX_PP_ARRAY_COUNT 16 /** * process_vm_rw_core - core of reading/writing pages from task specified * @pid: PID of process to read/write from/to * @iter: where to copy to/from locally * @rvec: iovec array specifying where to copy to/from in the other process * @riovcnt: size of rvec array * @flags: currently unused * @vm_write: 0 if reading from other process, 1 if writing to other process * * Returns the number of bytes read/written or error code. May * return less bytes than expected if an error occurs during the copying * process. */ static ssize_t process_vm_rw_core(pid_t pid, struct iov_iter *iter, const struct iovec *rvec, unsigned long riovcnt, unsigned long flags, int vm_write) { struct task_struct *task; struct page *pp_stack[PVM_MAX_PP_ARRAY_COUNT]; struct page **process_pages = pp_stack; struct mm_struct *mm; unsigned long i; ssize_t rc = 0; unsigned long nr_pages = 0; unsigned long nr_pages_iov; ssize_t iov_len; size_t total_len = iov_iter_count(iter); /* * Work out how many pages of struct pages we're going to need * when eventually calling get_user_pages */ for (i = 0; i < riovcnt; i++) { iov_len = rvec[i].iov_len; if (iov_len > 0) { nr_pages_iov = ((unsigned long)rvec[i].iov_base + iov_len - 1) / PAGE_SIZE - (unsigned long)rvec[i].iov_base / PAGE_SIZE + 1; nr_pages = max(nr_pages, nr_pages_iov); } } if (nr_pages == 0) return 0; if (nr_pages > PVM_MAX_PP_ARRAY_COUNT) { /* For reliability don't try to kmalloc more than 2 pages worth */ process_pages = kmalloc(min_t(size_t, PVM_MAX_KMALLOC_PAGES * PAGE_SIZE, sizeof(struct page *)*nr_pages), GFP_KERNEL); if (!process_pages) return -ENOMEM; } /* Get process information */ task = find_get_task_by_vpid(pid); if (!task) { rc = -ESRCH; goto free_proc_pages; } mm = mm_access(task, PTRACE_MODE_ATTACH_REALCREDS); if (!mm || IS_ERR(mm)) { rc = IS_ERR(mm) ? PTR_ERR(mm) : -ESRCH; /* * Explicitly map EACCES to EPERM as EPERM is a more * appropriate error code for process_vw_readv/writev */ if (rc == -EACCES) rc = -EPERM; goto put_task_struct; } for (i = 0; i < riovcnt && iov_iter_count(iter) && !rc; i++) rc = process_vm_rw_single_vec( (unsigned long)rvec[i].iov_base, rvec[i].iov_len, iter, process_pages, mm, task, vm_write); /* copied = space before - space after */ total_len -= iov_iter_count(iter); /* If we have managed to copy any data at all then we return the number of bytes copied. Otherwise we return the error code */ if (total_len) rc = total_len; mmput(mm); put_task_struct: put_task_struct(task); free_proc_pages: if (process_pages != pp_stack) kfree(process_pages); return rc; } /** * process_vm_rw - check iovecs before calling core routine * @pid: PID of process to read/write from/to * @lvec: iovec array specifying where to copy to/from locally * @liovcnt: size of lvec array * @rvec: iovec array specifying where to copy to/from in the other process * @riovcnt: size of rvec array * @flags: currently unused * @vm_write: 0 if reading from other process, 1 if writing to other process * * Returns the number of bytes read/written or error code. May * return less bytes than expected if an error occurs during the copying * process. */ static ssize_t process_vm_rw(pid_t pid, const struct iovec __user *lvec, unsigned long liovcnt, const struct iovec __user *rvec, unsigned long riovcnt, unsigned long flags, int vm_write) { struct iovec iovstack_l[UIO_FASTIOV]; struct iovec iovstack_r[UIO_FASTIOV]; struct iovec *iov_l = iovstack_l; struct iovec *iov_r; struct iov_iter iter; ssize_t rc; int dir = vm_write ? ITER_SOURCE : ITER_DEST; if (flags != 0) return -EINVAL; /* Check iovecs */ rc = import_iovec(dir, lvec, liovcnt, UIO_FASTIOV, &iov_l, &iter); if (rc < 0) return rc; if (!iov_iter_count(&iter)) goto free_iov_l; iov_r = iovec_from_user(rvec, riovcnt, UIO_FASTIOV, iovstack_r, in_compat_syscall()); if (IS_ERR(iov_r)) { rc = PTR_ERR(iov_r); goto free_iov_l; } rc = process_vm_rw_core(pid, &iter, iov_r, riovcnt, flags, vm_write); if (iov_r != iovstack_r) kfree(iov_r); free_iov_l: kfree(iov_l); return rc; } SYSCALL_DEFINE6(process_vm_readv, pid_t, pid, const struct iovec __user *, lvec, unsigned long, liovcnt, const struct iovec __user *, rvec, unsigned long, riovcnt, unsigned long, flags) { return process_vm_rw(pid, lvec, liovcnt, rvec, riovcnt, flags, 0); } SYSCALL_DEFINE6(process_vm_writev, pid_t, pid, const struct iovec __user *, lvec, unsigned long, liovcnt, const struct iovec __user *, rvec, unsigned long, riovcnt, unsigned long, flags) { return process_vm_rw(pid, lvec, liovcnt, rvec, riovcnt, flags, 1); } |
840 1 1 101 101 101 101 101 101 74 5 5 5 5 4 6 6 6 6 6 6 5 2 1 4 4 4 4 6 1 1 5517 5515 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 9 2 3 2 2 1 1 1 94 5 5 5 4 2 2 66 65 66 66 1 4 3 1 1 3 3 3 1 2 2 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 4 3 1 3 1 1 1 19 18 13 6 3 1 2 1 1 1 1 16 16 16 16 16 16 16 16 16 16 16 623 623 623 623 623 18 18 18 18 18 609 609 609 609 609 609 96 96 96 96 96 68 69 69 69 13 13939 584 13766 2 2 2 2 2 2 2 11 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 4716 4717 4718 4719 4720 4721 4722 4723 4724 4725 4726 4727 4728 4729 4730 4731 4732 4733 4734 4735 4736 4737 4738 4739 4740 4741 4742 4743 4744 4745 4746 4747 4748 4749 4750 4751 4752 4753 4754 4755 4756 4757 4758 4759 4760 4761 4762 4763 4764 4765 4766 4767 4768 4769 4770 4771 4772 4773 4774 4775 4776 4777 4778 4779 4780 4781 4782 4783 4784 4785 4786 4787 4788 4789 4790 4791 4792 4793 4794 4795 4796 4797 4798 4799 4800 4801 4802 4803 4804 4805 4806 4807 4808 4809 4810 4811 4812 4813 4814 4815 4816 4817 4818 4819 4820 4821 4822 4823 4824 4825 4826 4827 4828 4829 4830 4831 4832 4833 4834 4835 4836 4837 4838 4839 4840 4841 4842 4843 4844 4845 4846 4847 4848 4849 4850 4851 4852 4853 4854 4855 4856 4857 4858 4859 4860 4861 4862 4863 4864 4865 4866 4867 4868 4869 4870 4871 4872 4873 4874 4875 4876 4877 4878 4879 4880 4881 4882 4883 4884 4885 4886 4887 4888 4889 4890 4891 4892 4893 4894 4895 4896 4897 4898 4899 4900 4901 4902 4903 4904 4905 4906 4907 4908 4909 4910 4911 4912 4913 4914 4915 4916 4917 4918 4919 4920 4921 4922 4923 4924 4925 4926 4927 4928 4929 4930 4931 4932 4933 4934 4935 4936 4937 4938 4939 4940 4941 4942 4943 4944 4945 4946 4947 4948 4949 4950 4951 4952 4953 4954 4955 4956 4957 4958 4959 4960 4961 4962 4963 4964 4965 4966 4967 4968 4969 4970 4971 4972 4973 4974 4975 4976 4977 4978 4979 4980 4981 4982 4983 4984 4985 4986 4987 4988 4989 4990 4991 4992 4993 4994 4995 4996 4997 4998 4999 5000 5001 5002 5003 5004 5005 5006 5007 5008 5009 5010 5011 5012 5013 5014 5015 5016 5017 5018 5019 5020 5021 5022 5023 5024 5025 5026 5027 5028 5029 5030 5031 5032 5033 5034 5035 5036 5037 5038 5039 5040 5041 5042 5043 5044 5045 5046 5047 5048 5049 5050 5051 5052 5053 5054 5055 5056 5057 5058 5059 5060 5061 5062 5063 5064 5065 5066 5067 5068 5069 5070 5071 5072 5073 5074 5075 5076 5077 5078 5079 5080 5081 5082 5083 5084 5085 5086 5087 5088 5089 5090 5091 5092 5093 5094 5095 5096 5097 5098 5099 5100 5101 5102 5103 5104 5105 5106 5107 5108 5109 5110 5111 5112 5113 5114 5115 5116 | /* * kernel/cpuset.c * * Processor and Memory placement constraints for sets of tasks. * * Copyright (C) 2003 BULL SA. * Copyright (C) 2004-2007 Silicon Graphics, Inc. * Copyright (C) 2006 Google, Inc * * Portions derived from Patrick Mochel's sysfs code. * sysfs is Copyright (c) 2001-3 Patrick Mochel * * 2003-10-10 Written by Simon Derr. * 2003-10-22 Updates by Stephen Hemminger. * 2004 May-July Rework by Paul Jackson. * 2006 Rework by Paul Menage to use generic cgroups * 2008 Rework of the scheduler domains and CPU hotplug handling * by Max Krasnyansky * * This file is subject to the terms and conditions of the GNU General Public * License. See the file COPYING in the main directory of the Linux * distribution for more details. */ #include <linux/cpu.h> #include <linux/cpumask.h> #include <linux/cpuset.h> #include <linux/delay.h> #include <linux/init.h> #include <linux/interrupt.h> #include <linux/kernel.h> #include <linux/mempolicy.h> #include <linux/mm.h> #include <linux/memory.h> #include <linux/export.h> #include <linux/rcupdate.h> #include <linux/sched.h> #include <linux/sched/deadline.h> #include <linux/sched/mm.h> #include <linux/sched/task.h> #include <linux/security.h> #include <linux/spinlock.h> #include <linux/oom.h> #include <linux/sched/isolation.h> #include <linux/cgroup.h> #include <linux/wait.h> #include <linux/workqueue.h> DEFINE_STATIC_KEY_FALSE(cpusets_pre_enable_key); DEFINE_STATIC_KEY_FALSE(cpusets_enabled_key); /* * There could be abnormal cpuset configurations for cpu or memory * node binding, add this key to provide a quick low-cost judgment * of the situation. */ DEFINE_STATIC_KEY_FALSE(cpusets_insane_config_key); /* See "Frequency meter" comments, below. */ struct fmeter { int cnt; /* unprocessed events count */ int val; /* most recent output value */ time64_t time; /* clock (secs) when val computed */ spinlock_t lock; /* guards read or write of above */ }; /* * Invalid partition error code */ enum prs_errcode { PERR_NONE = 0, PERR_INVCPUS, PERR_INVPARENT, PERR_NOTPART, PERR_NOTEXCL, PERR_NOCPUS, PERR_HOTPLUG, PERR_CPUSEMPTY, PERR_HKEEPING, }; static const char * const perr_strings[] = { [PERR_INVCPUS] = "Invalid cpu list in cpuset.cpus.exclusive", [PERR_INVPARENT] = "Parent is an invalid partition root", [PERR_NOTPART] = "Parent is not a partition root", [PERR_NOTEXCL] = "Cpu list in cpuset.cpus not exclusive", [PERR_NOCPUS] = "Parent unable to distribute cpu downstream", [PERR_HOTPLUG] = "No cpu available due to hotplug", [PERR_CPUSEMPTY] = "cpuset.cpus is empty", [PERR_HKEEPING] = "partition config conflicts with housekeeping setup", }; struct cpuset { struct cgroup_subsys_state css; unsigned long flags; /* "unsigned long" so bitops work */ /* * On default hierarchy: * * The user-configured masks can only be changed by writing to * cpuset.cpus and cpuset.mems, and won't be limited by the * parent masks. * * The effective masks is the real masks that apply to the tasks * in the cpuset. They may be changed if the configured masks are * changed or hotplug happens. * * effective_mask == configured_mask & parent's effective_mask, * and if it ends up empty, it will inherit the parent's mask. * * * On legacy hierarchy: * * The user-configured masks are always the same with effective masks. */ /* user-configured CPUs and Memory Nodes allow to tasks */ cpumask_var_t cpus_allowed; nodemask_t mems_allowed; /* effective CPUs and Memory Nodes allow to tasks */ cpumask_var_t effective_cpus; nodemask_t effective_mems; /* * Exclusive CPUs dedicated to current cgroup (default hierarchy only) * * This exclusive CPUs must be a subset of cpus_allowed. A parent * cgroup can only grant exclusive CPUs to one of its children. * * When the cgroup becomes a valid partition root, effective_xcpus * defaults to cpus_allowed if not set. The effective_cpus of a valid * partition root comes solely from its effective_xcpus and some of the * effective_xcpus may be distributed to sub-partitions below & hence * excluded from its effective_cpus. */ cpumask_var_t effective_xcpus; /* * Exclusive CPUs as requested by the user (default hierarchy only) */ cpumask_var_t exclusive_cpus; /* * This is old Memory Nodes tasks took on. * * - top_cpuset.old_mems_allowed is initialized to mems_allowed. * - A new cpuset's old_mems_allowed is initialized when some * task is moved into it. * - old_mems_allowed is used in cpuset_migrate_mm() when we change * cpuset.mems_allowed and have tasks' nodemask updated, and * then old_mems_allowed is updated to mems_allowed. */ nodemask_t old_mems_allowed; struct fmeter fmeter; /* memory_pressure filter */ /* * Tasks are being attached to this cpuset. Used to prevent * zeroing cpus/mems_allowed between ->can_attach() and ->attach(). */ int attach_in_progress; /* partition number for rebuild_sched_domains() */ int pn; /* for custom sched domain */ int relax_domain_level; /* number of valid sub-partitions */ int nr_subparts; /* partition root state */ int partition_root_state; /* * Default hierarchy only: * use_parent_ecpus - set if using parent's effective_cpus * child_ecpus_count - # of children with use_parent_ecpus set */ int use_parent_ecpus; int child_ecpus_count; /* * number of SCHED_DEADLINE tasks attached to this cpuset, so that we * know when to rebuild associated root domain bandwidth information. */ int nr_deadline_tasks; int nr_migrate_dl_tasks; u64 sum_migrate_dl_bw; /* Invalid partition error code, not lock protected */ enum prs_errcode prs_err; /* Handle for cpuset.cpus.partition */ struct cgroup_file partition_file; /* Remote partition silbling list anchored at remote_children */ struct list_head remote_sibling; }; /* * Exclusive CPUs distributed out to sub-partitions of top_cpuset */ static cpumask_var_t subpartitions_cpus; /* * Exclusive CPUs in isolated partitions */ static cpumask_var_t isolated_cpus; /* List of remote partition root children */ static struct list_head remote_children; /* * Partition root states: * * 0 - member (not a partition root) * 1 - partition root * 2 - partition root without load balancing (isolated) * -1 - invalid partition root * -2 - invalid isolated partition root */ #define PRS_MEMBER 0 #define PRS_ROOT 1 #define PRS_ISOLATED 2 #define PRS_INVALID_ROOT -1 #define PRS_INVALID_ISOLATED -2 static inline bool is_prs_invalid(int prs_state) { return prs_state < 0; } /* * Temporary cpumasks for working with partitions that are passed among * functions to avoid memory allocation in inner functions. */ struct tmpmasks { cpumask_var_t addmask, delmask; /* For partition root */ cpumask_var_t new_cpus; /* For update_cpumasks_hier() */ }; static inline struct cpuset *css_cs(struct cgroup_subsys_state *css) { return css ? container_of(css, struct cpuset, css) : NULL; } /* Retrieve the cpuset for a task */ static inline struct cpuset *task_cs(struct task_struct *task) { return css_cs(task_css(task, cpuset_cgrp_id)); } static inline struct cpuset *parent_cs(struct cpuset *cs) { return css_cs(cs->css.parent); } void inc_dl_tasks_cs(struct task_struct *p) { struct cpuset *cs = task_cs(p); cs->nr_deadline_tasks++; } void dec_dl_tasks_cs(struct task_struct *p) { struct cpuset *cs = task_cs(p); cs->nr_deadline_tasks--; } /* bits in struct cpuset flags field */ typedef enum { CS_ONLINE, CS_CPU_EXCLUSIVE, CS_MEM_EXCLUSIVE, CS_MEM_HARDWALL, CS_MEMORY_MIGRATE, CS_SCHED_LOAD_BALANCE, CS_SPREAD_PAGE, CS_SPREAD_SLAB, } cpuset_flagbits_t; /* convenient tests for these bits */ static inline bool is_cpuset_online(struct cpuset *cs) { return test_bit(CS_ONLINE, &cs->flags) && !css_is_dying(&cs->css); } static inline int is_cpu_exclusive(const struct cpuset *cs) { return test_bit(CS_CPU_EXCLUSIVE, &cs->flags); } static inline int is_mem_exclusive(const struct cpuset *cs) { return test_bit(CS_MEM_EXCLUSIVE, &cs->flags); } static inline int is_mem_hardwall(const struct cpuset *cs) { return test_bit(CS_MEM_HARDWALL, &cs->flags); } static inline int is_sched_load_balance(const struct cpuset *cs) { return test_bit(CS_SCHED_LOAD_BALANCE, &cs->flags); } static inline int is_memory_migrate(const struct cpuset *cs) { return test_bit(CS_MEMORY_MIGRATE, &cs->flags); } static inline int is_spread_page(const struct cpuset *cs) { return test_bit(CS_SPREAD_PAGE, &cs->flags); } static inline int is_spread_slab(const struct cpuset *cs) { return test_bit(CS_SPREAD_SLAB, &cs->flags); } static inline int is_partition_valid(const struct cpuset *cs) { return cs->partition_root_state > 0; } static inline int is_partition_invalid(const struct cpuset *cs) { return cs->partition_root_state < 0; } /* * Callers should hold callback_lock to modify partition_root_state. */ static inline void make_partition_invalid(struct cpuset *cs) { if (cs->partition_root_state > 0) cs->partition_root_state = -cs->partition_root_state; } /* * Send notification event of whenever partition_root_state changes. */ static inline void notify_partition_change(struct cpuset *cs, int old_prs) { if (old_prs == cs->partition_root_state) return; cgroup_file_notify(&cs->partition_file); /* Reset prs_err if not invalid */ if (is_partition_valid(cs)) WRITE_ONCE(cs->prs_err, PERR_NONE); } static struct cpuset top_cpuset = { .flags = ((1 << CS_ONLINE) | (1 << CS_CPU_EXCLUSIVE) | (1 << CS_MEM_EXCLUSIVE)), .partition_root_state = PRS_ROOT, .remote_sibling = LIST_HEAD_INIT(top_cpuset.remote_sibling), }; /** * cpuset_for_each_child - traverse online children of a cpuset * @child_cs: loop cursor pointing to the current child * @pos_css: used for iteration * @parent_cs: target cpuset to walk children of * * Walk @child_cs through the online children of @parent_cs. Must be used * with RCU read locked. */ #define cpuset_for_each_child(child_cs, pos_css, parent_cs) \ css_for_each_child((pos_css), &(parent_cs)->css) \ if (is_cpuset_online(((child_cs) = css_cs((pos_css))))) /** * cpuset_for_each_descendant_pre - pre-order walk of a cpuset's descendants * @des_cs: loop cursor pointing to the current descendant * @pos_css: used for iteration * @root_cs: target cpuset to walk ancestor of * * Walk @des_cs through the online descendants of @root_cs. Must be used * with RCU read locked. The caller may modify @pos_css by calling * css_rightmost_descendant() to skip subtree. @root_cs is included in the * iteration and the first node to be visited. */ #define cpuset_for_each_descendant_pre(des_cs, pos_css, root_cs) \ css_for_each_descendant_pre((pos_css), &(root_cs)->css) \ if (is_cpuset_online(((des_cs) = css_cs((pos_css))))) /* * There are two global locks guarding cpuset structures - cpuset_mutex and * callback_lock. We also require taking task_lock() when dereferencing a * task's cpuset pointer. See "The task_lock() exception", at the end of this * comment. The cpuset code uses only cpuset_mutex. Other kernel subsystems * can use cpuset_lock()/cpuset_unlock() to prevent change to cpuset * structures. Note that cpuset_mutex needs to be a mutex as it is used in * paths that rely on priority inheritance (e.g. scheduler - on RT) for * correctness. * * A task must hold both locks to modify cpusets. If a task holds * cpuset_mutex, it blocks others, ensuring that it is the only task able to * also acquire callback_lock and be able to modify cpusets. It can perform * various checks on the cpuset structure first, knowing nothing will change. * It can also allocate memory while just holding cpuset_mutex. While it is * performing these checks, various callback routines can briefly acquire * callback_lock to query cpusets. Once it is ready to make the changes, it * takes callback_lock, blocking everyone else. * * Calls to the kernel memory allocator can not be made while holding * callback_lock, as that would risk double tripping on callback_lock * from one of the callbacks into the cpuset code from within * __alloc_pages(). * * If a task is only holding callback_lock, then it has read-only * access to cpusets. * * Now, the task_struct fields mems_allowed and mempolicy may be changed * by other task, we use alloc_lock in the task_struct fields to protect * them. * * The cpuset_common_file_read() handlers only hold callback_lock across * small pieces of code, such as when reading out possibly multi-word * cpumasks and nodemasks. * * Accessing a task's cpuset should be done in accordance with the * guidelines for accessing subsystem state in kernel/cgroup.c */ static DEFINE_MUTEX(cpuset_mutex); void cpuset_lock(void) { mutex_lock(&cpuset_mutex); } void cpuset_unlock(void) { mutex_unlock(&cpuset_mutex); } static DEFINE_SPINLOCK(callback_lock); static struct workqueue_struct *cpuset_migrate_mm_wq; /* * CPU / memory hotplug is handled asynchronously. */ static void cpuset_hotplug_workfn(struct work_struct *work); static DECLARE_WORK(cpuset_hotplug_work, cpuset_hotplug_workfn); static DECLARE_WAIT_QUEUE_HEAD(cpuset_attach_wq); static inline void check_insane_mems_config(nodemask_t *nodes) { if (!cpusets_insane_config() && movable_only_nodes(nodes)) { static_branch_enable(&cpusets_insane_config_key); pr_info("Unsupported (movable nodes only) cpuset configuration detected (nmask=%*pbl)!\n" "Cpuset allocations might fail even with a lot of memory available.\n", nodemask_pr_args(nodes)); } } /* * Cgroup v2 behavior is used on the "cpus" and "mems" control files when * on default hierarchy or when the cpuset_v2_mode flag is set by mounting * the v1 cpuset cgroup filesystem with the "cpuset_v2_mode" mount option. * With v2 behavior, "cpus" and "mems" are always what the users have * requested and won't be changed by hotplug events. Only the effective * cpus or mems will be affected. */ static inline bool is_in_v2_mode(void) { return cgroup_subsys_on_dfl(cpuset_cgrp_subsys) || (cpuset_cgrp_subsys.root->flags & CGRP_ROOT_CPUSET_V2_MODE); } /** * partition_is_populated - check if partition has tasks * @cs: partition root to be checked * @excluded_child: a child cpuset to be excluded in task checking * Return: true if there are tasks, false otherwise * * It is assumed that @cs is a valid partition root. @excluded_child should * be non-NULL when this cpuset is going to become a partition itself. */ static inline bool partition_is_populated(struct cpuset *cs, struct cpuset *excluded_child) { struct cgroup_subsys_state *css; struct cpuset *child; if (cs->css.cgroup->nr_populated_csets) return true; if (!excluded_child && !cs->nr_subparts) return cgroup_is_populated(cs->css.cgroup); rcu_read_lock(); cpuset_for_each_child(child, css, cs) { if (child == excluded_child) continue; if (is_partition_valid(child)) continue; if (cgroup_is_populated(child->css.cgroup)) { rcu_read_unlock(); return true; } } rcu_read_unlock(); return false; } /* * Return in pmask the portion of a task's cpusets's cpus_allowed that * are online and are capable of running the task. If none are found, * walk up the cpuset hierarchy until we find one that does have some * appropriate cpus. * * One way or another, we guarantee to return some non-empty subset * of cpu_online_mask. * * Call with callback_lock or cpuset_mutex held. */ static void guarantee_online_cpus(struct task_struct *tsk, struct cpumask *pmask) { const struct cpumask *possible_mask = task_cpu_possible_mask(tsk); struct cpuset *cs; if (WARN_ON(!cpumask_and(pmask, possible_mask, cpu_online_mask))) cpumask_copy(pmask, cpu_online_mask); rcu_read_lock(); cs = task_cs(tsk); while (!cpumask_intersects(cs->effective_cpus, pmask)) { cs = parent_cs(cs); if (unlikely(!cs)) { /* * The top cpuset doesn't have any online cpu as a * consequence of a race between cpuset_hotplug_work * and cpu hotplug notifier. But we know the top * cpuset's effective_cpus is on its way to be * identical to cpu_online_mask. */ goto out_unlock; } } cpumask_and(pmask, pmask, cs->effective_cpus); out_unlock: rcu_read_unlock(); } /* * Return in *pmask the portion of a cpusets's mems_allowed that * are online, with memory. If none are online with memory, walk * up the cpuset hierarchy until we find one that does have some * online mems. The top cpuset always has some mems online. * * One way or another, we guarantee to return some non-empty subset * of node_states[N_MEMORY]. * * Call with callback_lock or cpuset_mutex held. */ static void guarantee_online_mems(struct cpuset *cs, nodemask_t *pmask) { while (!nodes_intersects(cs->effective_mems, node_states[N_MEMORY])) cs = parent_cs(cs); nodes_and(*pmask, cs->effective_mems, node_states[N_MEMORY]); } /* * update task's spread flag if cpuset's page/slab spread flag is set * * Call with callback_lock or cpuset_mutex held. The check can be skipped * if on default hierarchy. */ static void cpuset_update_task_spread_flags(struct cpuset *cs, struct task_struct *tsk) { if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys)) return; if (is_spread_page(cs)) task_set_spread_page(tsk); else task_clear_spread_page(tsk); if (is_spread_slab(cs)) task_set_spread_slab(tsk); else task_clear_spread_slab(tsk); } /* * is_cpuset_subset(p, q) - Is cpuset p a subset of cpuset q? * * One cpuset is a subset of another if all its allowed CPUs and * Memory Nodes are a subset of the other, and its exclusive flags * are only set if the other's are set. Call holding cpuset_mutex. */ static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q) { return cpumask_subset(p->cpus_allowed, q->cpus_allowed) && nodes_subset(p->mems_allowed, q->mems_allowed) && is_cpu_exclusive(p) <= is_cpu_exclusive(q) && is_mem_exclusive(p) <= is_mem_exclusive(q); } /** * alloc_cpumasks - allocate three cpumasks for cpuset * @cs: the cpuset that have cpumasks to be allocated. * @tmp: the tmpmasks structure pointer * Return: 0 if successful, -ENOMEM otherwise. * * Only one of the two input arguments should be non-NULL. */ static inline int alloc_cpumasks(struct cpuset *cs, struct tmpmasks *tmp) { cpumask_var_t *pmask1, *pmask2, *pmask3, *pmask4; if (cs) { pmask1 = &cs->cpus_allowed; pmask2 = &cs->effective_cpus; pmask3 = &cs->effective_xcpus; pmask4 = &cs->exclusive_cpus; } else { pmask1 = &tmp->new_cpus; pmask2 = &tmp->addmask; pmask3 = &tmp->delmask; pmask4 = NULL; } if (!zalloc_cpumask_var(pmask1, GFP_KERNEL)) return -ENOMEM; if (!zalloc_cpumask_var(pmask2, GFP_KERNEL)) goto free_one; if (!zalloc_cpumask_var(pmask3, GFP_KERNEL)) goto free_two; if (pmask4 && !zalloc_cpumask_var(pmask4, GFP_KERNEL)) goto free_three; return 0; free_three: free_cpumask_var(*pmask3); free_two: free_cpumask_var(*pmask2); free_one: free_cpumask_var(*pmask1); return -ENOMEM; } /** * free_cpumasks - free cpumasks in a tmpmasks structure * @cs: the cpuset that have cpumasks to be free. * @tmp: the tmpmasks structure pointer */ static inline void free_cpumasks(struct cpuset *cs, struct tmpmasks *tmp) { if (cs) { free_cpumask_var(cs->cpus_allowed); free_cpumask_var(cs->effective_cpus); free_cpumask_var(cs->effective_xcpus); free_cpumask_var(cs->exclusive_cpus); } if (tmp) { free_cpumask_var(tmp->new_cpus); free_cpumask_var(tmp->addmask); free_cpumask_var(tmp->delmask); } } /** * alloc_trial_cpuset - allocate a trial cpuset * @cs: the cpuset that the trial cpuset duplicates */ static struct cpuset *alloc_trial_cpuset(struct cpuset *cs) { struct cpuset *trial; trial = kmemdup(cs, sizeof(*cs), GFP_KERNEL); if (!trial) return NULL; if (alloc_cpumasks(trial, NULL)) { kfree(trial); return NULL; } cpumask_copy(trial->cpus_allowed, cs->cpus_allowed); cpumask_copy(trial->effective_cpus, cs->effective_cpus); cpumask_copy(trial->effective_xcpus, cs->effective_xcpus); cpumask_copy(trial->exclusive_cpus, cs->exclusive_cpus); return trial; } /** * free_cpuset - free the cpuset * @cs: the cpuset to be freed */ static inline void free_cpuset(struct cpuset *cs) { free_cpumasks(cs, NULL); kfree(cs); } static inline struct cpumask *fetch_xcpus(struct cpuset *cs) { return !cpumask_empty(cs->exclusive_cpus) ? cs->exclusive_cpus : cpumask_empty(cs->effective_xcpus) ? cs->cpus_allowed : cs->effective_xcpus; } /* * cpusets_are_exclusive() - check if two cpusets are exclusive * * Return true if exclusive, false if not */ static inline bool cpusets_are_exclusive(struct cpuset *cs1, struct cpuset *cs2) { struct cpumask *xcpus1 = fetch_xcpus(cs1); struct cpumask *xcpus2 = fetch_xcpus(cs2); if (cpumask_intersects(xcpus1, xcpus2)) return false; return true; } /* * validate_change_legacy() - Validate conditions specific to legacy (v1) * behavior. */ static int validate_change_legacy(struct cpuset *cur, struct cpuset *trial) { struct cgroup_subsys_state *css; struct cpuset *c, *par; int ret; WARN_ON_ONCE(!rcu_read_lock_held()); /* Each of our child cpusets must be a subset of us */ ret = -EBUSY; cpuset_for_each_child(c, css, cur) if (!is_cpuset_subset(c, trial)) goto out; /* On legacy hierarchy, we must be a subset of our parent cpuset. */ ret = -EACCES; par = parent_cs(cur); if (par && !is_cpuset_subset(trial, par)) goto out; ret = 0; out: return ret; } /* * validate_change() - Used to validate that any proposed cpuset change * follows the structural rules for cpusets. * * If we replaced the flag and mask values of the current cpuset * (cur) with those values in the trial cpuset (trial), would * our various subset and exclusive rules still be valid? Presumes * cpuset_mutex held. * * 'cur' is the address of an actual, in-use cpuset. Operations * such as list traversal that depend on the actual address of the * cpuset in the list must use cur below, not trial. * * 'trial' is the address of bulk structure copy of cur, with * perhaps one or more of the fields cpus_allowed, mems_allowed, * or flags changed to new, trial values. * * Return 0 if valid, -errno if not. */ static int validate_change(struct cpuset *cur, struct cpuset *trial) { struct cgroup_subsys_state *css; struct cpuset *c, *par; int ret = 0; rcu_read_lock(); if (!is_in_v2_mode()) ret = validate_change_legacy(cur, trial); if (ret) goto out; /* Remaining checks don't apply to root cpuset */ if (cur == &top_cpuset) goto out; par = parent_cs(cur); /* * Cpusets with tasks - existing or newly being attached - can't * be changed to have empty cpus_allowed or mems_allowed. */ ret = -ENOSPC; if ((cgroup_is_populated(cur->css.cgroup) || cur->attach_in_progress)) { if (!cpumask_empty(cur->cpus_allowed) && cpumask_empty(trial->cpus_allowed)) goto out; if (!nodes_empty(cur->mems_allowed) && nodes_empty(trial->mems_allowed)) goto out; } /* * We can't shrink if we won't have enough room for SCHED_DEADLINE * tasks. */ ret = -EBUSY; if (is_cpu_exclusive(cur) && !cpuset_cpumask_can_shrink(cur->cpus_allowed, trial->cpus_allowed)) goto out; /* * If either I or some sibling (!= me) is exclusive, we can't * overlap */ ret = -EINVAL; cpuset_for_each_child(c, css, par) { if ((is_cpu_exclusive(trial) || is_cpu_exclusive(c)) && c != cur) { if (!cpusets_are_exclusive(trial, c)) goto out; } if ((is_mem_exclusive(trial) || is_mem_exclusive(c)) && c != cur && nodes_intersects(trial->mems_allowed, c->mems_allowed)) goto out; } ret = 0; out: rcu_read_unlock(); return ret; } #ifdef CONFIG_SMP /* * Helper routine for generate_sched_domains(). * Do cpusets a, b have overlapping effective cpus_allowed masks? */ static int cpusets_overlap(struct cpuset *a, struct cpuset *b) { return cpumask_intersects(a->effective_cpus, b->effective_cpus); } static void update_domain_attr(struct sched_domain_attr *dattr, struct cpuset *c) { if (dattr->relax_domain_level < c->relax_domain_level) dattr->relax_domain_level = c->relax_domain_level; return; } static void update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *root_cs) { struct cpuset *cp; struct cgroup_subsys_state *pos_css; rcu_read_lock(); cpuset_for_each_descendant_pre(cp, pos_css, root_cs) { /* skip the whole subtree if @cp doesn't have any CPU */ if (cpumask_empty(cp->cpus_allowed)) { pos_css = css_rightmost_descendant(pos_css); continue; } if (is_sched_load_balance(cp)) update_domain_attr(dattr, cp); } rcu_read_unlock(); } /* Must be called with cpuset_mutex held. */ static inline int nr_cpusets(void) { /* jump label reference count + the top-level cpuset */ return static_key_count(&cpusets_enabled_key.key) + 1; } /* * generate_sched_domains() * * This function builds a partial partition of the systems CPUs * A 'partial partition' is a set of non-overlapping subsets whose * union is a subset of that set. * The output of this function needs to be passed to kernel/sched/core.c * partition_sched_domains() routine, which will rebuild the scheduler's * load balancing domains (sched domains) as specified by that partial * partition. * * See "What is sched_load_balance" in Documentation/admin-guide/cgroup-v1/cpusets.rst * for a background explanation of this. * * Does not return errors, on the theory that the callers of this * routine would rather not worry about failures to rebuild sched * domains when operating in the severe memory shortage situations * that could cause allocation failures below. * * Must be called with cpuset_mutex held. * * The three key local variables below are: * cp - cpuset pointer, used (together with pos_css) to perform a * top-down scan of all cpusets. For our purposes, rebuilding * the schedulers sched domains, we can ignore !is_sched_load_ * balance cpusets. * csa - (for CpuSet Array) Array of pointers to all the cpusets * that need to be load balanced, for convenient iterative * access by the subsequent code that finds the best partition, * i.e the set of domains (subsets) of CPUs such that the * cpus_allowed of every cpuset marked is_sched_load_balance * is a subset of one of these domains, while there are as * many such domains as possible, each as small as possible. * doms - Conversion of 'csa' to an array of cpumasks, for passing to * the kernel/sched/core.c routine partition_sched_domains() in a * convenient format, that can be easily compared to the prior * value to determine what partition elements (sched domains) * were changed (added or removed.) * * Finding the best partition (set of domains): * The triple nested loops below over i, j, k scan over the * load balanced cpusets (using the array of cpuset pointers in * csa[]) looking for pairs of cpusets that have overlapping * cpus_allowed, but which don't have the same 'pn' partition * number and gives them in the same partition number. It keeps * looping on the 'restart' label until it can no longer find * any such pairs. * * The union of the cpus_allowed masks from the set of * all cpusets having the same 'pn' value then form the one * element of the partition (one sched domain) to be passed to * partition_sched_domains(). */ static int generate_sched_domains(cpumask_var_t **domains, struct sched_domain_attr **attributes) { struct cpuset *cp; /* top-down scan of cpusets */ struct cpuset **csa; /* array of all cpuset ptrs */ int csn; /* how many cpuset ptrs in csa so far */ int i, j, k; /* indices for partition finding loops */ cpumask_var_t *doms; /* resulting partition; i.e. sched domains */ struct sched_domain_attr *dattr; /* attributes for custom domains */ int ndoms = 0; /* number of sched domains in result */ int nslot; /* next empty doms[] struct cpumask slot */ struct cgroup_subsys_state *pos_css; bool root_load_balance = is_sched_load_balance(&top_cpuset); doms = NULL; dattr = NULL; csa = NULL; /* Special case for the 99% of systems with one, full, sched domain */ if (root_load_balance && !top_cpuset.nr_subparts) { ndoms = 1; doms = alloc_sched_domains(ndoms); if (!doms) goto done; dattr = kmalloc(sizeof(struct sched_domain_attr), GFP_KERNEL); if (dattr) { *dattr = SD_ATTR_INIT; update_domain_attr_tree(dattr, &top_cpuset); } cpumask_and(doms[0], top_cpuset.effective_cpus, housekeeping_cpumask(HK_TYPE_DOMAIN)); goto done; } csa = kmalloc_array(nr_cpusets(), sizeof(cp), GFP_KERNEL); if (!csa) goto done; csn = 0; rcu_read_lock(); if (root_load_balance) csa[csn++] = &top_cpuset; cpuset_for_each_descendant_pre(cp, pos_css, &top_cpuset) { if (cp == &top_cpuset) continue; /* * Continue traversing beyond @cp iff @cp has some CPUs and * isn't load balancing. The former is obvious. The * latter: All child cpusets contain a subset of the * parent's cpus, so just skip them, and then we call * update_domain_attr_tree() to calc relax_domain_level of * the corresponding sched domain. * * If root is load-balancing, we can skip @cp if it * is a subset of the root's effective_cpus. */ if (!cpumask_empty(cp->cpus_allowed) && !(is_sched_load_balance(cp) && cpumask_intersects(cp->cpus_allowed, housekeeping_cpumask(HK_TYPE_DOMAIN)))) continue; if (root_load_balance && cpumask_subset(cp->cpus_allowed, top_cpuset.effective_cpus)) continue; if (is_sched_load_balance(cp) && !cpumask_empty(cp->effective_cpus)) csa[csn++] = cp; /* skip @cp's subtree if not a partition root */ if (!is_partition_valid(cp)) pos_css = css_rightmost_descendant(pos_css); } rcu_read_unlock(); for (i = 0; i < csn; i++) csa[i]->pn = i; ndoms = csn; restart: /* Find the best partition (set of sched domains) */ for (i = 0; i < csn; i++) { struct cpuset *a = csa[i]; int apn = a->pn; for (j = 0; j < csn; j++) { struct cpuset *b = csa[j]; int bpn = b->pn; if (apn != bpn && cpusets_overlap(a, b)) { for (k = 0; k < csn; k++) { struct cpuset *c = csa[k]; if (c->pn == bpn) c->pn = apn; } ndoms--; /* one less element */ goto restart; } } } /* * Now we know how many domains to create. * Convert <csn, csa> to <ndoms, doms> and populate cpu masks. */ doms = alloc_sched_domains(ndoms); if (!doms) goto done; /* * The rest of the code, including the scheduler, can deal with * dattr==NULL case. No need to abort if alloc fails. */ dattr = kmalloc_array(ndoms, sizeof(struct sched_domain_attr), GFP_KERNEL); for (nslot = 0, i = 0; i < csn; i++) { struct cpuset *a = csa[i]; struct cpumask *dp; int apn = a->pn; if (apn < 0) { /* Skip completed partitions */ continue; } dp = doms[nslot]; if (nslot == ndoms) { static int warnings = 10; if (warnings) { pr_warn("rebuild_sched_domains confused: nslot %d, ndoms %d, csn %d, i %d, apn %d\n", nslot, ndoms, csn, i, apn); warnings--; } continue; } cpumask_clear(dp); if (dattr) *(dattr + nslot) = SD_ATTR_INIT; for (j = i; j < csn; j++) { struct cpuset *b = csa[j]; if (apn == b->pn) { cpumask_or(dp, dp, b->effective_cpus); cpumask_and(dp, dp, housekeeping_cpumask(HK_TYPE_DOMAIN)); if (dattr) update_domain_attr_tree(dattr + nslot, b); /* Done with this partition */ b->pn = -1; } } nslot++; } BUG_ON(nslot != ndoms); done: kfree(csa); /* * Fallback to the default domain if kmalloc() failed. * See comments in partition_sched_domains(). */ if (doms == NULL) ndoms = 1; *domains = doms; *attributes = dattr; return ndoms; } static void dl_update_tasks_root_domain(struct cpuset *cs) { struct css_task_iter it; struct task_struct *task; if (cs->nr_deadline_tasks == 0) return; css_task_iter_start(&cs->css, 0, &it); while ((task = css_task_iter_next(&it))) dl_add_task_root_domain(task); css_task_iter_end(&it); } static void dl_rebuild_rd_accounting(void) { struct cpuset *cs = NULL; struct cgroup_subsys_state *pos_css; lockdep_assert_held(&cpuset_mutex); lockdep_assert_cpus_held(); lockdep_assert_held(&sched_domains_mutex); rcu_read_lock(); /* * Clear default root domain DL accounting, it will be computed again * if a task belongs to it. */ dl_clear_root_domain(&def_root_domain); cpuset_for_each_descendant_pre(cs, pos_css, &top_cpuset) { if (cpumask_empty(cs->effective_cpus)) { pos_css = css_rightmost_descendant(pos_css); continue; } css_get(&cs->css); rcu_read_unlock(); dl_update_tasks_root_domain(cs); rcu_read_lock(); css_put(&cs->css); } rcu_read_unlock(); } static void partition_and_rebuild_sched_domains(int ndoms_new, cpumask_var_t doms_new[], struct sched_domain_attr *dattr_new) { mutex_lock(&sched_domains_mutex); partition_sched_domains_locked(ndoms_new, doms_new, dattr_new); dl_rebuild_rd_accounting(); mutex_unlock(&sched_domains_mutex); } /* * Rebuild scheduler domains. * * If the flag 'sched_load_balance' of any cpuset with non-empty * 'cpus' changes, or if the 'cpus' allowed changes in any cpuset * which has that flag enabled, or if any cpuset with a non-empty * 'cpus' is removed, then call this routine to rebuild the * scheduler's dynamic sched domains. * * Call with cpuset_mutex held. Takes cpus_read_lock(). */ static void rebuild_sched_domains_locked(void) { struct cgroup_subsys_state *pos_css; struct sched_domain_attr *attr; cpumask_var_t *doms; struct cpuset *cs; int ndoms; lockdep_assert_cpus_held(); lockdep_assert_held(&cpuset_mutex); /* * If we have raced with CPU hotplug, return early to avoid * passing doms with offlined cpu to partition_sched_domains(). * Anyways, cpuset_hotplug_workfn() will rebuild sched domains. * * With no CPUs in any subpartitions, top_cpuset's effective CPUs * should be the same as the active CPUs, so checking only top_cpuset * is enough to detect racing CPU offlines. */ if (cpumask_empty(subpartitions_cpus) && !cpumask_equal(top_cpuset.effective_cpus, cpu_active_mask)) return; /* * With subpartition CPUs, however, the effective CPUs of a partition * root should be only a subset of the active CPUs. Since a CPU in any * partition root could be offlined, all must be checked. */ if (top_cpuset.nr_subparts) { rcu_read_lock(); cpuset_for_each_descendant_pre(cs, pos_css, &top_cpuset) { if (!is_partition_valid(cs)) { pos_css = css_rightmost_descendant(pos_css); continue; } if (!cpumask_subset(cs->effective_cpus, cpu_active_mask)) { rcu_read_unlock(); return; } } rcu_read_unlock(); } /* Generate domain masks and attrs */ ndoms = generate_sched_domains(&doms, &attr); /* Have scheduler rebuild the domains */ partition_and_rebuild_sched_domains(ndoms, doms, attr); } #else /* !CONFIG_SMP */ static void rebuild_sched_domains_locked(void) { } #endif /* CONFIG_SMP */ void rebuild_sched_domains(void) { cpus_read_lock(); mutex_lock(&cpuset_mutex); rebuild_sched_domains_locked(); mutex_unlock(&cpuset_mutex); cpus_read_unlock(); } /** * update_tasks_cpumask - Update the cpumasks of tasks in the cpuset. * @cs: the cpuset in which each task's cpus_allowed mask needs to be changed * @new_cpus: the temp variable for the new effective_cpus mask * * Iterate through each task of @cs updating its cpus_allowed to the * effective cpuset's. As this function is called with cpuset_mutex held, * cpuset membership stays stable. For top_cpuset, task_cpu_possible_mask() * is used instead of effective_cpus to make sure all offline CPUs are also * included as hotplug code won't update cpumasks for tasks in top_cpuset. */ static void update_tasks_cpumask(struct cpuset *cs, struct cpumask *new_cpus) { struct css_task_iter it; struct task_struct *task; bool top_cs = cs == &top_cpuset; css_task_iter_start(&cs->css, 0, &it); while ((task = css_task_iter_next(&it))) { const struct cpumask *possible_mask = task_cpu_possible_mask(task); if (top_cs) { /* * Percpu kthreads in top_cpuset are ignored */ if (kthread_is_per_cpu(task)) continue; cpumask_andnot(new_cpus, possible_mask, subpartitions_cpus); } else { cpumask_and(new_cpus, possible_mask, cs->effective_cpus); } set_cpus_allowed_ptr(task, new_cpus); } css_task_iter_end(&it); } /** * compute_effective_cpumask - Compute the effective cpumask of the cpuset * @new_cpus: the temp variable for the new effective_cpus mask * @cs: the cpuset the need to recompute the new effective_cpus mask * @parent: the parent cpuset * * The result is valid only if the given cpuset isn't a partition root. */ static void compute_effective_cpumask(struct cpumask *new_cpus, struct cpuset *cs, struct cpuset *parent) { cpumask_and(new_cpus, cs->cpus_allowed, parent->effective_cpus); } /* * Commands for update_parent_effective_cpumask */ enum partition_cmd { partcmd_enable, /* Enable partition root */ partcmd_enablei, /* Enable isolated partition root */ partcmd_disable, /* Disable partition root */ partcmd_update, /* Update parent's effective_cpus */ partcmd_invalidate, /* Make partition invalid */ }; static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, int turning_on); static void update_sibling_cpumasks(struct cpuset *parent, struct cpuset *cs, struct tmpmasks *tmp); /* * Update partition exclusive flag * * Return: 0 if successful, an error code otherwise */ static int update_partition_exclusive(struct cpuset *cs, int new_prs) { bool exclusive = (new_prs > 0); if (exclusive && !is_cpu_exclusive(cs)) { if (update_flag(CS_CPU_EXCLUSIVE, cs, 1)) return PERR_NOTEXCL; } else if (!exclusive && is_cpu_exclusive(cs)) { /* Turning off CS_CPU_EXCLUSIVE will not return error */ update_flag(CS_CPU_EXCLUSIVE, cs, 0); } return 0; } /* * Update partition load balance flag and/or rebuild sched domain * * Changing load balance flag will automatically call * rebuild_sched_domains_locked(). * This function is for cgroup v2 only. */ static void update_partition_sd_lb(struct cpuset *cs, int old_prs) { int new_prs = cs->partition_root_state; bool rebuild_domains = (new_prs > 0) || (old_prs > 0); bool new_lb; /* * If cs is not a valid partition root, the load balance state * will follow its parent. */ if (new_prs > 0) { new_lb = (new_prs != PRS_ISOLATED); } else { new_lb = is_sched_load_balance(parent_cs(cs)); } if (new_lb != !!is_sched_load_balance(cs)) { rebuild_domains = true; if (new_lb) set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags); else clear_bit(CS_SCHED_LOAD_BALANCE, &cs->flags); } if (rebuild_domains) rebuild_sched_domains_locked(); } /* * tasks_nocpu_error - Return true if tasks will have no effective_cpus */ static bool tasks_nocpu_error(struct cpuset *parent, struct cpuset *cs, struct cpumask *xcpus) { /* * A populated partition (cs or parent) can't have empty effective_cpus */ return (cpumask_subset(parent->effective_cpus, xcpus) && partition_is_populated(parent, cs)) || (!cpumask_intersects(xcpus, cpu_active_mask) && partition_is_populated(cs, NULL)); } static void reset_partition_data(struct cpuset *cs) { struct cpuset *parent = parent_cs(cs); if (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys)) return; lockdep_assert_held(&callback_lock); cs->nr_subparts = 0; if (cpumask_empty(cs->exclusive_cpus)) { cpumask_clear(cs->effective_xcpus); if (is_cpu_exclusive(cs)) clear_bit(CS_CPU_EXCLUSIVE, &cs->flags); } if (!cpumask_and(cs->effective_cpus, parent->effective_cpus, cs->cpus_allowed)) { cs->use_parent_ecpus = true; parent->child_ecpus_count++; cpumask_copy(cs->effective_cpus, parent->effective_cpus); } } /* * partition_xcpus_newstate - Exclusive CPUs state change * @old_prs: old partition_root_state * @new_prs: new partition_root_state * @xcpus: exclusive CPUs with state change */ static void partition_xcpus_newstate(int old_prs, int new_prs, struct cpumask *xcpus) { WARN_ON_ONCE(old_prs == new_prs); if (new_prs == PRS_ISOLATED) cpumask_or(isolated_cpus, isolated_cpus, xcpus); else cpumask_andnot(isolated_cpus, isolated_cpus, xcpus); } /* * partition_xcpus_add - Add new exclusive CPUs to partition * @new_prs: new partition_root_state * @parent: parent cpuset * @xcpus: exclusive CPUs to be added * Return: true if isolated_cpus modified, false otherwise * * Remote partition if parent == NULL */ static bool partition_xcpus_add(int new_prs, struct cpuset *parent, struct cpumask *xcpus) { bool isolcpus_updated; WARN_ON_ONCE(new_prs < 0); lockdep_assert_held(&callback_lock); if (!parent) parent = &top_cpuset; if (parent == &top_cpuset) cpumask_or(subpartitions_cpus, subpartitions_cpus, xcpus); isolcpus_updated = (new_prs != parent->partition_root_state); if (isolcpus_updated) partition_xcpus_newstate(parent->partition_root_state, new_prs, xcpus); cpumask_andnot(parent->effective_cpus, parent->effective_cpus, xcpus); return isolcpus_updated; } /* * partition_xcpus_del - Remove exclusive CPUs from partition * @old_prs: old partition_root_state * @parent: parent cpuset * @xcpus: exclusive CPUs to be removed * Return: true if isolated_cpus modified, false otherwise * * Remote partition if parent == NULL */ static bool partition_xcpus_del(int old_prs, struct cpuset *parent, struct cpumask *xcpus) { bool isolcpus_updated; WARN_ON_ONCE(old_prs < 0); lockdep_assert_held(&callback_lock); if (!parent) parent = &top_cpuset; if (parent == &top_cpuset) cpumask_andnot(subpartitions_cpus, subpartitions_cpus, xcpus); isolcpus_updated = (old_prs != parent->partition_root_state); if (isolcpus_updated) partition_xcpus_newstate(old_prs, parent->partition_root_state, xcpus); cpumask_and(xcpus, xcpus, cpu_active_mask); cpumask_or(parent->effective_cpus, parent->effective_cpus, xcpus); return isolcpus_updated; } static void update_unbound_workqueue_cpumask(bool isolcpus_updated) { int ret; lockdep_assert_cpus_held(); if (!isolcpus_updated) return; ret = workqueue_unbound_exclude_cpumask(isolated_cpus); WARN_ON_ONCE(ret < 0); } /** * cpuset_cpu_is_isolated - Check if the given CPU is isolated * @cpu: the CPU number to be checked * Return: true if CPU is used in an isolated partition, false otherwise */ bool cpuset_cpu_is_isolated(int cpu) { return cpumask_test_cpu(cpu, isolated_cpus); } EXPORT_SYMBOL_GPL(cpuset_cpu_is_isolated); /* * compute_effective_exclusive_cpumask - compute effective exclusive CPUs * @cs: cpuset * @xcpus: effective exclusive CPUs value to be set * Return: true if xcpus is not empty, false otherwise. * * Starting with exclusive_cpus (cpus_allowed if exclusive_cpus is not set), * it must be a subset of cpus_allowed and parent's effective_xcpus. */ static bool compute_effective_exclusive_cpumask(struct cpuset *cs, struct cpumask *xcpus) { struct cpuset *parent = parent_cs(cs); if (!xcpus) xcpus = cs->effective_xcpus; if (!cpumask_empty(cs->exclusive_cpus)) cpumask_and(xcpus, cs->exclusive_cpus, cs->cpus_allowed); else cpumask_copy(xcpus, cs->cpus_allowed); return cpumask_and(xcpus, xcpus, parent->effective_xcpus); } static inline bool is_remote_partition(struct cpuset *cs) { return !list_empty(&cs->remote_sibling); } static inline bool is_local_partition(struct cpuset *cs) { return is_partition_valid(cs) && !is_remote_partition(cs); } /* * remote_partition_enable - Enable current cpuset as a remote partition root * @cs: the cpuset to update * @new_prs: new partition_root_state * @tmp: temparary masks * Return: 1 if successful, 0 if error * * Enable the current cpuset to become a remote partition root taking CPUs * directly from the top cpuset. cpuset_mutex must be held by the caller. */ static int remote_partition_enable(struct cpuset *cs, int new_prs, struct tmpmasks *tmp) { bool isolcpus_updated; /* * The user must have sysadmin privilege. */ if (!capable(CAP_SYS_ADMIN)) return 0; /* * The requested exclusive_cpus must not be allocated to other * partitions and it can't use up all the root's effective_cpus. * * Note that if there is any local partition root above it or * remote partition root underneath it, its exclusive_cpus must * have overlapped with subpartitions_cpus. */ compute_effective_exclusive_cpumask(cs, tmp->new_cpus); if (cpumask_empty(tmp->new_cpus) || cpumask_intersects(tmp->new_cpus, subpartitions_cpus) || cpumask_subset(top_cpuset.effective_cpus, tmp->new_cpus)) return 0; spin_lock_irq(&callback_lock); isolcpus_updated = partition_xcpus_add(new_prs, NULL, tmp->new_cpus); list_add(&cs->remote_sibling, &remote_children); if (cs->use_parent_ecpus) { struct cpuset *parent = parent_cs(cs); cs->use_parent_ecpus = false; parent->child_ecpus_count--; } spin_unlock_irq(&callback_lock); update_unbound_workqueue_cpumask(isolcpus_updated); /* * Proprogate changes in top_cpuset's effective_cpus down the hierarchy. */ update_tasks_cpumask(&top_cpuset, tmp->new_cpus); update_sibling_cpumasks(&top_cpuset, NULL, tmp); return 1; } /* * remote_partition_disable - Remove current cpuset from remote partition list * @cs: the cpuset to update * @tmp: temparary masks * * The effective_cpus is also updated. * * cpuset_mutex must be held by the caller. */ static void remote_partition_disable(struct cpuset *cs, struct tmpmasks *tmp) { bool isolcpus_updated; compute_effective_exclusive_cpumask(cs, tmp->new_cpus); WARN_ON_ONCE(!is_remote_partition(cs)); WARN_ON_ONCE(!cpumask_subset(tmp->new_cpus, subpartitions_cpus)); spin_lock_irq(&callback_lock); list_del_init(&cs->remote_sibling); isolcpus_updated = partition_xcpus_del(cs->partition_root_state, NULL, tmp->new_cpus); cs->partition_root_state = -cs->partition_root_state; if (!cs->prs_err) cs->prs_err = PERR_INVCPUS; reset_partition_data(cs); spin_unlock_irq(&callback_lock); update_unbound_workqueue_cpumask(isolcpus_updated); /* * Proprogate changes in top_cpuset's effective_cpus down the hierarchy. */ update_tasks_cpumask(&top_cpuset, tmp->new_cpus); update_sibling_cpumasks(&top_cpuset, NULL, tmp); } /* * remote_cpus_update - cpus_exclusive change of remote partition * @cs: the cpuset to be updated * @newmask: the new effective_xcpus mask * @tmp: temparary masks * * top_cpuset and subpartitions_cpus will be updated or partition can be * invalidated. */ static void remote_cpus_update(struct cpuset *cs, struct cpumask *newmask, struct tmpmasks *tmp) { bool adding, deleting; int prs = cs->partition_root_state; int isolcpus_updated = 0; if (WARN_ON_ONCE(!is_remote_partition(cs))) return; WARN_ON_ONCE(!cpumask_subset(cs->effective_xcpus, subpartitions_cpus)); if (cpumask_empty(newmask)) goto invalidate; adding = cpumask_andnot(tmp->addmask, newmask, cs->effective_xcpus); deleting = cpumask_andnot(tmp->delmask, cs->effective_xcpus, newmask); /* * Additions of remote CPUs is only allowed if those CPUs are * not allocated to other partitions and there are effective_cpus * left in the top cpuset. */ if (adding && (!capable(CAP_SYS_ADMIN) || cpumask_intersects(tmp->addmask, subpartitions_cpus) || cpumask_subset(top_cpuset.effective_cpus, tmp->addmask))) goto invalidate; spin_lock_irq(&callback_lock); if (adding) isolcpus_updated += partition_xcpus_add(prs, NULL, tmp->addmask); if (deleting) isolcpus_updated += partition_xcpus_del(prs, NULL, tmp->delmask); spin_unlock_irq(&callback_lock); update_unbound_workqueue_cpumask(isolcpus_updated); /* * Proprogate changes in top_cpuset's effective_cpus down the hierarchy. */ update_tasks_cpumask(&top_cpuset, tmp->new_cpus); update_sibling_cpumasks(&top_cpuset, NULL, tmp); return; invalidate: remote_partition_disable(cs, tmp); } /* * remote_partition_check - check if a child remote partition needs update * @cs: the cpuset to be updated * @newmask: the new effective_xcpus mask * @delmask: temporary mask for deletion (not in tmp) * @tmp: temparary masks * * This should be called before the given cs has updated its cpus_allowed * and/or effective_xcpus. */ static void remote_partition_check(struct cpuset *cs, struct cpumask *newmask, struct cpumask *delmask, struct tmpmasks *tmp) { struct cpuset *child, *next; int disable_cnt = 0; /* * Compute the effective exclusive CPUs that will be deleted. */ if (!cpumask_andnot(delmask, cs->effective_xcpus, newmask) || !cpumask_intersects(delmask, subpartitions_cpus)) return; /* No deletion of exclusive CPUs in partitions */ /* * Searching the remote children list to look for those that will * be impacted by the deletion of exclusive CPUs. * * Since a cpuset must be removed from the remote children list * before it can go offline and holding cpuset_mutex will prevent * any change in cpuset status. RCU read lock isn't needed. */ lockdep_assert_held(&cpuset_mutex); list_for_each_entry_safe(child, next, &remote_children, remote_sibling) if (cpumask_intersects(child->effective_cpus, delmask)) { remote_partition_disable(child, tmp); disable_cnt++; } if (disable_cnt) rebuild_sched_domains_locked(); } /* * prstate_housekeeping_conflict - check for partition & housekeeping conflicts * @prstate: partition root state to be checked * @new_cpus: cpu mask * Return: true if there is conflict, false otherwise * * CPUs outside of housekeeping_cpumask(HK_TYPE_DOMAIN) can only be used in * an isolated partition. */ static bool prstate_housekeeping_conflict(int prstate, struct cpumask *new_cpus) { const struct cpumask *hk_domain = housekeeping_cpumask(HK_TYPE_DOMAIN); bool all_in_hk = cpumask_subset(new_cpus, hk_domain); if (!all_in_hk && (prstate != PRS_ISOLATED)) return true; return false; } /** * update_parent_effective_cpumask - update effective_cpus mask of parent cpuset * @cs: The cpuset that requests change in partition root state * @cmd: Partition root state change command * @newmask: Optional new cpumask for partcmd_update * @tmp: Temporary addmask and delmask * Return: 0 or a partition root state error code * * For partcmd_enable*, the cpuset is being transformed from a non-partition * root to a partition root. The effective_xcpus (cpus_allowed if * effective_xcpus not set) mask of the given cpuset will be taken away from * parent's effective_cpus. The function will return 0 if all the CPUs listed * in effective_xcpus can be granted or an error code will be returned. * * For partcmd_disable, the cpuset is being transformed from a partition * root back to a non-partition root. Any CPUs in effective_xcpus will be * given back to parent's effective_cpus. 0 will always be returned. * * For partcmd_update, if the optional newmask is specified, the cpu list is * to be changed from effective_xcpus to newmask. Otherwise, effective_xcpus is * assumed to remain the same. The cpuset should either be a valid or invalid * partition root. The partition root state may change from valid to invalid * or vice versa. An error code will be returned if transitioning from * invalid to valid violates the exclusivity rule. * * For partcmd_invalidate, the current partition will be made invalid. * * The partcmd_enable* and partcmd_disable commands are used by * update_prstate(). An error code may be returned and the caller will check * for error. * * The partcmd_update command is used by update_cpumasks_hier() with newmask * NULL and update_cpumask() with newmask set. The partcmd_invalidate is used * by update_cpumask() with NULL newmask. In both cases, the callers won't * check for error and so partition_root_state and prs_error will be updated * directly. */ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd, struct cpumask *newmask, struct tmpmasks *tmp) { struct cpuset *parent = parent_cs(cs); int adding; /* Adding cpus to parent's effective_cpus */ int deleting; /* Deleting cpus from parent's effective_cpus */ int old_prs, new_prs; int part_error = PERR_NONE; /* Partition error? */ int subparts_delta = 0; struct cpumask *xcpus; /* cs effective_xcpus */ int isolcpus_updated = 0; bool nocpu; lockdep_assert_held(&cpuset_mutex); /* * new_prs will only be changed for the partcmd_update and * partcmd_invalidate commands. */ adding = deleting = false; old_prs = new_prs = cs->partition_root_state; xcpus = !cpumask_empty(cs->exclusive_cpus) ? cs->effective_xcpus : cs->cpus_allowed; if (cmd == partcmd_invalidate) { if (is_prs_invalid(old_prs)) return 0; /* * Make the current partition invalid. */ if (is_partition_valid(parent)) adding = cpumask_and(tmp->addmask, xcpus, parent->effective_xcpus); if (old_prs > 0) { new_prs = -old_prs; subparts_delta--; } goto write_error; } /* * The parent must be a partition root. * The new cpumask, if present, or the current cpus_allowed must * not be empty. */ if (!is_partition_valid(parent)) { return is_partition_invalid(parent) ? PERR_INVPARENT : PERR_NOTPART; } if (!newmask && cpumask_empty(cs->cpus_allowed)) return PERR_CPUSEMPTY; nocpu = tasks_nocpu_error(parent, cs, xcpus); if ((cmd == partcmd_enable) || (cmd == partcmd_enablei)) { /* * Enabling partition root is not allowed if its * effective_xcpus is empty or doesn't overlap with * parent's effective_xcpus. */ if (cpumask_empty(xcpus) || !cpumask_intersects(xcpus, parent->effective_xcpus)) return PERR_INVCPUS; if (prstate_housekeeping_conflict(new_prs, xcpus)) return PERR_HKEEPING; /* * A parent can be left with no CPU as long as there is no * task directly associated with the parent partition. */ if (nocpu) return PERR_NOCPUS; cpumask_copy(tmp->delmask, xcpus); deleting = true; subparts_delta++; new_prs = (cmd == partcmd_enable) ? PRS_ROOT : PRS_ISOLATED; } else if (cmd == partcmd_disable) { /* * May need to add cpus to parent's effective_cpus for * valid partition root. */ adding = !is_prs_invalid(old_prs) && cpumask_and(tmp->addmask, xcpus, parent->effective_xcpus); if (adding) subparts_delta--; new_prs = PRS_MEMBER; } else if (newmask) { /* * Empty cpumask is not allowed */ if (cpumask_empty(newmask)) { part_error = PERR_CPUSEMPTY; goto write_error; } /* * partcmd_update with newmask: * * Compute add/delete mask to/from effective_cpus * * For valid partition: * addmask = exclusive_cpus & ~newmask * & parent->effective_xcpus * delmask = newmask & ~exclusive_cpus * & parent->effective_xcpus * * For invalid partition: * delmask = newmask & parent->effective_xcpus */ if (is_prs_invalid(old_prs)) { adding = false; deleting = cpumask_and(tmp->delmask, newmask, parent->effective_xcpus); } else { cpumask_andnot(tmp->addmask, xcpus, newmask); adding = cpumask_and(tmp->addmask, tmp->addmask, parent->effective_xcpus); cpumask_andnot(tmp->delmask, newmask, xcpus); deleting = cpumask_and(tmp->delmask, tmp->delmask, parent->effective_xcpus); } /* * Make partition invalid if parent's effective_cpus could * become empty and there are tasks in the parent. */ if (nocpu && (!adding || !cpumask_intersects(tmp->addmask, cpu_active_mask))) { part_error = PERR_NOCPUS; deleting = false; adding = cpumask_and(tmp->addmask, xcpus, parent->effective_xcpus); } } else { /* * partcmd_update w/o newmask * * delmask = effective_xcpus & parent->effective_cpus * * This can be called from: * 1) update_cpumasks_hier() * 2) cpuset_hotplug_update_tasks() * * Check to see if it can be transitioned from valid to * invalid partition or vice versa. * * A partition error happens when parent has tasks and all * its effective CPUs will have to be distributed out. */ WARN_ON_ONCE(!is_partition_valid(parent)); if (nocpu) { part_error = PERR_NOCPUS; if (is_partition_valid(cs)) adding = cpumask_and(tmp->addmask, xcpus, parent->effective_xcpus); } else if (is_partition_invalid(cs) && cpumask_subset(xcpus, parent->effective_xcpus)) { struct cgroup_subsys_state *css; struct cpuset *child; bool exclusive = true; /* * Convert invalid partition to valid has to * pass the cpu exclusivity test. */ rcu_read_lock(); cpuset_for_each_child(child, css, parent) { if (child == cs) continue; if (!cpusets_are_exclusive(cs, child)) { exclusive = false; break; } } rcu_read_unlock(); if (exclusive) deleting = cpumask_and(tmp->delmask, xcpus, parent->effective_cpus); else part_error = PERR_NOTEXCL; } } write_error: if (part_error) WRITE_ONCE(cs->prs_err, part_error); if (cmd == partcmd_update) { /* * Check for possible transition between valid and invalid * partition root. */ switch (cs->partition_root_state) { case PRS_ROOT: case PRS_ISOLATED: if (part_error) { new_prs = -old_prs; subparts_delta--; } break; case PRS_INVALID_ROOT: case PRS_INVALID_ISOLATED: if (!part_error) { new_prs = -old_prs; subparts_delta++; } break; } } if (!adding && !deleting && (new_prs == old_prs)) return 0; /* * Transitioning between invalid to valid or vice versa may require * changing CS_CPU_EXCLUSIVE. In the case of partcmd_update, * validate_change() has already been successfully called and * CPU lists in cs haven't been updated yet. So defer it to later. */ if ((old_prs != new_prs) && (cmd != partcmd_update)) { int err = update_partition_exclusive(cs, new_prs); if (err) return err; } /* * Change the parent's effective_cpus & effective_xcpus (top cpuset * only). * * Newly added CPUs will be removed from effective_cpus and * newly deleted ones will be added back to effective_cpus. */ spin_lock_irq(&callback_lock); if (old_prs != new_prs) { cs->partition_root_state = new_prs; if (new_prs <= 0) cs->nr_subparts = 0; } /* * Adding to parent's effective_cpus means deletion CPUs from cs * and vice versa. */ if (adding) isolcpus_updated += partition_xcpus_del(old_prs, parent, tmp->addmask); if (deleting) isolcpus_updated += partition_xcpus_add(new_prs, parent, tmp->delmask); if (is_partition_valid(parent)) { parent->nr_subparts += subparts_delta; WARN_ON_ONCE(parent->nr_subparts < 0); } spin_unlock_irq(&callback_lock); update_unbound_workqueue_cpumask(isolcpus_updated); if ((old_prs != new_prs) && (cmd == partcmd_update)) update_partition_exclusive(cs, new_prs); if (adding || deleting) { update_tasks_cpumask(parent, tmp->addmask); update_sibling_cpumasks(parent, cs, tmp); } /* * For partcmd_update without newmask, it is being called from * cpuset_hotplug_workfn() where cpus_read_lock() wasn't taken. * Update the load balance flag and scheduling domain if * cpus_read_trylock() is successful. */ if ((cmd == partcmd_update) && !newmask && cpus_read_trylock()) { update_partition_sd_lb(cs, old_prs); cpus_read_unlock(); } notify_partition_change(cs, old_prs); return 0; } /** * compute_partition_effective_cpumask - compute effective_cpus for partition * @cs: partition root cpuset * @new_ecpus: previously computed effective_cpus to be updated * * Compute the effective_cpus of a partition root by scanning effective_xcpus * of child partition roots and excluding their effective_xcpus. * * This has the side effect of invalidating valid child partition roots, * if necessary. Since it is called from either cpuset_hotplug_update_tasks() * or update_cpumasks_hier() where parent and children are modified * successively, we don't need to call update_parent_effective_cpumask() * and the child's effective_cpus will be updated in later iterations. * * Note that rcu_read_lock() is assumed to be held. */ static void compute_partition_effective_cpumask(struct cpuset *cs, struct cpumask *new_ecpus) { struct cgroup_subsys_state *css; struct cpuset *child; bool populated = partition_is_populated(cs, NULL); /* * Check child partition roots to see if they should be * invalidated when * 1) child effective_xcpus not a subset of new * excluisve_cpus * 2) All the effective_cpus will be used up and cp * has tasks */ compute_effective_exclusive_cpumask(cs, new_ecpus); cpumask_and(new_ecpus, new_ecpus, cpu_active_mask); rcu_read_lock(); cpuset_for_each_child(child, css, cs) { if (!is_partition_valid(child)) continue; child->prs_err = 0; if (!cpumask_subset(child->effective_xcpus, cs->effective_xcpus)) child->prs_err = PERR_INVCPUS; else if (populated && cpumask_subset(new_ecpus, child->effective_xcpus)) child->prs_err = PERR_NOCPUS; if (child->prs_err) { int old_prs = child->partition_root_state; /* * Invalidate child partition */ spin_lock_irq(&callback_lock); make_partition_invalid(child); cs->nr_subparts--; child->nr_subparts = 0; spin_unlock_irq(&callback_lock); notify_partition_change(child, old_prs); continue; } cpumask_andnot(new_ecpus, new_ecpus, child->effective_xcpus); } rcu_read_unlock(); } /* * update_cpumasks_hier() flags */ #define HIER_CHECKALL 0x01 /* Check all cpusets with no skipping */ #define HIER_NO_SD_REBUILD 0x02 /* Don't rebuild sched domains */ /* * update_cpumasks_hier - Update effective cpumasks and tasks in the subtree * @cs: the cpuset to consider * @tmp: temp variables for calculating effective_cpus & partition setup * @force: don't skip any descendant cpusets if set * * When configured cpumask is changed, the effective cpumasks of this cpuset * and all its descendants need to be updated. * * On legacy hierarchy, effective_cpus will be the same with cpu_allowed. * * Called with cpuset_mutex held */ static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp, int flags) { struct cpuset *cp; struct cgroup_subsys_state *pos_css; bool need_rebuild_sched_domains = false; int old_prs, new_prs; rcu_read_lock(); cpuset_for_each_descendant_pre(cp, pos_css, cs) { struct cpuset *parent = parent_cs(cp); bool remote = is_remote_partition(cp); bool update_parent = false; /* * Skip descendent remote partition that acquires CPUs * directly from top cpuset unless it is cs. */ if (remote && (cp != cs)) { pos_css = css_rightmost_descendant(pos_css); continue; } /* * Update effective_xcpus if exclusive_cpus set. * The case when exclusive_cpus isn't set is handled later. */ if (!cpumask_empty(cp->exclusive_cpus) && (cp != cs)) { spin_lock_irq(&callback_lock); compute_effective_exclusive_cpumask(cp, NULL); spin_unlock_irq(&callback_lock); } old_prs = new_prs = cp->partition_root_state; if (remote || (is_partition_valid(parent) && is_partition_valid(cp))) compute_partition_effective_cpumask(cp, tmp->new_cpus); else compute_effective_cpumask(tmp->new_cpus, cp, parent); /* * A partition with no effective_cpus is allowed as long as * there is no task associated with it. Call * update_parent_effective_cpumask() to check it. */ if (is_partition_valid(cp) && cpumask_empty(tmp->new_cpus)) { update_parent = true; goto update_parent_effective; } /* * If it becomes empty, inherit the effective mask of the * parent, which is guaranteed to have some CPUs unless * it is a partition root that has explicitly distributed * out all its CPUs. */ if (is_in_v2_mode() && !remote && cpumask_empty(tmp->new_cpus)) { cpumask_copy(tmp->new_cpus, parent->effective_cpus); if (!cp->use_parent_ecpus) { cp->use_parent_ecpus = true; parent->child_ecpus_count++; } } else if (cp->use_parent_ecpus) { cp->use_parent_ecpus = false; WARN_ON_ONCE(!parent->child_ecpus_count); parent->child_ecpus_count--; } if (remote) goto get_css; /* * Skip the whole subtree if * 1) the cpumask remains the same, * 2) has no partition root state, * 3) HIER_CHECKALL flag not set, and * 4) for v2 load balance state same as its parent. */ if (!cp->partition_root_state && !(flags & HIER_CHECKALL) && cpumask_equal(tmp->new_cpus, cp->effective_cpus) && (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) || (is_sched_load_balance(parent) == is_sched_load_balance(cp)))) { pos_css = css_rightmost_descendant(pos_css); continue; } update_parent_effective: /* * update_parent_effective_cpumask() should have been called * for cs already in update_cpumask(). We should also call * update_tasks_cpumask() again for tasks in the parent * cpuset if the parent's effective_cpus changes. */ if ((cp != cs) && old_prs) { switch (parent->partition_root_state) { case PRS_ROOT: case PRS_ISOLATED: update_parent = true; break; default: /* * When parent is not a partition root or is * invalid, child partition roots become * invalid too. */ if (is_partition_valid(cp)) new_prs = -cp->partition_root_state; WRITE_ONCE(cp->prs_err, is_partition_invalid(parent) ? PERR_INVPARENT : PERR_NOTPART); break; } } get_css: if (!css_tryget_online(&cp->css)) continue; rcu_read_unlock(); if (update_parent) { update_parent_effective_cpumask(cp, partcmd_update, NULL, tmp); /* * The cpuset partition_root_state may become * invalid. Capture it. */ new_prs = cp->partition_root_state; } spin_lock_irq(&callback_lock); cpumask_copy(cp->effective_cpus, tmp->new_cpus); cp->partition_root_state = new_prs; /* * Make sure effective_xcpus is properly set for a valid * partition root. */ if ((new_prs > 0) && cpumask_empty(cp->exclusive_cpus)) cpumask_and(cp->effective_xcpus, cp->cpus_allowed, parent->effective_xcpus); else if (new_prs < 0) reset_partition_data(cp); spin_unlock_irq(&callback_lock); notify_partition_change(cp, old_prs); WARN_ON(!is_in_v2_mode() && !cpumask_equal(cp->cpus_allowed, cp->effective_cpus)); update_tasks_cpumask(cp, cp->effective_cpus); /* * On default hierarchy, inherit the CS_SCHED_LOAD_BALANCE * from parent if current cpuset isn't a valid partition root * and their load balance states differ. */ if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys) && !is_partition_valid(cp) && (is_sched_load_balance(parent) != is_sched_load_balance(cp))) { if (is_sched_load_balance(parent)) set_bit(CS_SCHED_LOAD_BALANCE, &cp->flags); else clear_bit(CS_SCHED_LOAD_BALANCE, &cp->flags); } /* * On legacy hierarchy, if the effective cpumask of any non- * empty cpuset is changed, we need to rebuild sched domains. * On default hierarchy, the cpuset needs to be a partition * root as well. */ if (!cpumask_empty(cp->cpus_allowed) && is_sched_load_balance(cp) && (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) || is_partition_valid(cp))) need_rebuild_sched_domains = true; rcu_read_lock(); css_put(&cp->css); } rcu_read_unlock(); if (need_rebuild_sched_domains && !(flags & HIER_NO_SD_REBUILD)) rebuild_sched_domains_locked(); } /** * update_sibling_cpumasks - Update siblings cpumasks * @parent: Parent cpuset * @cs: Current cpuset * @tmp: Temp variables */ static void update_sibling_cpumasks(struct cpuset *parent, struct cpuset *cs, struct tmpmasks *tmp) { struct cpuset *sibling; struct cgroup_subsys_state *pos_css; lockdep_assert_held(&cpuset_mutex); /* * Check all its siblings and call update_cpumasks_hier() * if their effective_cpus will need to be changed. * * With the addition of effective_xcpus which is a subset of * cpus_allowed. It is possible a change in parent's effective_cpus * due to a change in a child partition's effective_xcpus will impact * its siblings even if they do not inherit parent's effective_cpus * directly. * * The update_cpumasks_hier() function may sleep. So we have to * release the RCU read lock before calling it. HIER_NO_SD_REBUILD * flag is used to suppress rebuild of sched domains as the callers * will take care of that. */ rcu_read_lock(); cpuset_for_each_child(sibling, pos_css, parent) { if (sibling == cs) continue; if (!sibling->use_parent_ecpus && !is_partition_valid(sibling)) { compute_effective_cpumask(tmp->new_cpus, sibling, parent); if (cpumask_equal(tmp->new_cpus, sibling->effective_cpus)) continue; } if (!css_tryget_online(&sibling->css)) continue; rcu_read_unlock(); update_cpumasks_hier(sibling, tmp, HIER_NO_SD_REBUILD); rcu_read_lock(); css_put(&sibling->css); } rcu_read_unlock(); } /** * update_cpumask - update the cpus_allowed mask of a cpuset and all tasks in it * @cs: the cpuset to consider * @trialcs: trial cpuset * @buf: buffer of cpu numbers written to this cpuset */ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, const char *buf) { int retval; struct tmpmasks tmp; struct cpuset *parent = parent_cs(cs); bool invalidate = false; int hier_flags = 0; int old_prs = cs->partition_root_state; /* top_cpuset.cpus_allowed tracks cpu_online_mask; it's read-only */ if (cs == &top_cpuset) return -EACCES; /* * An empty cpus_allowed is ok only if the cpuset has no tasks. * Since cpulist_parse() fails on an empty mask, we special case * that parsing. The validate_change() call ensures that cpusets * with tasks have cpus. */ if (!*buf) { cpumask_clear(trialcs->cpus_allowed); cpumask_clear(trialcs->effective_xcpus); } else { retval = cpulist_parse(buf, trialcs->cpus_allowed); if (retval < 0) return retval; if (!cpumask_subset(trialcs->cpus_allowed, top_cpuset.cpus_allowed)) return -EINVAL; /* * When exclusive_cpus isn't explicitly set, it is constrainted * by cpus_allowed and parent's effective_xcpus. Otherwise, * trialcs->effective_xcpus is used as a temporary cpumask * for checking validity of the partition root. */ if (!cpumask_empty(trialcs->exclusive_cpus) || is_partition_valid(cs)) compute_effective_exclusive_cpumask(trialcs, NULL); } /* Nothing to do if the cpus didn't change */ if (cpumask_equal(cs->cpus_allowed, trialcs->cpus_allowed)) return 0; if (alloc_cpumasks(NULL, &tmp)) return -ENOMEM; if (old_prs) { if (is_partition_valid(cs) && cpumask_empty(trialcs->effective_xcpus)) { invalidate = true; cs->prs_err = PERR_INVCPUS; } else if (prstate_housekeeping_conflict(old_prs, trialcs->effective_xcpus)) { invalidate = true; cs->prs_err = PERR_HKEEPING; } else if (tasks_nocpu_error(parent, cs, trialcs->effective_xcpus)) { invalidate = true; cs->prs_err = PERR_NOCPUS; } } /* * Check all the descendants in update_cpumasks_hier() if * effective_xcpus is to be changed. */ if (!cpumask_equal(cs->effective_xcpus, trialcs->effective_xcpus)) hier_flags = HIER_CHECKALL; retval = validate_change(cs, trialcs); if ((retval == -EINVAL) && cgroup_subsys_on_dfl(cpuset_cgrp_subsys)) { struct cgroup_subsys_state *css; struct cpuset *cp; /* * The -EINVAL error code indicates that partition sibling * CPU exclusivity rule has been violated. We still allow * the cpumask change to proceed while invalidating the * partition. However, any conflicting sibling partitions * have to be marked as invalid too. */ invalidate = true; rcu_read_lock(); cpuset_for_each_child(cp, css, parent) { struct cpumask *xcpus = fetch_xcpus(trialcs); if (is_partition_valid(cp) && cpumask_intersects(xcpus, cp->effective_xcpus)) { rcu_read_unlock(); update_parent_effective_cpumask(cp, partcmd_invalidate, NULL, &tmp); rcu_read_lock(); } } rcu_read_unlock(); retval = 0; } if (retval < 0) goto out_free; if (is_partition_valid(cs) || (is_partition_invalid(cs) && !invalidate)) { struct cpumask *xcpus = trialcs->effective_xcpus; if (cpumask_empty(xcpus) && is_partition_invalid(cs)) xcpus = trialcs->cpus_allowed; /* * Call remote_cpus_update() to handle valid remote partition */ if (is_remote_partition(cs)) remote_cpus_update(cs, xcpus, &tmp); else if (invalidate) update_parent_effective_cpumask(cs, partcmd_invalidate, NULL, &tmp); else update_parent_effective_cpumask(cs, partcmd_update, xcpus, &tmp); } else if (!cpumask_empty(cs->exclusive_cpus)) { /* * Use trialcs->effective_cpus as a temp cpumask */ remote_partition_check(cs, trialcs->effective_xcpus, trialcs->effective_cpus, &tmp); } spin_lock_irq(&callback_lock); cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed); cpumask_copy(cs->effective_xcpus, trialcs->effective_xcpus); if ((old_prs > 0) && !is_partition_valid(cs)) reset_partition_data(cs); spin_unlock_irq(&callback_lock); /* effective_cpus/effective_xcpus will be updated here */ update_cpumasks_hier(cs, &tmp, hier_flags); /* Update CS_SCHED_LOAD_BALANCE and/or sched_domains, if necessary */ if (cs->partition_root_state) update_partition_sd_lb(cs, old_prs); out_free: free_cpumasks(NULL, &tmp); return 0; } /** * update_exclusive_cpumask - update the exclusive_cpus mask of a cpuset * @cs: the cpuset to consider * @trialcs: trial cpuset * @buf: buffer of cpu numbers written to this cpuset * * The tasks' cpumask will be updated if cs is a valid partition root. */ static int update_exclusive_cpumask(struct cpuset *cs, struct cpuset *trialcs, const char *buf) { int retval; struct tmpmasks tmp; struct cpuset *parent = parent_cs(cs); bool invalidate = false; int hier_flags = 0; int old_prs = cs->partition_root_state; if (!*buf) { cpumask_clear(trialcs->exclusive_cpus); cpumask_clear(trialcs->effective_xcpus); } else { retval = cpulist_parse(buf, trialcs->exclusive_cpus); if (retval < 0) return retval; if (!is_cpu_exclusive(cs)) set_bit(CS_CPU_EXCLUSIVE, &trialcs->flags); } /* Nothing to do if the CPUs didn't change */ if (cpumask_equal(cs->exclusive_cpus, trialcs->exclusive_cpus)) return 0; if (alloc_cpumasks(NULL, &tmp)) return -ENOMEM; if (*buf) compute_effective_exclusive_cpumask(trialcs, NULL); /* * Check all the descendants in update_cpumasks_hier() if * effective_xcpus is to be changed. */ if (!cpumask_equal(cs->effective_xcpus, trialcs->effective_xcpus)) hier_flags = HIER_CHECKALL; retval = validate_change(cs, trialcs); if (retval) return retval; if (old_prs) { if (cpumask_empty(trialcs->effective_xcpus)) { invalidate = true; cs->prs_err = PERR_INVCPUS; } else if (prstate_housekeeping_conflict(old_prs, trialcs->effective_xcpus)) { invalidate = true; cs->prs_err = PERR_HKEEPING; } else if (tasks_nocpu_error(parent, cs, trialcs->effective_xcpus)) { invalidate = true; cs->prs_err = PERR_NOCPUS; } if (is_remote_partition(cs)) { if (invalidate) remote_partition_disable(cs, &tmp); else remote_cpus_update(cs, trialcs->effective_xcpus, &tmp); } else if (invalidate) { update_parent_effective_cpumask(cs, partcmd_invalidate, NULL, &tmp); } else { update_parent_effective_cpumask(cs, partcmd_update, trialcs->effective_xcpus, &tmp); } } else if (!cpumask_empty(trialcs->exclusive_cpus)) { /* * Use trialcs->effective_cpus as a temp cpumask */ remote_partition_check(cs, trialcs->effective_xcpus, trialcs->effective_cpus, &tmp); } spin_lock_irq(&callback_lock); cpumask_copy(cs->exclusive_cpus, trialcs->exclusive_cpus); cpumask_copy(cs->effective_xcpus, trialcs->effective_xcpus); if ((old_prs > 0) && !is_partition_valid(cs)) reset_partition_data(cs); spin_unlock_irq(&callback_lock); /* * Call update_cpumasks_hier() to update effective_cpus/effective_xcpus * of the subtree when it is a valid partition root or effective_xcpus * is updated. */ if (is_partition_valid(cs) || hier_flags) update_cpumasks_hier(cs, &tmp, hier_flags); /* Update CS_SCHED_LOAD_BALANCE and/or sched_domains, if necessary */ if (cs->partition_root_state) update_partition_sd_lb(cs, old_prs); free_cpumasks(NULL, &tmp); return 0; } /* * Migrate memory region from one set of nodes to another. This is * performed asynchronously as it can be called from process migration path * holding locks involved in process management. All mm migrations are * performed in the queued order and can be waited for by flushing * cpuset_migrate_mm_wq. */ struct cpuset_migrate_mm_work { struct work_struct work; struct mm_struct *mm; nodemask_t from; nodemask_t to; }; static void cpuset_migrate_mm_workfn(struct work_struct *work) { struct cpuset_migrate_mm_work *mwork = container_of(work, struct cpuset_migrate_mm_work, work); /* on a wq worker, no need to worry about %current's mems_allowed */ do_migrate_pages(mwork->mm, &mwork->from, &mwork->to, MPOL_MF_MOVE_ALL); mmput(mwork->mm); kfree(mwork); } static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from, const nodemask_t *to) { struct cpuset_migrate_mm_work *mwork; if (nodes_equal(*from, *to)) { mmput(mm); return; } mwork = kzalloc(sizeof(*mwork), GFP_KERNEL); if (mwork) { mwork->mm = mm; mwork->from = *from; mwork->to = *to; INIT_WORK(&mwork->work, cpuset_migrate_mm_workfn); queue_work(cpuset_migrate_mm_wq, &mwork->work); } else { mmput(mm); } } static void cpuset_post_attach(void) { flush_workqueue(cpuset_migrate_mm_wq); } /* * cpuset_change_task_nodemask - change task's mems_allowed and mempolicy * @tsk: the task to change * @newmems: new nodes that the task will be set * * We use the mems_allowed_seq seqlock to safely update both tsk->mems_allowed * and rebind an eventual tasks' mempolicy. If the task is allocating in * parallel, it might temporarily see an empty intersection, which results in * a seqlock check and retry before OOM or allocation failure. */ static void cpuset_change_task_nodemask(struct task_struct *tsk, nodemask_t *newmems) { task_lock(tsk); local_irq_disable(); write_seqcount_begin(&tsk->mems_allowed_seq); nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems); mpol_rebind_task(tsk, newmems); tsk->mems_allowed = *newmems; write_seqcount_end(&tsk->mems_allowed_seq); local_irq_enable(); task_unlock(tsk); } static void *cpuset_being_rebound; /** * update_tasks_nodemask - Update the nodemasks of tasks in the cpuset. * @cs: the cpuset in which each task's mems_allowed mask needs to be changed * * Iterate through each task of @cs updating its mems_allowed to the * effective cpuset's. As this function is called with cpuset_mutex held, * cpuset membership stays stable. */ static void update_tasks_nodemask(struct cpuset *cs) { static nodemask_t newmems; /* protected by cpuset_mutex */ struct css_task_iter it; struct task_struct *task; cpuset_being_rebound = cs; /* causes mpol_dup() rebind */ guarantee_online_mems(cs, &newmems); /* * The mpol_rebind_mm() call takes mmap_lock, which we couldn't * take while holding tasklist_lock. Forks can happen - the * mpol_dup() cpuset_being_rebound check will catch such forks, * and rebind their vma mempolicies too. Because we still hold * the global cpuset_mutex, we know that no other rebind effort * will be contending for the global variable cpuset_being_rebound. * It's ok if we rebind the same mm twice; mpol_rebind_mm() * is idempotent. Also migrate pages in each mm to new nodes. */ css_task_iter_start(&cs->css, 0, &it); while ((task = css_task_iter_next(&it))) { struct mm_struct *mm; bool migrate; cpuset_change_task_nodemask(task, &newmems); mm = get_task_mm(task); if (!mm) continue; migrate = is_memory_migrate(cs); mpol_rebind_mm(mm, &cs->mems_allowed); if (migrate) cpuset_migrate_mm(mm, &cs->old_mems_allowed, &newmems); else mmput(mm); } css_task_iter_end(&it); /* * All the tasks' nodemasks have been updated, update * cs->old_mems_allowed. */ cs->old_mems_allowed = newmems; /* We're done rebinding vmas to this cpuset's new mems_allowed. */ cpuset_being_rebound = NULL; } /* * update_nodemasks_hier - Update effective nodemasks and tasks in the subtree * @cs: the cpuset to consider * @new_mems: a temp variable for calculating new effective_mems * * When configured nodemask is changed, the effective nodemasks of this cpuset * and all its descendants need to be updated. * * On legacy hierarchy, effective_mems will be the same with mems_allowed. * * Called with cpuset_mutex held */ static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems) { struct cpuset *cp; struct cgroup_subsys_state *pos_css; rcu_read_lock(); cpuset_for_each_descendant_pre(cp, pos_css, cs) { struct cpuset *parent = parent_cs(cp); nodes_and(*new_mems, cp->mems_allowed, parent->effective_mems); /* * If it becomes empty, inherit the effective mask of the * parent, which is guaranteed to have some MEMs. */ if (is_in_v2_mode() && nodes_empty(*new_mems)) *new_mems = parent->effective_mems; /* Skip the whole subtree if the nodemask remains the same. */ if (nodes_equal(*new_mems, cp->effective_mems)) { pos_css = css_rightmost_descendant(pos_css); continue; } if (!css_tryget_online(&cp->css)) continue; rcu_read_unlock(); spin_lock_irq(&callback_lock); cp->effective_mems = *new_mems; spin_unlock_irq(&callback_lock); WARN_ON(!is_in_v2_mode() && !nodes_equal(cp->mems_allowed, cp->effective_mems)); update_tasks_nodemask(cp); rcu_read_lock(); css_put(&cp->css); } rcu_read_unlock(); } /* * Handle user request to change the 'mems' memory placement * of a cpuset. Needs to validate the request, update the * cpusets mems_allowed, and for each task in the cpuset, * update mems_allowed and rebind task's mempolicy and any vma * mempolicies and if the cpuset is marked 'memory_migrate', * migrate the tasks pages to the new memory. * * Call with cpuset_mutex held. May take callback_lock during call. * Will take tasklist_lock, scan tasklist for tasks in cpuset cs, * lock each such tasks mm->mmap_lock, scan its vma's and rebind * their mempolicies to the cpusets new mems_allowed. */ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs, const char *buf) { int retval; /* * top_cpuset.mems_allowed tracks node_stats[N_MEMORY]; * it's read-only */ if (cs == &top_cpuset) { retval = -EACCES; goto done; } /* * An empty mems_allowed is ok iff there are no tasks in the cpuset. * Since nodelist_parse() fails on an empty mask, we special case * that parsing. The validate_change() call ensures that cpusets * with tasks have memory. */ if (!*buf) { nodes_clear(trialcs->mems_allowed); } else { retval = nodelist_parse(buf, trialcs->mems_allowed); if (retval < 0) goto done; if (!nodes_subset(trialcs->mems_allowed, top_cpuset.mems_allowed)) { retval = -EINVAL; goto done; } } if (nodes_equal(cs->mems_allowed, trialcs->mems_allowed)) { retval = 0; /* Too easy - nothing to do */ goto done; } retval = validate_change(cs, trialcs); if (retval < 0) goto done; check_insane_mems_config(&trialcs->mems_allowed); spin_lock_irq(&callback_lock); cs->mems_allowed = trialcs->mems_allowed; spin_unlock_irq(&callback_lock); /* use trialcs->mems_allowed as a temp variable */ update_nodemasks_hier(cs, &trialcs->mems_allowed); done: return retval; } bool current_cpuset_is_being_rebound(void) { bool ret; rcu_read_lock(); ret = task_cs(current) == cpuset_being_rebound; rcu_read_unlock(); return ret; } static int update_relax_domain_level(struct cpuset *cs, s64 val) { #ifdef CONFIG_SMP if (val < -1 || val >= sched_domain_level_max) return -EINVAL; #endif if (val != cs->relax_domain_level) { cs->relax_domain_level = val; if (!cpumask_empty(cs->cpus_allowed) && is_sched_load_balance(cs)) rebuild_sched_domains_locked(); } return 0; } /** * update_tasks_flags - update the spread flags of tasks in the cpuset. * @cs: the cpuset in which each task's spread flags needs to be changed * * Iterate through each task of @cs updating its spread flags. As this * function is called with cpuset_mutex held, cpuset membership stays * stable. */ static void update_tasks_flags(struct cpuset *cs) { struct css_task_iter it; struct task_struct *task; css_task_iter_start(&cs->css, 0, &it); while ((task = css_task_iter_next(&it))) cpuset_update_task_spread_flags(cs, task); css_task_iter_end(&it); } /* * update_flag - read a 0 or a 1 in a file and update associated flag * bit: the bit to update (see cpuset_flagbits_t) * cs: the cpuset to update * turning_on: whether the flag is being set or cleared * * Call with cpuset_mutex held. */ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, int turning_on) { struct cpuset *trialcs; int balance_flag_changed; int spread_flag_changed; int err; trialcs = alloc_trial_cpuset(cs); if (!trialcs) return -ENOMEM; if (turning_on) set_bit(bit, &trialcs->flags); else clear_bit(bit, &trialcs->flags); err = validate_change(cs, trialcs); if (err < 0) goto out; balance_flag_changed = (is_sched_load_balance(cs) != is_sched_load_balance(trialcs)); spread_flag_changed = ((is_spread_slab(cs) != is_spread_slab(trialcs)) || (is_spread_page(cs) != is_spread_page(trialcs))); spin_lock_irq(&callback_lock); cs->flags = trialcs->flags; spin_unlock_irq(&callback_lock); if (!cpumask_empty(trialcs->cpus_allowed) && balance_flag_changed) rebuild_sched_domains_locked(); if (spread_flag_changed) update_tasks_flags(cs); out: free_cpuset(trialcs); return err; } /** * update_prstate - update partition_root_state * @cs: the cpuset to update * @new_prs: new partition root state * Return: 0 if successful, != 0 if error * * Call with cpuset_mutex held. */ static int update_prstate(struct cpuset *cs, int new_prs) { int err = PERR_NONE, old_prs = cs->partition_root_state; struct cpuset *parent = parent_cs(cs); struct tmpmasks tmpmask; bool new_xcpus_state = false; if (old_prs == new_prs) return 0; /* * Treat a previously invalid partition root as if it is a "member". */ if (new_prs && is_prs_invalid(old_prs)) old_prs = PRS_MEMBER; if (alloc_cpumasks(NULL, &tmpmask)) return -ENOMEM; /* * Setup effective_xcpus if not properly set yet, it will be cleared * later if partition becomes invalid. */ if ((new_prs > 0) && cpumask_empty(cs->exclusive_cpus)) { spin_lock_irq(&callback_lock); cpumask_and(cs->effective_xcpus, cs->cpus_allowed, parent->effective_xcpus); spin_unlock_irq(&callback_lock); } err = update_partition_exclusive(cs, new_prs); if (err) goto out; if (!old_prs) { enum partition_cmd cmd = (new_prs == PRS_ROOT) ? partcmd_enable : partcmd_enablei; /* * cpus_allowed cannot be empty. */ if (cpumask_empty(cs->cpus_allowed)) { err = PERR_CPUSEMPTY; goto out; } err = update_parent_effective_cpumask(cs, cmd, NULL, &tmpmask); /* * If an attempt to become local partition root fails, * try to become a remote partition root instead. */ if (err && remote_partition_enable(cs, new_prs, &tmpmask)) err = 0; } else if (old_prs && new_prs) { /* * A change in load balance state only, no change in cpumasks. */ new_xcpus_state = true; } else { /* * Switching back to member is always allowed even if it * disables child partitions. */ if (is_remote_partition(cs)) remote_partition_disable(cs, &tmpmask); else update_parent_effective_cpumask(cs, partcmd_disable, NULL, &tmpmask); /* * Invalidation of child partitions will be done in * update_cpumasks_hier(). */ } out: /* * Make partition invalid & disable CS_CPU_EXCLUSIVE if an error * happens. */ if (err) { new_prs = -new_prs; update_partition_exclusive(cs, new_prs); } spin_lock_irq(&callback_lock); cs->partition_root_state = new_prs; WRITE_ONCE(cs->prs_err, err); if (!is_partition_valid(cs)) reset_partition_data(cs); else if (new_xcpus_state) partition_xcpus_newstate(old_prs, new_prs, cs->effective_xcpus); spin_unlock_irq(&callback_lock); update_unbound_workqueue_cpumask(new_xcpus_state); /* Force update if switching back to member */ update_cpumasks_hier(cs, &tmpmask, !new_prs ? HIER_CHECKALL : 0); /* Update sched domains and load balance flag */ update_partition_sd_lb(cs, old_prs); notify_partition_change(cs, old_prs); free_cpumasks(NULL, &tmpmask); return 0; } /* * Frequency meter - How fast is some event occurring? * * These routines manage a digitally filtered, constant time based, * event frequency meter. There are four routines: * fmeter_init() - initialize a frequency meter. * fmeter_markevent() - called each time the event happens. * fmeter_getrate() - returns the recent rate of such events. * fmeter_update() - internal routine used to update fmeter. * * A common data structure is passed to each of these routines, * which is used to keep track of the state required to manage the * frequency meter and its digital filter. * * The filter works on the number of events marked per unit time. * The filter is single-pole low-pass recursive (IIR). The time unit * is 1 second. Arithmetic is done using 32-bit integers scaled to * simulate 3 decimal digits of precision (multiplied by 1000). * * With an FM_COEF of 933, and a time base of 1 second, the filter * has a half-life of 10 seconds, meaning that if the events quit * happening, then the rate returned from the fmeter_getrate() * will be cut in half each 10 seconds, until it converges to zero. * * It is not worth doing a real infinitely recursive filter. If more * than FM_MAXTICKS ticks have elapsed since the last filter event, * just compute FM_MAXTICKS ticks worth, by which point the level * will be stable. * * Limit the count of unprocessed events to FM_MAXCNT, so as to avoid * arithmetic overflow in the fmeter_update() routine. * * Given the simple 32 bit integer arithmetic used, this meter works * best for reporting rates between one per millisecond (msec) and * one per 32 (approx) seconds. At constant rates faster than one * per msec it maxes out at values just under 1,000,000. At constant * rates between one per msec, and one per second it will stabilize * to a value N*1000, where N is the rate of events per second. * At constant rates between one per second and one per 32 seconds, * it will be choppy, moving up on the seconds that have an event, * and then decaying until the next event. At rates slower than * about one in 32 seconds, it decays all the way back to zero between * each event. */ #define FM_COEF 933 /* coefficient for half-life of 10 secs */ #define FM_MAXTICKS ((u32)99) /* useless computing more ticks than this */ #define FM_MAXCNT 1000000 /* limit cnt to avoid overflow */ #define FM_SCALE 1000 /* faux fixed point scale */ /* Initialize a frequency meter */ static void fmeter_init(struct fmeter *fmp) { fmp->cnt = 0; fmp->val = 0; fmp->time = 0; spin_lock_init(&fmp->lock); } /* Internal meter update - process cnt events and update value */ static void fmeter_update(struct fmeter *fmp) { time64_t now; u32 ticks; now = ktime_get_seconds(); ticks = now - fmp->time; if (ticks == 0) return; ticks = min(FM_MAXTICKS, ticks); while (ticks-- > 0) fmp->val = (FM_COEF * fmp->val) / FM_SCALE; fmp->time = now; fmp->val += ((FM_SCALE - FM_COEF) * fmp->cnt) / FM_SCALE; fmp->cnt = 0; } /* Process any previous ticks, then bump cnt by one (times scale). */ static void fmeter_markevent(struct fmeter *fmp) { spin_lock(&fmp->lock); fmeter_update(fmp); fmp->cnt = min(FM_MAXCNT, fmp->cnt + FM_SCALE); spin_unlock(&fmp->lock); } /* Process any previous ticks, then return current value. */ static int fmeter_getrate(struct fmeter *fmp) { int val; spin_lock(&fmp->lock); fmeter_update(fmp); val = fmp->val; spin_unlock(&fmp->lock); return val; } static struct cpuset *cpuset_attach_old_cs; /* * Check to see if a cpuset can accept a new task * For v1, cpus_allowed and mems_allowed can't be empty. * For v2, effective_cpus can't be empty. * Note that in v1, effective_cpus = cpus_allowed. */ static int cpuset_can_attach_check(struct cpuset *cs) { if (cpumask_empty(cs->effective_cpus) || (!is_in_v2_mode() && nodes_empty(cs->mems_allowed))) return -ENOSPC; return 0; } static void reset_migrate_dl_data(struct cpuset *cs) { cs->nr_migrate_dl_tasks = 0; cs->sum_migrate_dl_bw = 0; } /* Called by cgroups to determine if a cpuset is usable; cpuset_mutex held */ static int cpuset_can_attach(struct cgroup_taskset *tset) { struct cgroup_subsys_state *css; struct cpuset *cs, *oldcs; struct task_struct *task; bool cpus_updated, mems_updated; int ret; /* used later by cpuset_attach() */ cpuset_attach_old_cs = task_cs(cgroup_taskset_first(tset, &css)); oldcs = cpuset_attach_old_cs; cs = css_cs(css); mutex_lock(&cpuset_mutex); /* Check to see if task is allowed in the cpuset */ ret = cpuset_can_attach_check(cs); if (ret) goto out_unlock; cpus_updated = !cpumask_equal(cs->effective_cpus, oldcs->effective_cpus); mems_updated = !nodes_equal(cs->effective_mems, oldcs->effective_mems); cgroup_taskset_for_each(task, css, tset) { ret = task_can_attach(task); if (ret) goto out_unlock; /* * Skip rights over task check in v2 when nothing changes, * migration permission derives from hierarchy ownership in * cgroup_procs_write_permission()). */ if (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) || (cpus_updated || mems_updated)) { ret = security_task_setscheduler(task); if (ret) goto out_unlock; } if (dl_task(task)) { cs->nr_migrate_dl_tasks++; cs->sum_migrate_dl_bw += task->dl.dl_bw; } } if (!cs->nr_migrate_dl_tasks) goto out_success; if (!cpumask_intersects(oldcs->effective_cpus, cs->effective_cpus)) { int cpu = cpumask_any_and(cpu_active_mask, cs->effective_cpus); if (unlikely(cpu >= nr_cpu_ids)) { reset_migrate_dl_data(cs); ret = -EINVAL; goto out_unlock; } ret = dl_bw_alloc(cpu, cs->sum_migrate_dl_bw); if (ret) { reset_migrate_dl_data(cs); goto out_unlock; } } out_success: /* * Mark attach is in progress. This makes validate_change() fail * changes which zero cpus/mems_allowed. */ cs->attach_in_progress++; out_unlock: mutex_unlock(&cpuset_mutex); return ret; } static void cpuset_cancel_attach(struct cgroup_taskset *tset) { struct cgroup_subsys_state *css; struct cpuset *cs; cgroup_taskset_first(tset, &css); cs = css_cs(css); mutex_lock(&cpuset_mutex); cs->attach_in_progress--; if (!cs->attach_in_progress) wake_up(&cpuset_attach_wq); if (cs->nr_migrate_dl_tasks) { int cpu = cpumask_any(cs->effective_cpus); dl_bw_free(cpu, cs->sum_migrate_dl_bw); reset_migrate_dl_data(cs); } mutex_unlock(&cpuset_mutex); } /* * Protected by cpuset_mutex. cpus_attach is used only by cpuset_attach_task() * but we can't allocate it dynamically there. Define it global and * allocate from cpuset_init(). */ static cpumask_var_t cpus_attach; static nodemask_t cpuset_attach_nodemask_to; static void cpuset_attach_task(struct cpuset *cs, struct task_struct *task) { lockdep_assert_held(&cpuset_mutex); if (cs != &top_cpuset) guarantee_online_cpus(task, cpus_attach); else cpumask_andnot(cpus_attach, task_cpu_possible_mask(task), subpartitions_cpus); /* * can_attach beforehand should guarantee that this doesn't * fail. TODO: have a better way to handle failure here */ WARN_ON_ONCE(set_cpus_allowed_ptr(task, cpus_attach)); cpuset_change_task_nodemask(task, &cpuset_attach_nodemask_to); cpuset_update_task_spread_flags(cs, task); } static void cpuset_attach(struct cgroup_taskset *tset) { struct task_struct *task; struct task_struct *leader; struct cgroup_subsys_state *css; struct cpuset *cs; struct cpuset *oldcs = cpuset_attach_old_cs; bool cpus_updated, mems_updated; cgroup_taskset_first(tset, &css); cs = css_cs(css); lockdep_assert_cpus_held(); /* see cgroup_attach_lock() */ mutex_lock(&cpuset_mutex); cpus_updated = !cpumask_equal(cs->effective_cpus, oldcs->effective_cpus); mems_updated = !nodes_equal(cs->effective_mems, oldcs->effective_mems); /* * In the default hierarchy, enabling cpuset in the child cgroups * will trigger a number of cpuset_attach() calls with no change * in effective cpus and mems. In that case, we can optimize out * by skipping the task iteration and update. */ if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys) && !cpus_updated && !mems_updated) { cpuset_attach_nodemask_to = cs->effective_mems; goto out; } guarantee_online_mems(cs, &cpuset_attach_nodemask_to); cgroup_taskset_for_each(task, css, tset) cpuset_attach_task(cs, task); /* * Change mm for all threadgroup leaders. This is expensive and may * sleep and should be moved outside migration path proper. Skip it * if there is no change in effective_mems and CS_MEMORY_MIGRATE is * not set. */ cpuset_attach_nodemask_to = cs->effective_mems; if (!is_memory_migrate(cs) && !mems_updated) goto out; cgroup_taskset_for_each_leader(leader, css, tset) { struct mm_struct *mm = get_task_mm(leader); if (mm) { mpol_rebind_mm(mm, &cpuset_attach_nodemask_to); /* * old_mems_allowed is the same with mems_allowed * here, except if this task is being moved * automatically due to hotplug. In that case * @mems_allowed has been updated and is empty, so * @old_mems_allowed is the right nodesets that we * migrate mm from. */ if (is_memory_migrate(cs)) cpuset_migrate_mm(mm, &oldcs->old_mems_allowed, &cpuset_attach_nodemask_to); else mmput(mm); } } out: cs->old_mems_allowed = cpuset_attach_nodemask_to; if (cs->nr_migrate_dl_tasks) { cs->nr_deadline_tasks += cs->nr_migrate_dl_tasks; oldcs->nr_deadline_tasks -= cs->nr_migrate_dl_tasks; reset_migrate_dl_data(cs); } cs->attach_in_progress--; if (!cs->attach_in_progress) wake_up(&cpuset_attach_wq); mutex_unlock(&cpuset_mutex); } /* The various types of files and directories in a cpuset file system */ typedef enum { FILE_MEMORY_MIGRATE, FILE_CPULIST, FILE_MEMLIST, FILE_EFFECTIVE_CPULIST, FILE_EFFECTIVE_MEMLIST, FILE_SUBPARTS_CPULIST, FILE_EXCLUSIVE_CPULIST, FILE_EFFECTIVE_XCPULIST, FILE_ISOLATED_CPULIST, FILE_CPU_EXCLUSIVE, FILE_MEM_EXCLUSIVE, FILE_MEM_HARDWALL, FILE_SCHED_LOAD_BALANCE, FILE_PARTITION_ROOT, FILE_SCHED_RELAX_DOMAIN_LEVEL, FILE_MEMORY_PRESSURE_ENABLED, FILE_MEMORY_PRESSURE, FILE_SPREAD_PAGE, FILE_SPREAD_SLAB, } cpuset_filetype_t; static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft, u64 val) { struct cpuset *cs = css_cs(css); cpuset_filetype_t type = cft->private; int retval = 0; cpus_read_lock(); mutex_lock(&cpuset_mutex); if (!is_cpuset_online(cs)) { retval = -ENODEV; goto out_unlock; } switch (type) { case FILE_CPU_EXCLUSIVE: retval = update_flag(CS_CPU_EXCLUSIVE, cs, val); break; case FILE_MEM_EXCLUSIVE: retval = update_flag(CS_MEM_EXCLUSIVE, cs, val); break; case FILE_MEM_HARDWALL: retval = update_flag(CS_MEM_HARDWALL, cs, val); break; case FILE_SCHED_LOAD_BALANCE: retval = update_flag(CS_SCHED_LOAD_BALANCE, cs, val); break; case FILE_MEMORY_MIGRATE: retval = update_flag(CS_MEMORY_MIGRATE, cs, val); break; case FILE_MEMORY_PRESSURE_ENABLED: cpuset_memory_pressure_enabled = !!val; break; case FILE_SPREAD_PAGE: retval = update_flag(CS_SPREAD_PAGE, cs, val); break; case FILE_SPREAD_SLAB: retval = update_flag(CS_SPREAD_SLAB, cs, val); break; default: retval = -EINVAL; break; } out_unlock: mutex_unlock(&cpuset_mutex); cpus_read_unlock(); return retval; } static int cpuset_write_s64(struct cgroup_subsys_state *css, struct cftype *cft, s64 val) { struct cpuset *cs = css_cs(css); cpuset_filetype_t type = cft->private; int retval = -ENODEV; cpus_read_lock(); mutex_lock(&cpuset_mutex); if (!is_cpuset_online(cs)) goto out_unlock; switch (type) { case FILE_SCHED_RELAX_DOMAIN_LEVEL: retval = update_relax_domain_level(cs, val); break; default: retval = -EINVAL; break; } out_unlock: mutex_unlock(&cpuset_mutex); cpus_read_unlock(); return retval; } /* * Common handling for a write to a "cpus" or "mems" file. */ static ssize_t cpuset_write_resmask(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { struct cpuset *cs = css_cs(of_css(of)); struct cpuset *trialcs; int retval = -ENODEV; buf = strstrip(buf); /* * CPU or memory hotunplug may leave @cs w/o any execution * resources, in which case the hotplug code asynchronously updates * configuration and transfers all tasks to the nearest ancestor * which can execute. * * As writes to "cpus" or "mems" may restore @cs's execution * resources, wait for the previously scheduled operations before * proceeding, so that we don't end up keep removing tasks added * after execution capability is restored. * * cpuset_hotplug_work calls back into cgroup core via * cgroup_transfer_tasks() and waiting for it from a cgroupfs * operation like this one can lead to a deadlock through kernfs * active_ref protection. Let's break the protection. Losing the * protection is okay as we check whether @cs is online after * grabbing cpuset_mutex anyway. This only happens on the legacy * hierarchies. */ css_get(&cs->css); kernfs_break_active_protection(of->kn); flush_work(&cpuset_hotplug_work); cpus_read_lock(); mutex_lock(&cpuset_mutex); if (!is_cpuset_online(cs)) goto out_unlock; trialcs = alloc_trial_cpuset(cs); if (!trialcs) { retval = -ENOMEM; goto out_unlock; } switch (of_cft(of)->private) { case FILE_CPULIST: retval = update_cpumask(cs, trialcs, buf); break; case FILE_EXCLUSIVE_CPULIST: retval = update_exclusive_cpumask(cs, trialcs, buf); break; case FILE_MEMLIST: retval = update_nodemask(cs, trialcs, buf); break; default: retval = -EINVAL; break; } free_cpuset(trialcs); out_unlock: mutex_unlock(&cpuset_mutex); cpus_read_unlock(); kernfs_unbreak_active_protection(of->kn); css_put(&cs->css); flush_workqueue(cpuset_migrate_mm_wq); return retval ?: nbytes; } /* * These ascii lists should be read in a single call, by using a user * buffer large enough to hold the entire map. If read in smaller * chunks, there is no guarantee of atomicity. Since the display format * used, list of ranges of sequential numbers, is variable length, * and since these maps can change value dynamically, one could read * gibberish by doing partial reads while a list was changing. */ static int cpuset_common_seq_show(struct seq_file *sf, void *v) { struct cpuset *cs = css_cs(seq_css(sf)); cpuset_filetype_t type = seq_cft(sf)->private; int ret = 0; spin_lock_irq(&callback_lock); switch (type) { case FILE_CPULIST: seq_printf(sf, "%*pbl\n", cpumask_pr_args(cs->cpus_allowed)); break; case FILE_MEMLIST: seq_printf(sf, "%*pbl\n", nodemask_pr_args(&cs->mems_allowed)); break; case FILE_EFFECTIVE_CPULIST: seq_printf(sf, "%*pbl\n", cpumask_pr_args(cs->effective_cpus)); break; case FILE_EFFECTIVE_MEMLIST: seq_printf(sf, "%*pbl\n", nodemask_pr_args(&cs->effective_mems)); break; case FILE_EXCLUSIVE_CPULIST: seq_printf(sf, "%*pbl\n", cpumask_pr_args(cs->exclusive_cpus)); break; case FILE_EFFECTIVE_XCPULIST: seq_printf(sf, "%*pbl\n", cpumask_pr_args(cs->effective_xcpus)); break; case FILE_SUBPARTS_CPULIST: seq_printf(sf, "%*pbl\n", cpumask_pr_args(subpartitions_cpus)); break; case FILE_ISOLATED_CPULIST: seq_printf(sf, "%*pbl\n", cpumask_pr_args(isolated_cpus)); break; default: ret = -EINVAL; } spin_unlock_irq(&callback_lock); return ret; } static u64 cpuset_read_u64(struct cgroup_subsys_state *css, struct cftype *cft) { struct cpuset *cs = css_cs(css); cpuset_filetype_t type = cft->private; switch (type) { case FILE_CPU_EXCLUSIVE: return is_cpu_exclusive(cs); case FILE_MEM_EXCLUSIVE: return is_mem_exclusive(cs); case FILE_MEM_HARDWALL: return is_mem_hardwall(cs); case FILE_SCHED_LOAD_BALANCE: return is_sched_load_balance(cs); case FILE_MEMORY_MIGRATE: return is_memory_migrate(cs); case FILE_MEMORY_PRESSURE_ENABLED: return cpuset_memory_pressure_enabled; case FILE_MEMORY_PRESSURE: return fmeter_getrate(&cs->fmeter); case FILE_SPREAD_PAGE: return is_spread_page(cs); case FILE_SPREAD_SLAB: return is_spread_slab(cs); default: BUG(); } /* Unreachable but makes gcc happy */ return 0; } static s64 cpuset_read_s64(struct cgroup_subsys_state *css, struct cftype *cft) { struct cpuset *cs = css_cs(css); cpuset_filetype_t type = cft->private; switch (type) { case FILE_SCHED_RELAX_DOMAIN_LEVEL: return cs->relax_domain_level; default: BUG(); } /* Unreachable but makes gcc happy */ return 0; } static int sched_partition_show(struct seq_file *seq, void *v) { struct cpuset *cs = css_cs(seq_css(seq)); const char *err, *type = NULL; switch (cs->partition_root_state) { case PRS_ROOT: seq_puts(seq, "root\n"); break; case PRS_ISOLATED: seq_puts(seq, "isolated\n"); break; case PRS_MEMBER: seq_puts(seq, "member\n"); break; case PRS_INVALID_ROOT: type = "root"; fallthrough; case PRS_INVALID_ISOLATED: if (!type) type = "isolated"; err = perr_strings[READ_ONCE(cs->prs_err)]; if (err) seq_printf(seq, "%s invalid (%s)\n", type, err); else seq_printf(seq, "%s invalid\n", type); break; } return 0; } static ssize_t sched_partition_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { struct cpuset *cs = css_cs(of_css(of)); int val; int retval = -ENODEV; buf = strstrip(buf); /* * Convert "root" to ENABLED, and convert "member" to DISABLED. */ if (!strcmp(buf, "root")) val = PRS_ROOT; else if (!strcmp(buf, "member")) val = PRS_MEMBER; else if (!strcmp(buf, "isolated")) val = PRS_ISOLATED; else return -EINVAL; css_get(&cs->css); cpus_read_lock(); mutex_lock(&cpuset_mutex); if (!is_cpuset_online(cs)) goto out_unlock; retval = update_prstate(cs, val); out_unlock: mutex_unlock(&cpuset_mutex); cpus_read_unlock(); css_put(&cs->css); return retval ?: nbytes; } /* * for the common functions, 'private' gives the type of file */ static struct cftype legacy_files[] = { { .name = "cpus", .seq_show = cpuset_common_seq_show, .write = cpuset_write_resmask, .max_write_len = (100U + 6 * NR_CPUS), .private = FILE_CPULIST, }, { .name = "mems", .seq_show = cpuset_common_seq_show, .write = cpuset_write_resmask, .max_write_len = (100U + 6 * MAX_NUMNODES), .private = FILE_MEMLIST, }, { .name = "effective_cpus", .seq_show = cpuset_common_seq_show, .private = FILE_EFFECTIVE_CPULIST, }, { .name = "effective_mems", .seq_show = cpuset_common_seq_show, .private = FILE_EFFECTIVE_MEMLIST, }, { .name = "cpu_exclusive", .read_u64 = cpuset_read_u64, .write_u64 = cpuset_write_u64, .private = FILE_CPU_EXCLUSIVE, }, { .name = "mem_exclusive", .read_u64 = cpuset_read_u64, .write_u64 = cpuset_write_u64, .private = FILE_MEM_EXCLUSIVE, }, { .name = "mem_hardwall", .read_u64 = cpuset_read_u64, .write_u64 = cpuset_write_u64, .private = FILE_MEM_HARDWALL, }, { .name = "sched_load_balance", .read_u64 = cpuset_read_u64, .write_u64 = cpuset_write_u64, .private = FILE_SCHED_LOAD_BALANCE, }, { .name = "sched_relax_domain_level", .read_s64 = cpuset_read_s64, .write_s64 = cpuset_write_s64, .private = FILE_SCHED_RELAX_DOMAIN_LEVEL, }, { .name = "memory_migrate", .read_u64 = cpuset_read_u64, .write_u64 = cpuset_write_u64, .private = FILE_MEMORY_MIGRATE, }, { .name = "memory_pressure", .read_u64 = cpuset_read_u64, .private = FILE_MEMORY_PRESSURE, }, { .name = "memory_spread_page", .read_u64 = cpuset_read_u64, .write_u64 = cpuset_write_u64, .private = FILE_SPREAD_PAGE, }, { .name = "memory_spread_slab", .read_u64 = cpuset_read_u64, .write_u64 = cpuset_write_u64, .private = FILE_SPREAD_SLAB, }, { .name = "memory_pressure_enabled", .flags = CFTYPE_ONLY_ON_ROOT, .read_u64 = cpuset_read_u64, .write_u64 = cpuset_write_u64, .private = FILE_MEMORY_PRESSURE_ENABLED, }, { } /* terminate */ }; /* * This is currently a minimal set for the default hierarchy. It can be * expanded later on by migrating more features and control files from v1. */ static struct cftype dfl_files[] = { { .name = "cpus", .seq_show = cpuset_common_seq_show, .write = cpuset_write_resmask, .max_write_len = (100U + 6 * NR_CPUS), .private = FILE_CPULIST, .flags = CFTYPE_NOT_ON_ROOT, }, { .name = "mems", .seq_show = cpuset_common_seq_show, .write = cpuset_write_resmask, .max_write_len = (100U + 6 * MAX_NUMNODES), .private = FILE_MEMLIST, .flags = CFTYPE_NOT_ON_ROOT, }, { .name = "cpus.effective", .seq_show = cpuset_common_seq_show, .private = FILE_EFFECTIVE_CPULIST, }, { .name = "mems.effective", .seq_show = cpuset_common_seq_show, .private = FILE_EFFECTIVE_MEMLIST, }, { .name = "cpus.partition", .seq_show = sched_partition_show, .write = sched_partition_write, .private = FILE_PARTITION_ROOT, .flags = CFTYPE_NOT_ON_ROOT, .file_offset = offsetof(struct cpuset, partition_file), }, { .name = "cpus.exclusive", .seq_show = cpuset_common_seq_show, .write = cpuset_write_resmask, .max_write_len = (100U + 6 * NR_CPUS), .private = FILE_EXCLUSIVE_CPULIST, .flags = CFTYPE_NOT_ON_ROOT, }, { .name = "cpus.exclusive.effective", .seq_show = cpuset_common_seq_show, .private = FILE_EFFECTIVE_XCPULIST, .flags = CFTYPE_NOT_ON_ROOT, }, { .name = "cpus.subpartitions", .seq_show = cpuset_common_seq_show, .private = FILE_SUBPARTS_CPULIST, .flags = CFTYPE_ONLY_ON_ROOT | CFTYPE_DEBUG, }, { .name = "cpus.isolated", .seq_show = cpuset_common_seq_show, .private = FILE_ISOLATED_CPULIST, .flags = CFTYPE_ONLY_ON_ROOT, }, { } /* terminate */ }; /** * cpuset_css_alloc - Allocate a cpuset css * @parent_css: Parent css of the control group that the new cpuset will be * part of * Return: cpuset css on success, -ENOMEM on failure. * * Allocate and initialize a new cpuset css, for non-NULL @parent_css, return * top cpuset css otherwise. */ static struct cgroup_subsys_state * cpuset_css_alloc(struct cgroup_subsys_state *parent_css) { struct cpuset *cs; if (!parent_css) return &top_cpuset.css; cs = kzalloc(sizeof(*cs), GFP_KERNEL); if (!cs) return ERR_PTR(-ENOMEM); if (alloc_cpumasks(cs, NULL)) { kfree(cs); return ERR_PTR(-ENOMEM); } __set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags); nodes_clear(cs->mems_allowed); nodes_clear(cs->effective_mems); fmeter_init(&cs->fmeter); cs->relax_domain_level = -1; INIT_LIST_HEAD(&cs->remote_sibling); /* Set CS_MEMORY_MIGRATE for default hierarchy */ if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys)) __set_bit(CS_MEMORY_MIGRATE, &cs->flags); return &cs->css; } static int cpuset_css_online(struct cgroup_subsys_state *css) { struct cpuset *cs = css_cs(css); struct cpuset *parent = parent_cs(cs); struct cpuset *tmp_cs; struct cgroup_subsys_state *pos_css; if (!parent) return 0; cpus_read_lock(); mutex_lock(&cpuset_mutex); set_bit(CS_ONLINE, &cs->flags); if (is_spread_page(parent)) set_bit(CS_SPREAD_PAGE, &cs->flags); if (is_spread_slab(parent)) set_bit(CS_SPREAD_SLAB, &cs->flags); cpuset_inc(); spin_lock_irq(&callback_lock); if (is_in_v2_mode()) { cpumask_copy(cs->effective_cpus, parent->effective_cpus); cs->effective_mems = parent->effective_mems; cs->use_parent_ecpus = true; parent->child_ecpus_count++; /* * Clear CS_SCHED_LOAD_BALANCE if parent is isolated */ if (!is_sched_load_balance(parent)) clear_bit(CS_SCHED_LOAD_BALANCE, &cs->flags); } /* * For v2, clear CS_SCHED_LOAD_BALANCE if parent is isolated */ if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys) && !is_sched_load_balance(parent)) clear_bit(CS_SCHED_LOAD_BALANCE, &cs->flags); spin_unlock_irq(&callback_lock); if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags)) goto out_unlock; /* * Clone @parent's configuration if CGRP_CPUSET_CLONE_CHILDREN is * set. This flag handling is implemented in cgroup core for * historical reasons - the flag may be specified during mount. * * Currently, if any sibling cpusets have exclusive cpus or mem, we * refuse to clone the configuration - thereby refusing the task to * be entered, and as a result refusing the sys_unshare() or * clone() which initiated it. If this becomes a problem for some * users who wish to allow that scenario, then this could be * changed to grant parent->cpus_allowed-sibling_cpus_exclusive * (and likewise for mems) to the new cgroup. */ rcu_read_lock(); cpuset_for_each_child(tmp_cs, pos_css, parent) { if (is_mem_exclusive(tmp_cs) || is_cpu_exclusive(tmp_cs)) { rcu_read_unlock(); goto out_unlock; } } rcu_read_unlock(); spin_lock_irq(&callback_lock); cs->mems_allowed = parent->mems_allowed; cs->effective_mems = parent->mems_allowed; cpumask_copy(cs->cpus_allowed, parent->cpus_allowed); cpumask_copy(cs->effective_cpus, parent->cpus_allowed); spin_unlock_irq(&callback_lock); out_unlock: mutex_unlock(&cpuset_mutex); cpus_read_unlock(); return 0; } /* * If the cpuset being removed has its flag 'sched_load_balance' * enabled, then simulate turning sched_load_balance off, which * will call rebuild_sched_domains_locked(). That is not needed * in the default hierarchy where only changes in partition * will cause repartitioning. * * If the cpuset has the 'sched.partition' flag enabled, simulate * turning 'sched.partition" off. */ static void cpuset_css_offline(struct cgroup_subsys_state *css) { struct cpuset *cs = css_cs(css); cpus_read_lock(); mutex_lock(&cpuset_mutex); if (is_partition_valid(cs)) update_prstate(cs, 0); if (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) && is_sched_load_balance(cs)) update_flag(CS_SCHED_LOAD_BALANCE, cs, 0); if (cs->use_parent_ecpus) { struct cpuset *parent = parent_cs(cs); cs->use_parent_ecpus = false; parent->child_ecpus_count--; } cpuset_dec(); clear_bit(CS_ONLINE, &cs->flags); mutex_unlock(&cpuset_mutex); cpus_read_unlock(); } static void cpuset_css_free(struct cgroup_subsys_state *css) { struct cpuset *cs = css_cs(css); free_cpuset(cs); } static void cpuset_bind(struct cgroup_subsys_state *root_css) { mutex_lock(&cpuset_mutex); spin_lock_irq(&callback_lock); if (is_in_v2_mode()) { cpumask_copy(top_cpuset.cpus_allowed, cpu_possible_mask); cpumask_copy(top_cpuset.effective_xcpus, cpu_possible_mask); top_cpuset.mems_allowed = node_possible_map; } else { cpumask_copy(top_cpuset.cpus_allowed, top_cpuset.effective_cpus); top_cpuset.mems_allowed = top_cpuset.effective_mems; } spin_unlock_irq(&callback_lock); mutex_unlock(&cpuset_mutex); } /* * In case the child is cloned into a cpuset different from its parent, * additional checks are done to see if the move is allowed. */ static int cpuset_can_fork(struct task_struct *task, struct css_set *cset) { struct cpuset *cs = css_cs(cset->subsys[cpuset_cgrp_id]); bool same_cs; int ret; rcu_read_lock(); same_cs = (cs == task_cs(current)); rcu_read_unlock(); if (same_cs) return 0; lockdep_assert_held(&cgroup_mutex); mutex_lock(&cpuset_mutex); /* Check to see if task is allowed in the cpuset */ ret = cpuset_can_attach_check(cs); if (ret) goto out_unlock; ret = task_can_attach(task); if (ret) goto out_unlock; ret = security_task_setscheduler(task); if (ret) goto out_unlock; /* * Mark attach is in progress. This makes validate_change() fail * changes which zero cpus/mems_allowed. */ cs->attach_in_progress++; out_unlock: mutex_unlock(&cpuset_mutex); return ret; } static void cpuset_cancel_fork(struct task_struct *task, struct css_set *cset) { struct cpuset *cs = css_cs(cset->subsys[cpuset_cgrp_id]); bool same_cs; rcu_read_lock(); same_cs = (cs == task_cs(current)); rcu_read_unlock(); if (same_cs) return; mutex_lock(&cpuset_mutex); cs->attach_in_progress--; if (!cs->attach_in_progress) wake_up(&cpuset_attach_wq); mutex_unlock(&cpuset_mutex); } /* * Make sure the new task conform to the current state of its parent, * which could have been changed by cpuset just after it inherits the * state from the parent and before it sits on the cgroup's task list. */ static void cpuset_fork(struct task_struct *task) { struct cpuset *cs; bool same_cs; rcu_read_lock(); cs = task_cs(task); same_cs = (cs == task_cs(current)); rcu_read_unlock(); if (same_cs) { if (cs == &top_cpuset) return; set_cpus_allowed_ptr(task, current->cpus_ptr); task->mems_allowed = current->mems_allowed; return; } /* CLONE_INTO_CGROUP */ mutex_lock(&cpuset_mutex); guarantee_online_mems(cs, &cpuset_attach_nodemask_to); cpuset_attach_task(cs, task); cs->attach_in_progress--; if (!cs->attach_in_progress) wake_up(&cpuset_attach_wq); mutex_unlock(&cpuset_mutex); } struct cgroup_subsys cpuset_cgrp_subsys = { .css_alloc = cpuset_css_alloc, .css_online = cpuset_css_online, .css_offline = cpuset_css_offline, .css_free = cpuset_css_free, .can_attach = cpuset_can_attach, .cancel_attach = cpuset_cancel_attach, .attach = cpuset_attach, .post_attach = cpuset_post_attach, .bind = cpuset_bind, .can_fork = cpuset_can_fork, .cancel_fork = cpuset_cancel_fork, .fork = cpuset_fork, .legacy_cftypes = legacy_files, .dfl_cftypes = dfl_files, .early_init = true, .threaded = true, }; /** * cpuset_init - initialize cpusets at system boot * * Description: Initialize top_cpuset **/ int __init cpuset_init(void) { BUG_ON(!alloc_cpumask_var(&top_cpuset.cpus_allowed, GFP_KERNEL)); BUG_ON(!alloc_cpumask_var(&top_cpuset.effective_cpus, GFP_KERNEL)); BUG_ON(!alloc_cpumask_var(&top_cpuset.effective_xcpus, GFP_KERNEL)); BUG_ON(!alloc_cpumask_var(&top_cpuset.exclusive_cpus, GFP_KERNEL)); BUG_ON(!zalloc_cpumask_var(&subpartitions_cpus, GFP_KERNEL)); BUG_ON(!zalloc_cpumask_var(&isolated_cpus, GFP_KERNEL)); cpumask_setall(top_cpuset.cpus_allowed); nodes_setall(top_cpuset.mems_allowed); cpumask_setall(top_cpuset.effective_cpus); cpumask_setall(top_cpuset.effective_xcpus); cpumask_setall(top_cpuset.exclusive_cpus); nodes_setall(top_cpuset.effective_mems); fmeter_init(&top_cpuset.fmeter); set_bit(CS_SCHED_LOAD_BALANCE, &top_cpuset.flags); top_cpuset.relax_domain_level = -1; INIT_LIST_HEAD(&remote_children); BUG_ON(!alloc_cpumask_var(&cpus_attach, GFP_KERNEL)); return 0; } /* * If CPU and/or memory hotplug handlers, below, unplug any CPUs * or memory nodes, we need to walk over the cpuset hierarchy, * removing that CPU or node from all cpusets. If this removes the * last CPU or node from a cpuset, then move the tasks in the empty * cpuset to its next-highest non-empty parent. */ static void remove_tasks_in_empty_cpuset(struct cpuset *cs) { struct cpuset *parent; /* * Find its next-highest non-empty parent, (top cpuset * has online cpus, so can't be empty). */ parent = parent_cs(cs); while (cpumask_empty(parent->cpus_allowed) || nodes_empty(parent->mems_allowed)) parent = parent_cs(parent); if (cgroup_transfer_tasks(parent->css.cgroup, cs->css.cgroup)) { pr_err("cpuset: failed to transfer tasks out of empty cpuset "); pr_cont_cgroup_name(cs->css.cgroup); pr_cont("\n"); } } static void hotplug_update_tasks_legacy(struct cpuset *cs, struct cpumask *new_cpus, nodemask_t *new_mems, bool cpus_updated, bool mems_updated) { bool is_empty; spin_lock_irq(&callback_lock); cpumask_copy(cs->cpus_allowed, new_cpus); cpumask_copy(cs->effective_cpus, new_cpus); cs->mems_allowed = *new_mems; cs->effective_mems = *new_mems; spin_unlock_irq(&callback_lock); /* * Don't call update_tasks_cpumask() if the cpuset becomes empty, * as the tasks will be migrated to an ancestor. */ if (cpus_updated && !cpumask_empty(cs->cpus_allowed)) update_tasks_cpumask(cs, new_cpus); if (mems_updated && !nodes_empty(cs->mems_allowed)) update_tasks_nodemask(cs); is_empty = cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed); /* * Move tasks to the nearest ancestor with execution resources, * This is full cgroup operation which will also call back into * cpuset. Should be done outside any lock. */ if (is_empty) { mutex_unlock(&cpuset_mutex); remove_tasks_in_empty_cpuset(cs); mutex_lock(&cpuset_mutex); } } static void hotplug_update_tasks(struct cpuset *cs, struct cpumask *new_cpus, nodemask_t *new_mems, bool cpus_updated, bool mems_updated) { /* A partition root is allowed to have empty effective cpus */ if (cpumask_empty(new_cpus) && !is_partition_valid(cs)) cpumask_copy(new_cpus, parent_cs(cs)->effective_cpus); if (nodes_empty(*new_mems)) *new_mems = parent_cs(cs)->effective_mems; spin_lock_irq(&callback_lock); cpumask_copy(cs->effective_cpus, new_cpus); cs->effective_mems = *new_mems; spin_unlock_irq(&callback_lock); if (cpus_updated) update_tasks_cpumask(cs, new_cpus); if (mems_updated) update_tasks_nodemask(cs); } static bool force_rebuild; void cpuset_force_rebuild(void) { force_rebuild = true; } /* * Attempt to acquire a cpus_read_lock while a hotplug operation may be in * progress. * Return: true if successful, false otherwise * * To avoid circular lock dependency between cpuset_mutex and cpus_read_lock, * cpus_read_trylock() is used here to acquire the lock. */ static bool cpuset_hotplug_cpus_read_trylock(void) { int retries = 0; while (!cpus_read_trylock()) { /* * CPU hotplug still in progress. Retry 5 times * with a 10ms wait before bailing out. */ if (++retries > 5) return false; msleep(10); } return true; } /** * cpuset_hotplug_update_tasks - update tasks in a cpuset for hotunplug * @cs: cpuset in interest * @tmp: the tmpmasks structure pointer * * Compare @cs's cpu and mem masks against top_cpuset and if some have gone * offline, update @cs accordingly. If @cs ends up with no CPU or memory, * all its tasks are moved to the nearest ancestor with both resources. */ static void cpuset_hotplug_update_tasks(struct cpuset *cs, struct tmpmasks *tmp) { static cpumask_t new_cpus; static nodemask_t new_mems; bool cpus_updated; bool mems_updated; bool remote; int partcmd = -1; struct cpuset *parent; retry: wait_event(cpuset_attach_wq, cs->attach_in_progress == 0); mutex_lock(&cpuset_mutex); /* * We have raced with task attaching. We wait until attaching * is finished, so we won't attach a task to an empty cpuset. */ if (cs->attach_in_progress) { mutex_unlock(&cpuset_mutex); goto retry; } parent = parent_cs(cs); compute_effective_cpumask(&new_cpus, cs, parent); nodes_and(new_mems, cs->mems_allowed, parent->effective_mems); if (!tmp || !cs->partition_root_state) goto update_tasks; /* * Compute effective_cpus for valid partition root, may invalidate * child partition roots if necessary. */ remote = is_remote_partition(cs); if (remote || (is_partition_valid(cs) && is_partition_valid(parent))) compute_partition_effective_cpumask(cs, &new_cpus); if (remote && cpumask_empty(&new_cpus) && partition_is_populated(cs, NULL) && cpuset_hotplug_cpus_read_trylock()) { remote_partition_disable(cs, tmp); compute_effective_cpumask(&new_cpus, cs, parent); remote = false; cpuset_force_rebuild(); cpus_read_unlock(); } /* * Force the partition to become invalid if either one of * the following conditions hold: * 1) empty effective cpus but not valid empty partition. * 2) parent is invalid or doesn't grant any cpus to child * partitions. */ if (is_local_partition(cs) && (!is_partition_valid(parent) || tasks_nocpu_error(parent, cs, &new_cpus))) partcmd = partcmd_invalidate; /* * On the other hand, an invalid partition root may be transitioned * back to a regular one. */ else if (is_partition_valid(parent) && is_partition_invalid(cs)) partcmd = partcmd_update; /* * cpus_read_lock needs to be held before calling * update_parent_effective_cpumask(). To avoid circular lock * dependency between cpuset_mutex and cpus_read_lock, * cpus_read_trylock() is used here to acquire the lock. */ if (partcmd >= 0) { if (!cpuset_hotplug_cpus_read_trylock()) goto update_tasks; update_parent_effective_cpumask(cs, partcmd, NULL, tmp); cpus_read_unlock(); if ((partcmd == partcmd_invalidate) || is_partition_valid(cs)) { compute_partition_effective_cpumask(cs, &new_cpus); cpuset_force_rebuild(); } } update_tasks: cpus_updated = !cpumask_equal(&new_cpus, cs->effective_cpus); mems_updated = !nodes_equal(new_mems, cs->effective_mems); if (!cpus_updated && !mems_updated) goto unlock; /* Hotplug doesn't affect this cpuset */ if (mems_updated) check_insane_mems_config(&new_mems); if (is_in_v2_mode()) hotplug_update_tasks(cs, &new_cpus, &new_mems, cpus_updated, mems_updated); else hotplug_update_tasks_legacy(cs, &new_cpus, &new_mems, cpus_updated, mems_updated); unlock: mutex_unlock(&cpuset_mutex); } /** * cpuset_hotplug_workfn - handle CPU/memory hotunplug for a cpuset * @work: unused * * This function is called after either CPU or memory configuration has * changed and updates cpuset accordingly. The top_cpuset is always * synchronized to cpu_active_mask and N_MEMORY, which is necessary in * order to make cpusets transparent (of no affect) on systems that are * actively using CPU hotplug but making no active use of cpusets. * * Non-root cpusets are only affected by offlining. If any CPUs or memory * nodes have been taken down, cpuset_hotplug_update_tasks() is invoked on * all descendants. * * Note that CPU offlining during suspend is ignored. We don't modify * cpusets across suspend/resume cycles at all. */ static void cpuset_hotplug_workfn(struct work_struct *work) { static cpumask_t new_cpus; static nodemask_t new_mems; bool cpus_updated, mems_updated; bool on_dfl = is_in_v2_mode(); struct tmpmasks tmp, *ptmp = NULL; if (on_dfl && !alloc_cpumasks(NULL, &tmp)) ptmp = &tmp; mutex_lock(&cpuset_mutex); /* fetch the available cpus/mems and find out which changed how */ cpumask_copy(&new_cpus, cpu_active_mask); new_mems = node_states[N_MEMORY]; /* * If subpartitions_cpus is populated, it is likely that the check * below will produce a false positive on cpus_updated when the cpu * list isn't changed. It is extra work, but it is better to be safe. */ cpus_updated = !cpumask_equal(top_cpuset.effective_cpus, &new_cpus) || !cpumask_empty(subpartitions_cpus); mems_updated = !nodes_equal(top_cpuset.effective_mems, new_mems); /* * In the rare case that hotplug removes all the cpus in * subpartitions_cpus, we assumed that cpus are updated. */ if (!cpus_updated && top_cpuset.nr_subparts) cpus_updated = true; /* For v1, synchronize cpus_allowed to cpu_active_mask */ if (cpus_updated) { spin_lock_irq(&callback_lock); if (!on_dfl) cpumask_copy(top_cpuset.cpus_allowed, &new_cpus); /* * Make sure that CPUs allocated to child partitions * do not show up in effective_cpus. If no CPU is left, * we clear the subpartitions_cpus & let the child partitions * fight for the CPUs again. */ if (!cpumask_empty(subpartitions_cpus)) { if (cpumask_subset(&new_cpus, subpartitions_cpus)) { top_cpuset.nr_subparts = 0; cpumask_clear(subpartitions_cpus); } else { cpumask_andnot(&new_cpus, &new_cpus, subpartitions_cpus); } } cpumask_copy(top_cpuset.effective_cpus, &new_cpus); spin_unlock_irq(&callback_lock); /* we don't mess with cpumasks of tasks in top_cpuset */ } /* synchronize mems_allowed to N_MEMORY */ if (mems_updated) { spin_lock_irq(&callback_lock); if (!on_dfl) top_cpuset.mems_allowed = new_mems; top_cpuset.effective_mems = new_mems; spin_unlock_irq(&callback_lock); update_tasks_nodemask(&top_cpuset); } mutex_unlock(&cpuset_mutex); /* if cpus or mems changed, we need to propagate to descendants */ if (cpus_updated || mems_updated) { struct cpuset *cs; struct cgroup_subsys_state *pos_css; rcu_read_lock(); cpuset_for_each_descendant_pre(cs, pos_css, &top_cpuset) { if (cs == &top_cpuset || !css_tryget_online(&cs->css)) continue; rcu_read_unlock(); cpuset_hotplug_update_tasks(cs, ptmp); rcu_read_lock(); css_put(&cs->css); } rcu_read_unlock(); } /* rebuild sched domains if cpus_allowed has changed */ if (cpus_updated || force_rebuild) { force_rebuild = false; rebuild_sched_domains(); } free_cpumasks(NULL, ptmp); } void cpuset_update_active_cpus(void) { /* * We're inside cpu hotplug critical region which usually nests * inside cgroup synchronization. Bounce actual hotplug processing * to a work item to avoid reverse locking order. */ schedule_work(&cpuset_hotplug_work); } void cpuset_wait_for_hotplug(void) { flush_work(&cpuset_hotplug_work); } /* * Keep top_cpuset.mems_allowed tracking node_states[N_MEMORY]. * Call this routine anytime after node_states[N_MEMORY] changes. * See cpuset_update_active_cpus() for CPU hotplug handling. */ static int cpuset_track_online_nodes(struct notifier_block *self, unsigned long action, void *arg) { schedule_work(&cpuset_hotplug_work); return NOTIFY_OK; } /** * cpuset_init_smp - initialize cpus_allowed * * Description: Finish top cpuset after cpu, node maps are initialized */ void __init cpuset_init_smp(void) { /* * cpus_allowd/mems_allowed set to v2 values in the initial * cpuset_bind() call will be reset to v1 values in another * cpuset_bind() call when v1 cpuset is mounted. */ top_cpuset.old_mems_allowed = top_cpuset.mems_allowed; cpumask_copy(top_cpuset.effective_cpus, cpu_active_mask); top_cpuset.effective_mems = node_states[N_MEMORY]; hotplug_memory_notifier(cpuset_track_online_nodes, CPUSET_CALLBACK_PRI); cpuset_migrate_mm_wq = alloc_ordered_workqueue("cpuset_migrate_mm", 0); BUG_ON(!cpuset_migrate_mm_wq); } /** * cpuset_cpus_allowed - return cpus_allowed mask from a tasks cpuset. * @tsk: pointer to task_struct from which to obtain cpuset->cpus_allowed. * @pmask: pointer to struct cpumask variable to receive cpus_allowed set. * * Description: Returns the cpumask_var_t cpus_allowed of the cpuset * attached to the specified @tsk. Guaranteed to return some non-empty * subset of cpu_online_mask, even if this means going outside the * tasks cpuset, except when the task is in the top cpuset. **/ void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask) { unsigned long flags; struct cpuset *cs; spin_lock_irqsave(&callback_lock, flags); rcu_read_lock(); cs = task_cs(tsk); if (cs != &top_cpuset) guarantee_online_cpus(tsk, pmask); /* * Tasks in the top cpuset won't get update to their cpumasks * when a hotplug online/offline event happens. So we include all * offline cpus in the allowed cpu list. */ if ((cs == &top_cpuset) || cpumask_empty(pmask)) { const struct cpumask *possible_mask = task_cpu_possible_mask(tsk); /* * We first exclude cpus allocated to partitions. If there is no * allowable online cpu left, we fall back to all possible cpus. */ cpumask_andnot(pmask, possible_mask, subpartitions_cpus); if (!cpumask_intersects(pmask, cpu_online_mask)) cpumask_copy(pmask, possible_mask); } rcu_read_unlock(); spin_unlock_irqrestore(&callback_lock, flags); } /** * cpuset_cpus_allowed_fallback - final fallback before complete catastrophe. * @tsk: pointer to task_struct with which the scheduler is struggling * * Description: In the case that the scheduler cannot find an allowed cpu in * tsk->cpus_allowed, we fall back to task_cs(tsk)->cpus_allowed. In legacy * mode however, this value is the same as task_cs(tsk)->effective_cpus, * which will not contain a sane cpumask during cases such as cpu hotplugging. * This is the absolute last resort for the scheduler and it is only used if * _every_ other avenue has been traveled. * * Returns true if the affinity of @tsk was changed, false otherwise. **/ bool cpuset_cpus_allowed_fallback(struct task_struct *tsk) { const struct cpumask *possible_mask = task_cpu_possible_mask(tsk); const struct cpumask *cs_mask; bool changed = false; rcu_read_lock(); cs_mask = task_cs(tsk)->cpus_allowed; if (is_in_v2_mode() && cpumask_subset(cs_mask, possible_mask)) { do_set_cpus_allowed(tsk, cs_mask); changed = true; } rcu_read_unlock(); /* * We own tsk->cpus_allowed, nobody can change it under us. * * But we used cs && cs->cpus_allowed lockless and thus can * race with cgroup_attach_task() or update_cpumask() and get * the wrong tsk->cpus_allowed. However, both cases imply the * subsequent cpuset_change_cpumask()->set_cpus_allowed_ptr() * which takes task_rq_lock(). * * If we are called after it dropped the lock we must see all * changes in tsk_cs()->cpus_allowed. Otherwise we can temporary * set any mask even if it is not right from task_cs() pov, * the pending set_cpus_allowed_ptr() will fix things. * * select_fallback_rq() will fix things ups and set cpu_possible_mask * if required. */ return changed; } void __init cpuset_init_current_mems_allowed(void) { nodes_setall(current->mems_allowed); } /** * cpuset_mems_allowed - return mems_allowed mask from a tasks cpuset. * @tsk: pointer to task_struct from which to obtain cpuset->mems_allowed. * * Description: Returns the nodemask_t mems_allowed of the cpuset * attached to the specified @tsk. Guaranteed to return some non-empty * subset of node_states[N_MEMORY], even if this means going outside the * tasks cpuset. **/ nodemask_t cpuset_mems_allowed(struct task_struct *tsk) { nodemask_t mask; unsigned long flags; spin_lock_irqsave(&callback_lock, flags); rcu_read_lock(); guarantee_online_mems(task_cs(tsk), &mask); rcu_read_unlock(); spin_unlock_irqrestore(&callback_lock, flags); return mask; } /** * cpuset_nodemask_valid_mems_allowed - check nodemask vs. current mems_allowed * @nodemask: the nodemask to be checked * * Are any of the nodes in the nodemask allowed in current->mems_allowed? */ int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask) { return nodes_intersects(*nodemask, current->mems_allowed); } /* * nearest_hardwall_ancestor() - Returns the nearest mem_exclusive or * mem_hardwall ancestor to the specified cpuset. Call holding * callback_lock. If no ancestor is mem_exclusive or mem_hardwall * (an unusual configuration), then returns the root cpuset. */ static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs) { while (!(is_mem_exclusive(cs) || is_mem_hardwall(cs)) && parent_cs(cs)) cs = parent_cs(cs); return cs; } /* * cpuset_node_allowed - Can we allocate on a memory node? * @node: is this an allowed node? * @gfp_mask: memory allocation flags * * If we're in interrupt, yes, we can always allocate. If @node is set in * current's mems_allowed, yes. If it's not a __GFP_HARDWALL request and this * node is set in the nearest hardwalled cpuset ancestor to current's cpuset, * yes. If current has access to memory reserves as an oom victim, yes. * Otherwise, no. * * GFP_USER allocations are marked with the __GFP_HARDWALL bit, * and do not allow allocations outside the current tasks cpuset * unless the task has been OOM killed. * GFP_KERNEL allocations are not so marked, so can escape to the * nearest enclosing hardwalled ancestor cpuset. * * Scanning up parent cpusets requires callback_lock. The * __alloc_pages() routine only calls here with __GFP_HARDWALL bit * _not_ set if it's a GFP_KERNEL allocation, and all nodes in the * current tasks mems_allowed came up empty on the first pass over * the zonelist. So only GFP_KERNEL allocations, if all nodes in the * cpuset are short of memory, might require taking the callback_lock. * * The first call here from mm/page_alloc:get_page_from_freelist() * has __GFP_HARDWALL set in gfp_mask, enforcing hardwall cpusets, * so no allocation on a node outside the cpuset is allowed (unless * in interrupt, of course). * * The second pass through get_page_from_freelist() doesn't even call * here for GFP_ATOMIC calls. For those calls, the __alloc_pages() * variable 'wait' is not set, and the bit ALLOC_CPUSET is not set * in alloc_flags. That logic and the checks below have the combined * affect that: * in_interrupt - any node ok (current task context irrelevant) * GFP_ATOMIC - any node ok * tsk_is_oom_victim - any node ok * GFP_KERNEL - any node in enclosing hardwalled cpuset ok * GFP_USER - only nodes in current tasks mems allowed ok. */ bool cpuset_node_allowed(int node, gfp_t gfp_mask) { struct cpuset *cs; /* current cpuset ancestors */ bool allowed; /* is allocation in zone z allowed? */ unsigned long flags; if (in_interrupt()) return true; if (node_isset(node, current->mems_allowed)) return true; /* * Allow tasks that have access to memory reserves because they have * been OOM killed to get memory anywhere. */ if (unlikely(tsk_is_oom_victim(current))) return true; if (gfp_mask & __GFP_HARDWALL) /* If hardwall request, stop here */ return false; if (current->flags & PF_EXITING) /* Let dying task have memory */ return true; /* Not hardwall and node outside mems_allowed: scan up cpusets */ spin_lock_irqsave(&callback_lock, flags); rcu_read_lock(); cs = nearest_hardwall_ancestor(task_cs(current)); allowed = node_isset(node, cs->mems_allowed); rcu_read_unlock(); spin_unlock_irqrestore(&callback_lock, flags); return allowed; } /** * cpuset_spread_node() - On which node to begin search for a page * @rotor: round robin rotor * * If a task is marked PF_SPREAD_PAGE or PF_SPREAD_SLAB (as for * tasks in a cpuset with is_spread_page or is_spread_slab set), * and if the memory allocation used cpuset_mem_spread_node() * to determine on which node to start looking, as it will for * certain page cache or slab cache pages such as used for file * system buffers and inode caches, then instead of starting on the * local node to look for a free page, rather spread the starting * node around the tasks mems_allowed nodes. * * We don't have to worry about the returned node being offline * because "it can't happen", and even if it did, it would be ok. * * The routines calling guarantee_online_mems() are careful to * only set nodes in task->mems_allowed that are online. So it * should not be possible for the following code to return an * offline node. But if it did, that would be ok, as this routine * is not returning the node where the allocation must be, only * the node where the search should start. The zonelist passed to * __alloc_pages() will include all nodes. If the slab allocator * is passed an offline node, it will fall back to the local node. * See kmem_cache_alloc_node(). */ static int cpuset_spread_node(int *rotor) { return *rotor = next_node_in(*rotor, current->mems_allowed); } /** * cpuset_mem_spread_node() - On which node to begin search for a file page */ int cpuset_mem_spread_node(void) { if (current->cpuset_mem_spread_rotor == NUMA_NO_NODE) current->cpuset_mem_spread_rotor = node_random(¤t->mems_allowed); return cpuset_spread_node(¤t->cpuset_mem_spread_rotor); } /** * cpuset_slab_spread_node() - On which node to begin search for a slab page */ int cpuset_slab_spread_node(void) { if (current->cpuset_slab_spread_rotor == NUMA_NO_NODE) current->cpuset_slab_spread_rotor = node_random(¤t->mems_allowed); return cpuset_spread_node(¤t->cpuset_slab_spread_rotor); } EXPORT_SYMBOL_GPL(cpuset_mem_spread_node); /** * cpuset_mems_allowed_intersects - Does @tsk1's mems_allowed intersect @tsk2's? * @tsk1: pointer to task_struct of some task. * @tsk2: pointer to task_struct of some other task. * * Description: Return true if @tsk1's mems_allowed intersects the * mems_allowed of @tsk2. Used by the OOM killer to determine if * one of the task's memory usage might impact the memory available * to the other. **/ int cpuset_mems_allowed_intersects(const struct task_struct *tsk1, const struct task_struct *tsk2) { return nodes_intersects(tsk1->mems_allowed, tsk2->mems_allowed); } /** * cpuset_print_current_mems_allowed - prints current's cpuset and mems_allowed * * Description: Prints current's name, cpuset name, and cached copy of its * mems_allowed to the kernel log. */ void cpuset_print_current_mems_allowed(void) { struct cgroup *cgrp; rcu_read_lock(); cgrp = task_cs(current)->css.cgroup; pr_cont(",cpuset="); pr_cont_cgroup_name(cgrp); pr_cont(",mems_allowed=%*pbl", nodemask_pr_args(¤t->mems_allowed)); rcu_read_unlock(); } /* * Collection of memory_pressure is suppressed unless * this flag is enabled by writing "1" to the special * cpuset file 'memory_pressure_enabled' in the root cpuset. */ int cpuset_memory_pressure_enabled __read_mostly; /* * __cpuset_memory_pressure_bump - keep stats of per-cpuset reclaims. * * Keep a running average of the rate of synchronous (direct) * page reclaim efforts initiated by tasks in each cpuset. * * This represents the rate at which some task in the cpuset * ran low on memory on all nodes it was allowed to use, and * had to enter the kernels page reclaim code in an effort to * create more free memory by tossing clean pages or swapping * or writing dirty pages. * * Display to user space in the per-cpuset read-only file * "memory_pressure". Value displayed is an integer * representing the recent rate of entry into the synchronous * (direct) page reclaim by any task attached to the cpuset. */ void __cpuset_memory_pressure_bump(void) { rcu_read_lock(); fmeter_markevent(&task_cs(current)->fmeter); rcu_read_unlock(); } #ifdef CONFIG_PROC_PID_CPUSET /* * proc_cpuset_show() * - Print tasks cpuset path into seq_file. * - Used for /proc/<pid>/cpuset. * - No need to task_lock(tsk) on this tsk->cpuset reference, as it * doesn't really matter if tsk->cpuset changes after we read it, * and we take cpuset_mutex, keeping cpuset_attach() from changing it * anyway. */ int proc_cpuset_show(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *tsk) { char *buf; struct cgroup_subsys_state *css; int retval; retval = -ENOMEM; buf = kmalloc(PATH_MAX, GFP_KERNEL); if (!buf) goto out; css = task_get_css(tsk, cpuset_cgrp_id); retval = cgroup_path_ns(css->cgroup, buf, PATH_MAX, current->nsproxy->cgroup_ns); css_put(css); if (retval == -E2BIG) retval = -ENAMETOOLONG; if (retval < 0) goto out_free; seq_puts(m, buf); seq_putc(m, '\n'); retval = 0; out_free: kfree(buf); out: return retval; } #endif /* CONFIG_PROC_PID_CPUSET */ /* Display task mems_allowed in /proc/<pid>/status file. */ void cpuset_task_status_allowed(struct seq_file *m, struct task_struct *task) { seq_printf(m, "Mems_allowed:\t%*pb\n", nodemask_pr_args(&task->mems_allowed)); seq_printf(m, "Mems_allowed_list:\t%*pbl\n", nodemask_pr_args(&task->mems_allowed)); } |
53406 5 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_UNWIND_H #define _ASM_X86_UNWIND_H #include <linux/sched.h> #include <linux/ftrace.h> #include <linux/rethook.h> #include <asm/ptrace.h> #include <asm/stacktrace.h> #define IRET_FRAME_OFFSET (offsetof(struct pt_regs, ip)) #define IRET_FRAME_SIZE (sizeof(struct pt_regs) - IRET_FRAME_OFFSET) struct unwind_state { struct stack_info stack_info; unsigned long stack_mask; struct task_struct *task; int graph_idx; #if defined(CONFIG_RETHOOK) struct llist_node *kr_cur; #endif bool error; #if defined(CONFIG_UNWINDER_ORC) bool signal, full_regs; unsigned long sp, bp, ip; struct pt_regs *regs, *prev_regs; #elif defined(CONFIG_UNWINDER_FRAME_POINTER) bool got_irq; unsigned long *bp, *orig_sp, ip; /* * If non-NULL: The current frame is incomplete and doesn't contain a * valid BP. When looking for the next frame, use this instead of the * non-existent saved BP. */ unsigned long *next_bp; struct pt_regs *regs; #else unsigned long *sp; #endif }; void __unwind_start(struct unwind_state *state, struct task_struct *task, struct pt_regs *regs, unsigned long *first_frame); bool unwind_next_frame(struct unwind_state *state); unsigned long unwind_get_return_address(struct unwind_state *state); unsigned long *unwind_get_return_address_ptr(struct unwind_state *state); static inline bool unwind_done(struct unwind_state *state) { return state->stack_info.type == STACK_TYPE_UNKNOWN; } static inline bool unwind_error(struct unwind_state *state) { return state->error; } static inline void unwind_start(struct unwind_state *state, struct task_struct *task, struct pt_regs *regs, unsigned long *first_frame) { first_frame = first_frame ? : get_stack_pointer(task, regs); __unwind_start(state, task, regs, first_frame); } #if defined(CONFIG_UNWINDER_ORC) || defined(CONFIG_UNWINDER_FRAME_POINTER) /* * If 'partial' returns true, only the iret frame registers are valid. */ static inline struct pt_regs *unwind_get_entry_regs(struct unwind_state *state, bool *partial) { if (unwind_done(state)) return NULL; if (partial) { #ifdef CONFIG_UNWINDER_ORC *partial = !state->full_regs; #else *partial = false; #endif } return state->regs; } #else static inline struct pt_regs *unwind_get_entry_regs(struct unwind_state *state, bool *partial) { return NULL; } #endif #ifdef CONFIG_UNWINDER_ORC void unwind_init(void); void unwind_module_init(struct module *mod, void *orc_ip, size_t orc_ip_size, void *orc, size_t orc_size); #else static inline void unwind_init(void) {} static inline void unwind_module_init(struct module *mod, void *orc_ip, size_t orc_ip_size, void *orc, size_t orc_size) {} #endif static inline unsigned long unwind_recover_rethook(struct unwind_state *state, unsigned long addr, unsigned long *addr_p) { #ifdef CONFIG_RETHOOK if (is_rethook_trampoline(addr)) return rethook_find_ret_addr(state->task, (unsigned long)addr_p, &state->kr_cur); #endif return addr; } /* Recover the return address modified by rethook and ftrace_graph. */ static inline unsigned long unwind_recover_ret_addr(struct unwind_state *state, unsigned long addr, unsigned long *addr_p) { unsigned long ret; ret = ftrace_graph_ret_addr(state->task, &state->graph_idx, addr, addr_p); return unwind_recover_rethook(state, ret, addr_p); } /* * This disables KASAN checking when reading a value from another task's stack, * since the other task could be running on another CPU and could have poisoned * the stack in the meantime. */ #define READ_ONCE_TASK_STACK(task, x) \ ({ \ unsigned long val; \ if (task == current) \ val = READ_ONCE(x); \ else \ val = READ_ONCE_NOCHECK(x); \ val; \ }) static inline bool task_on_another_cpu(struct task_struct *task) { #ifdef CONFIG_SMP return task != current && task->on_cpu; #else return false; #endif } #endif /* _ASM_X86_UNWIND_H */ |
4163 860 284 847 839 11 4 10 11 7 846 845 849 849 285 8 828 474 832 19 23 90 848 13 22 21 12 506 186 364 5 310 20 20 20 20 19 5 17 17 11 4 17 17 849 835 17 17 17 4 17 17 17 4 16 17 838 588 588 590 445 452 283 281 117 117 117 1 243 5 19 506 572 17 4 13 519 519 9 9 1 10 508 325 325 325 325 5 11 14 15 15 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 | // SPDX-License-Identifier: GPL-2.0-only /* * mm/readahead.c - address_space-level file readahead. * * Copyright (C) 2002, Linus Torvalds * * 09Apr2002 Andrew Morton * Initial version. */ /** * DOC: Readahead Overview * * Readahead is used to read content into the page cache before it is * explicitly requested by the application. Readahead only ever * attempts to read folios that are not yet in the page cache. If a * folio is present but not up-to-date, readahead will not try to read * it. In that case a simple ->read_folio() will be requested. * * Readahead is triggered when an application read request (whether a * system call or a page fault) finds that the requested folio is not in * the page cache, or that it is in the page cache and has the * readahead flag set. This flag indicates that the folio was read * as part of a previous readahead request and now that it has been * accessed, it is time for the next readahead. * * Each readahead request is partly synchronous read, and partly async * readahead. This is reflected in the struct file_ra_state which * contains ->size being the total number of pages, and ->async_size * which is the number of pages in the async section. The readahead * flag will be set on the first folio in this async section to trigger * a subsequent readahead. Once a series of sequential reads has been * established, there should be no need for a synchronous component and * all readahead request will be fully asynchronous. * * When either of the triggers causes a readahead, three numbers need * to be determined: the start of the region to read, the size of the * region, and the size of the async tail. * * The start of the region is simply the first page address at or after * the accessed address, which is not currently populated in the page * cache. This is found with a simple search in the page cache. * * The size of the async tail is determined by subtracting the size that * was explicitly requested from the determined request size, unless * this would be less than zero - then zero is used. NOTE THIS * CALCULATION IS WRONG WHEN THE START OF THE REGION IS NOT THE ACCESSED * PAGE. ALSO THIS CALCULATION IS NOT USED CONSISTENTLY. * * The size of the region is normally determined from the size of the * previous readahead which loaded the preceding pages. This may be * discovered from the struct file_ra_state for simple sequential reads, * or from examining the state of the page cache when multiple * sequential reads are interleaved. Specifically: where the readahead * was triggered by the readahead flag, the size of the previous * readahead is assumed to be the number of pages from the triggering * page to the start of the new readahead. In these cases, the size of * the previous readahead is scaled, often doubled, for the new * readahead, though see get_next_ra_size() for details. * * If the size of the previous read cannot be determined, the number of * preceding pages in the page cache is used to estimate the size of * a previous read. This estimate could easily be misled by random * reads being coincidentally adjacent, so it is ignored unless it is * larger than the current request, and it is not scaled up, unless it * is at the start of file. * * In general readahead is accelerated at the start of the file, as * reads from there are often sequential. There are other minor * adjustments to the readahead size in various special cases and these * are best discovered by reading the code. * * The above calculation, based on the previous readahead size, * determines the size of the readahead, to which any requested read * size may be added. * * Readahead requests are sent to the filesystem using the ->readahead() * address space operation, for which mpage_readahead() is a canonical * implementation. ->readahead() should normally initiate reads on all * folios, but may fail to read any or all folios without causing an I/O * error. The page cache reading code will issue a ->read_folio() request * for any folio which ->readahead() did not read, and only an error * from this will be final. * * ->readahead() will generally call readahead_folio() repeatedly to get * each folio from those prepared for readahead. It may fail to read a * folio by: * * * not calling readahead_folio() sufficiently many times, effectively * ignoring some folios, as might be appropriate if the path to * storage is congested. * * * failing to actually submit a read request for a given folio, * possibly due to insufficient resources, or * * * getting an error during subsequent processing of a request. * * In the last two cases, the folio should be unlocked by the filesystem * to indicate that the read attempt has failed. In the first case the * folio will be unlocked by the VFS. * * Those folios not in the final ``async_size`` of the request should be * considered to be important and ->readahead() should not fail them due * to congestion or temporary resource unavailability, but should wait * for necessary resources (e.g. memory or indexing information) to * become available. Folios in the final ``async_size`` may be * considered less urgent and failure to read them is more acceptable. * In this case it is best to use filemap_remove_folio() to remove the * folios from the page cache as is automatically done for folios that * were not fetched with readahead_folio(). This will allow a * subsequent synchronous readahead request to try them again. If they * are left in the page cache, then they will be read individually using * ->read_folio() which may be less efficient. */ #include <linux/blkdev.h> #include <linux/kernel.h> #include <linux/dax.h> #include <linux/gfp.h> #include <linux/export.h> #include <linux/backing-dev.h> #include <linux/task_io_accounting_ops.h> #include <linux/pagemap.h> #include <linux/psi.h> #include <linux/syscalls.h> #include <linux/file.h> #include <linux/mm_inline.h> #include <linux/blk-cgroup.h> #include <linux/fadvise.h> #include <linux/sched/mm.h> #include "internal.h" /* * Initialise a struct file's readahead state. Assumes that the caller has * memset *ra to zero. */ void file_ra_state_init(struct file_ra_state *ra, struct address_space *mapping) { ra->ra_pages = inode_to_bdi(mapping->host)->ra_pages; ra->prev_pos = -1; } EXPORT_SYMBOL_GPL(file_ra_state_init); static void read_pages(struct readahead_control *rac) { const struct address_space_operations *aops = rac->mapping->a_ops; struct folio *folio; struct blk_plug plug; if (!readahead_count(rac)) return; if (unlikely(rac->_workingset)) psi_memstall_enter(&rac->_pflags); blk_start_plug(&plug); if (aops->readahead) { aops->readahead(rac); /* * Clean up the remaining folios. The sizes in ->ra * may be used to size the next readahead, so make sure * they accurately reflect what happened. */ while ((folio = readahead_folio(rac)) != NULL) { unsigned long nr = folio_nr_pages(folio); folio_get(folio); rac->ra->size -= nr; if (rac->ra->async_size >= nr) { rac->ra->async_size -= nr; filemap_remove_folio(folio); } folio_unlock(folio); folio_put(folio); } } else { while ((folio = readahead_folio(rac)) != NULL) aops->read_folio(rac->file, folio); } blk_finish_plug(&plug); if (unlikely(rac->_workingset)) psi_memstall_leave(&rac->_pflags); rac->_workingset = false; BUG_ON(readahead_count(rac)); } /** * page_cache_ra_unbounded - Start unchecked readahead. * @ractl: Readahead control. * @nr_to_read: The number of pages to read. * @lookahead_size: Where to start the next readahead. * * This function is for filesystems to call when they want to start * readahead beyond a file's stated i_size. This is almost certainly * not the function you want to call. Use page_cache_async_readahead() * or page_cache_sync_readahead() instead. * * Context: File is referenced by caller. Mutexes may be held by caller. * May sleep, but will not reenter filesystem to reclaim memory. */ void page_cache_ra_unbounded(struct readahead_control *ractl, unsigned long nr_to_read, unsigned long lookahead_size) { struct address_space *mapping = ractl->mapping; unsigned long index = readahead_index(ractl); gfp_t gfp_mask = readahead_gfp_mask(mapping); unsigned long i; /* * Partway through the readahead operation, we will have added * locked pages to the page cache, but will not yet have submitted * them for I/O. Adding another page may need to allocate memory, * which can trigger memory reclaim. Telling the VM we're in * the middle of a filesystem operation will cause it to not * touch file-backed pages, preventing a deadlock. Most (all?) * filesystems already specify __GFP_NOFS in their mapping's * gfp_mask, but let's be explicit here. */ unsigned int nofs = memalloc_nofs_save(); filemap_invalidate_lock_shared(mapping); /* * Preallocate as many pages as we will need. */ for (i = 0; i < nr_to_read; i++) { struct folio *folio = xa_load(&mapping->i_pages, index + i); if (folio && !xa_is_value(folio)) { /* * Page already present? Kick off the current batch * of contiguous pages before continuing with the * next batch. This page may be the one we would * have intended to mark as Readahead, but we don't * have a stable reference to this page, and it's * not worth getting one just for that. */ read_pages(ractl); ractl->_index++; i = ractl->_index + ractl->_nr_pages - index - 1; continue; } folio = filemap_alloc_folio(gfp_mask, 0); if (!folio) break; if (filemap_add_folio(mapping, folio, index + i, gfp_mask) < 0) { folio_put(folio); read_pages(ractl); ractl->_index++; i = ractl->_index + ractl->_nr_pages - index - 1; continue; } if (i == nr_to_read - lookahead_size) folio_set_readahead(folio); ractl->_workingset |= folio_test_workingset(folio); ractl->_nr_pages++; } /* * Now start the IO. We ignore I/O errors - if the folio is not * uptodate then the caller will launch read_folio again, and * will then handle the error. */ read_pages(ractl); filemap_invalidate_unlock_shared(mapping); memalloc_nofs_restore(nofs); } EXPORT_SYMBOL_GPL(page_cache_ra_unbounded); /* * do_page_cache_ra() actually reads a chunk of disk. It allocates * the pages first, then submits them for I/O. This avoids the very bad * behaviour which would occur if page allocations are causing VM writeback. * We really don't want to intermingle reads and writes like that. */ static void do_page_cache_ra(struct readahead_control *ractl, unsigned long nr_to_read, unsigned long lookahead_size) { struct inode *inode = ractl->mapping->host; unsigned long index = readahead_index(ractl); loff_t isize = i_size_read(inode); pgoff_t end_index; /* The last page we want to read */ if (isize == 0) return; end_index = (isize - 1) >> PAGE_SHIFT; if (index > end_index) return; /* Don't read past the page containing the last byte of the file */ if (nr_to_read > end_index - index) nr_to_read = end_index - index + 1; page_cache_ra_unbounded(ractl, nr_to_read, lookahead_size); } /* * Chunk the readahead into 2 megabyte units, so that we don't pin too much * memory at once. */ void force_page_cache_ra(struct readahead_control *ractl, unsigned long nr_to_read) { struct address_space *mapping = ractl->mapping; struct file_ra_state *ra = ractl->ra; struct backing_dev_info *bdi = inode_to_bdi(mapping->host); unsigned long max_pages, index; if (unlikely(!mapping->a_ops->read_folio && !mapping->a_ops->readahead)) return; /* * If the request exceeds the readahead window, allow the read to * be up to the optimal hardware IO size */ index = readahead_index(ractl); max_pages = max_t(unsigned long, bdi->io_pages, ra->ra_pages); nr_to_read = min_t(unsigned long, nr_to_read, max_pages); while (nr_to_read) { unsigned long this_chunk = (2 * 1024 * 1024) / PAGE_SIZE; if (this_chunk > nr_to_read) this_chunk = nr_to_read; ractl->_index = index; do_page_cache_ra(ractl, this_chunk, 0); index += this_chunk; nr_to_read -= this_chunk; } } /* * Set the initial window size, round to next power of 2 and square * for small size, x 4 for medium, and x 2 for large * for 128k (32 page) max ra * 1-2 page = 16k, 3-4 page 32k, 5-8 page = 64k, > 8 page = 128k initial */ static unsigned long get_init_ra_size(unsigned long size, unsigned long max) { unsigned long newsize = roundup_pow_of_two(size); if (newsize <= max / 32) newsize = newsize * 4; else if (newsize <= max / 4) newsize = newsize * 2; else newsize = max; return newsize; } /* * Get the previous window size, ramp it up, and * return it as the new window size. */ static unsigned long get_next_ra_size(struct file_ra_state *ra, unsigned long max) { unsigned long cur = ra->size; if (cur < max / 16) return 4 * cur; if (cur <= max / 2) return 2 * cur; return max; } /* * On-demand readahead design. * * The fields in struct file_ra_state represent the most-recently-executed * readahead attempt: * * |<----- async_size ---------| * |------------------- size -------------------->| * |==================#===========================| * ^start ^page marked with PG_readahead * * To overlap application thinking time and disk I/O time, we do * `readahead pipelining': Do not wait until the application consumed all * readahead pages and stalled on the missing page at readahead_index; * Instead, submit an asynchronous readahead I/O as soon as there are * only async_size pages left in the readahead window. Normally async_size * will be equal to size, for maximum pipelining. * * In interleaved sequential reads, concurrent streams on the same fd can * be invalidating each other's readahead state. So we flag the new readahead * page at (start+size-async_size) with PG_readahead, and use it as readahead * indicator. The flag won't be set on already cached pages, to avoid the * readahead-for-nothing fuss, saving pointless page cache lookups. * * prev_pos tracks the last visited byte in the _previous_ read request. * It should be maintained by the caller, and will be used for detecting * small random reads. Note that the readahead algorithm checks loosely * for sequential patterns. Hence interleaved reads might be served as * sequential ones. * * There is a special-case: if the first page which the application tries to * read happens to be the first page of the file, it is assumed that a linear * read is about to happen and the window is immediately set to the initial size * based on I/O request size and the max_readahead. * * The code ramps up the readahead size aggressively at first, but slow down as * it approaches max_readhead. */ /* * Count contiguously cached pages from @index-1 to @index-@max, * this count is a conservative estimation of * - length of the sequential read sequence, or * - thrashing threshold in memory tight systems */ static pgoff_t count_history_pages(struct address_space *mapping, pgoff_t index, unsigned long max) { pgoff_t head; rcu_read_lock(); head = page_cache_prev_miss(mapping, index - 1, max); rcu_read_unlock(); return index - 1 - head; } /* * page cache context based readahead */ static int try_context_readahead(struct address_space *mapping, struct file_ra_state *ra, pgoff_t index, unsigned long req_size, unsigned long max) { pgoff_t size; size = count_history_pages(mapping, index, max); /* * not enough history pages: * it could be a random read */ if (size <= req_size) return 0; /* * starts from beginning of file: * it is a strong indication of long-run stream (or whole-file-read) */ if (size >= index) size *= 2; ra->start = index; ra->size = min(size + req_size, max); ra->async_size = 1; return 1; } static inline int ra_alloc_folio(struct readahead_control *ractl, pgoff_t index, pgoff_t mark, unsigned int order, gfp_t gfp) { int err; struct folio *folio = filemap_alloc_folio(gfp, order); if (!folio) return -ENOMEM; mark = round_down(mark, 1UL << order); if (index == mark) folio_set_readahead(folio); err = filemap_add_folio(ractl->mapping, folio, index, gfp); if (err) { folio_put(folio); return err; } ractl->_nr_pages += 1UL << order; ractl->_workingset |= folio_test_workingset(folio); return 0; } void page_cache_ra_order(struct readahead_control *ractl, struct file_ra_state *ra, unsigned int new_order) { struct address_space *mapping = ractl->mapping; pgoff_t index = readahead_index(ractl); pgoff_t limit = (i_size_read(mapping->host) - 1) >> PAGE_SHIFT; pgoff_t mark = index + ra->size - ra->async_size; int err = 0; gfp_t gfp = readahead_gfp_mask(mapping); if (!mapping_large_folio_support(mapping) || ra->size < 4) goto fallback; limit = min(limit, index + ra->size - 1); if (new_order < MAX_PAGECACHE_ORDER) { new_order += 2; if (new_order > MAX_PAGECACHE_ORDER) new_order = MAX_PAGECACHE_ORDER; while ((1 << new_order) > ra->size) new_order--; } filemap_invalidate_lock_shared(mapping); while (index <= limit) { unsigned int order = new_order; /* Align with smaller pages if needed */ if (index & ((1UL << order) - 1)) order = __ffs(index); /* Don't allocate pages past EOF */ while (index + (1UL << order) - 1 > limit) order--; /* THP machinery does not support order-1 */ if (order == 1) order = 0; err = ra_alloc_folio(ractl, index, mark, order, gfp); if (err) break; index += 1UL << order; } if (index > limit) { ra->size += index - limit - 1; ra->async_size += index - limit - 1; } read_pages(ractl); filemap_invalidate_unlock_shared(mapping); /* * If there were already pages in the page cache, then we may have * left some gaps. Let the regular readahead code take care of this * situation. */ if (!err) return; fallback: do_page_cache_ra(ractl, ra->size, ra->async_size); } /* * A minimal readahead algorithm for trivial sequential/random reads. */ static void ondemand_readahead(struct readahead_control *ractl, struct folio *folio, unsigned long req_size) { struct backing_dev_info *bdi = inode_to_bdi(ractl->mapping->host); struct file_ra_state *ra = ractl->ra; unsigned long max_pages = ra->ra_pages; unsigned long add_pages; pgoff_t index = readahead_index(ractl); pgoff_t expected, prev_index; unsigned int order = folio ? folio_order(folio) : 0; /* * If the request exceeds the readahead window, allow the read to * be up to the optimal hardware IO size */ if (req_size > max_pages && bdi->io_pages > max_pages) max_pages = min(req_size, bdi->io_pages); /* * start of file */ if (!index) goto initial_readahead; /* * It's the expected callback index, assume sequential access. * Ramp up sizes, and push forward the readahead window. */ expected = round_down(ra->start + ra->size - ra->async_size, 1UL << order); if (index == expected || index == (ra->start + ra->size)) { ra->start += ra->size; ra->size = get_next_ra_size(ra, max_pages); ra->async_size = ra->size; goto readit; } /* * Hit a marked folio without valid readahead state. * E.g. interleaved reads. * Query the pagecache for async_size, which normally equals to * readahead size. Ramp it up and use it as the new readahead size. */ if (folio) { pgoff_t start; rcu_read_lock(); start = page_cache_next_miss(ractl->mapping, index + 1, max_pages); rcu_read_unlock(); if (!start || start - index > max_pages) return; ra->start = start; ra->size = start - index; /* old async_size */ ra->size += req_size; ra->size = get_next_ra_size(ra, max_pages); ra->async_size = ra->size; goto readit; } /* * oversize read */ if (req_size > max_pages) goto initial_readahead; /* * sequential cache miss * trivial case: (index - prev_index) == 1 * unaligned reads: (index - prev_index) == 0 */ prev_index = (unsigned long long)ra->prev_pos >> PAGE_SHIFT; if (index - prev_index <= 1UL) goto initial_readahead; /* * Query the page cache and look for the traces(cached history pages) * that a sequential stream would leave behind. */ if (try_context_readahead(ractl->mapping, ra, index, req_size, max_pages)) goto readit; /* * standalone, small random read * Read as is, and do not pollute the readahead state. */ do_page_cache_ra(ractl, req_size, 0); return; initial_readahead: ra->start = index; ra->size = get_init_ra_size(req_size, max_pages); ra->async_size = ra->size > req_size ? ra->size - req_size : ra->size; readit: /* * Will this read hit the readahead marker made by itself? * If so, trigger the readahead marker hit now, and merge * the resulted next readahead window into the current one. * Take care of maximum IO pages as above. */ if (index == ra->start && ra->size == ra->async_size) { add_pages = get_next_ra_size(ra, max_pages); if (ra->size + add_pages <= max_pages) { ra->async_size = add_pages; ra->size += add_pages; } else { ra->size = max_pages; ra->async_size = max_pages >> 1; } } ractl->_index = ra->start; page_cache_ra_order(ractl, ra, order); } void page_cache_sync_ra(struct readahead_control *ractl, unsigned long req_count) { bool do_forced_ra = ractl->file && (ractl->file->f_mode & FMODE_RANDOM); /* * Even if readahead is disabled, issue this request as readahead * as we'll need it to satisfy the requested range. The forced * readahead will do the right thing and limit the read to just the * requested range, which we'll set to 1 page for this case. */ if (!ractl->ra->ra_pages || blk_cgroup_congested()) { if (!ractl->file) return; req_count = 1; do_forced_ra = true; } /* be dumb */ if (do_forced_ra) { force_page_cache_ra(ractl, req_count); return; } ondemand_readahead(ractl, NULL, req_count); } EXPORT_SYMBOL_GPL(page_cache_sync_ra); void page_cache_async_ra(struct readahead_control *ractl, struct folio *folio, unsigned long req_count) { /* no readahead */ if (!ractl->ra->ra_pages) return; /* * Same bit is used for PG_readahead and PG_reclaim. */ if (folio_test_writeback(folio)) return; folio_clear_readahead(folio); if (blk_cgroup_congested()) return; ondemand_readahead(ractl, folio, req_count); } EXPORT_SYMBOL_GPL(page_cache_async_ra); ssize_t ksys_readahead(int fd, loff_t offset, size_t count) { ssize_t ret; struct fd f; ret = -EBADF; f = fdget(fd); if (!f.file || !(f.file->f_mode & FMODE_READ)) goto out; /* * The readahead() syscall is intended to run only on files * that can execute readahead. If readahead is not possible * on this file, then we must return -EINVAL. */ ret = -EINVAL; if (!f.file->f_mapping || !f.file->f_mapping->a_ops || (!S_ISREG(file_inode(f.file)->i_mode) && !S_ISBLK(file_inode(f.file)->i_mode))) goto out; ret = vfs_fadvise(f.file, offset, count, POSIX_FADV_WILLNEED); out: fdput(f); return ret; } SYSCALL_DEFINE3(readahead, int, fd, loff_t, offset, size_t, count) { return ksys_readahead(fd, offset, count); } #if defined(CONFIG_COMPAT) && defined(__ARCH_WANT_COMPAT_READAHEAD) COMPAT_SYSCALL_DEFINE4(readahead, int, fd, compat_arg_u64_dual(offset), size_t, count) { return ksys_readahead(fd, compat_arg_u64_glue(offset), count); } #endif /** * readahead_expand - Expand a readahead request * @ractl: The request to be expanded * @new_start: The revised start * @new_len: The revised size of the request * * Attempt to expand a readahead request outwards from the current size to the * specified size by inserting locked pages before and after the current window * to increase the size to the new window. This may involve the insertion of * THPs, in which case the window may get expanded even beyond what was * requested. * * The algorithm will stop if it encounters a conflicting page already in the * pagecache and leave a smaller expansion than requested. * * The caller must check for this by examining the revised @ractl object for a * different expansion than was requested. */ void readahead_expand(struct readahead_control *ractl, loff_t new_start, size_t new_len) { struct address_space *mapping = ractl->mapping; struct file_ra_state *ra = ractl->ra; pgoff_t new_index, new_nr_pages; gfp_t gfp_mask = readahead_gfp_mask(mapping); new_index = new_start / PAGE_SIZE; /* Expand the leading edge downwards */ while (ractl->_index > new_index) { unsigned long index = ractl->_index - 1; struct folio *folio = xa_load(&mapping->i_pages, index); if (folio && !xa_is_value(folio)) return; /* Folio apparently present */ folio = filemap_alloc_folio(gfp_mask, 0); if (!folio) return; if (filemap_add_folio(mapping, folio, index, gfp_mask) < 0) { folio_put(folio); return; } if (unlikely(folio_test_workingset(folio)) && !ractl->_workingset) { ractl->_workingset = true; psi_memstall_enter(&ractl->_pflags); } ractl->_nr_pages++; ractl->_index = folio->index; } new_len += new_start - readahead_pos(ractl); new_nr_pages = DIV_ROUND_UP(new_len, PAGE_SIZE); /* Expand the trailing edge upwards */ while (ractl->_nr_pages < new_nr_pages) { unsigned long index = ractl->_index + ractl->_nr_pages; struct folio *folio = xa_load(&mapping->i_pages, index); if (folio && !xa_is_value(folio)) return; /* Folio apparently present */ folio = filemap_alloc_folio(gfp_mask, 0); if (!folio) return; if (filemap_add_folio(mapping, folio, index, gfp_mask) < 0) { folio_put(folio); return; } if (unlikely(folio_test_workingset(folio)) && !ractl->_workingset) { ractl->_workingset = true; psi_memstall_enter(&ractl->_pflags); } ractl->_nr_pages++; if (ra) { ra->size++; ra->async_size++; } } } EXPORT_SYMBOL(readahead_expand); |
11 11 6 6 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 | /* SPDX-License-Identifier: GPL-2.0 */ #include <linux/ceph/ceph_debug.h> #include <linux/types.h> #include <linux/percpu_counter.h> #include <linux/math64.h> #include "metric.h" #include "mds_client.h" static void ktime_to_ceph_timespec(struct ceph_timespec *ts, ktime_t val) { struct timespec64 t = ktime_to_timespec64(val); ceph_encode_timespec64(ts, &t); } static bool ceph_mdsc_send_metrics(struct ceph_mds_client *mdsc, struct ceph_mds_session *s) { struct ceph_metric_head *head; struct ceph_metric_cap *cap; struct ceph_metric_read_latency *read; struct ceph_metric_write_latency *write; struct ceph_metric_metadata_latency *meta; struct ceph_metric_dlease *dlease; struct ceph_opened_files *files; struct ceph_pinned_icaps *icaps; struct ceph_opened_inodes *inodes; struct ceph_read_io_size *rsize; struct ceph_write_io_size *wsize; struct ceph_client_metric *m = &mdsc->metric; u64 nr_caps = atomic64_read(&m->total_caps); u32 header_len = sizeof(struct ceph_metric_header); struct ceph_client *cl = mdsc->fsc->client; struct ceph_msg *msg; s64 sum; s32 items = 0; s32 len; /* Do not send the metrics until the MDS rank is ready */ mutex_lock(&mdsc->mutex); if (ceph_mdsmap_get_state(mdsc->mdsmap, s->s_mds) != CEPH_MDS_STATE_ACTIVE) { mutex_unlock(&mdsc->mutex); return false; } mutex_unlock(&mdsc->mutex); len = sizeof(*head) + sizeof(*cap) + sizeof(*read) + sizeof(*write) + sizeof(*meta) + sizeof(*dlease) + sizeof(*files) + sizeof(*icaps) + sizeof(*inodes) + sizeof(*rsize) + sizeof(*wsize); msg = ceph_msg_new(CEPH_MSG_CLIENT_METRICS, len, GFP_NOFS, true); if (!msg) { pr_err_client(cl, "to mds%d, failed to allocate message\n", s->s_mds); return false; } head = msg->front.iov_base; /* encode the cap metric */ cap = (struct ceph_metric_cap *)(head + 1); cap->header.type = cpu_to_le32(CLIENT_METRIC_TYPE_CAP_INFO); cap->header.ver = 1; cap->header.compat = 1; cap->header.data_len = cpu_to_le32(sizeof(*cap) - header_len); cap->hit = cpu_to_le64(percpu_counter_sum(&m->i_caps_hit)); cap->mis = cpu_to_le64(percpu_counter_sum(&m->i_caps_mis)); cap->total = cpu_to_le64(nr_caps); items++; /* encode the read latency metric */ read = (struct ceph_metric_read_latency *)(cap + 1); read->header.type = cpu_to_le32(CLIENT_METRIC_TYPE_READ_LATENCY); read->header.ver = 2; read->header.compat = 1; read->header.data_len = cpu_to_le32(sizeof(*read) - header_len); sum = m->metric[METRIC_READ].latency_sum; ktime_to_ceph_timespec(&read->lat, sum); ktime_to_ceph_timespec(&read->avg, m->metric[METRIC_READ].latency_avg); read->sq_sum = cpu_to_le64(m->metric[METRIC_READ].latency_sq_sum); read->count = cpu_to_le64(m->metric[METRIC_READ].total); items++; /* encode the write latency metric */ write = (struct ceph_metric_write_latency *)(read + 1); write->header.type = cpu_to_le32(CLIENT_METRIC_TYPE_WRITE_LATENCY); write->header.ver = 2; write->header.compat = 1; write->header.data_len = cpu_to_le32(sizeof(*write) - header_len); sum = m->metric[METRIC_WRITE].latency_sum; ktime_to_ceph_timespec(&write->lat, sum); ktime_to_ceph_timespec(&write->avg, m->metric[METRIC_WRITE].latency_avg); write->sq_sum = cpu_to_le64(m->metric[METRIC_WRITE].latency_sq_sum); write->count = cpu_to_le64(m->metric[METRIC_WRITE].total); items++; /* encode the metadata latency metric */ meta = (struct ceph_metric_metadata_latency *)(write + 1); meta->header.type = cpu_to_le32(CLIENT_METRIC_TYPE_METADATA_LATENCY); meta->header.ver = 2; meta->header.compat = 1; meta->header.data_len = cpu_to_le32(sizeof(*meta) - header_len); sum = m->metric[METRIC_METADATA].latency_sum; ktime_to_ceph_timespec(&meta->lat, sum); ktime_to_ceph_timespec(&meta->avg, m->metric[METRIC_METADATA].latency_avg); meta->sq_sum = cpu_to_le64(m->metric[METRIC_METADATA].latency_sq_sum); meta->count = cpu_to_le64(m->metric[METRIC_METADATA].total); items++; /* encode the dentry lease metric */ dlease = (struct ceph_metric_dlease *)(meta + 1); dlease->header.type = cpu_to_le32(CLIENT_METRIC_TYPE_DENTRY_LEASE); dlease->header.ver = 1; dlease->header.compat = 1; dlease->header.data_len = cpu_to_le32(sizeof(*dlease) - header_len); dlease->hit = cpu_to_le64(percpu_counter_sum(&m->d_lease_hit)); dlease->mis = cpu_to_le64(percpu_counter_sum(&m->d_lease_mis)); dlease->total = cpu_to_le64(atomic64_read(&m->total_dentries)); items++; sum = percpu_counter_sum(&m->total_inodes); /* encode the opened files metric */ files = (struct ceph_opened_files *)(dlease + 1); files->header.type = cpu_to_le32(CLIENT_METRIC_TYPE_OPENED_FILES); files->header.ver = 1; files->header.compat = 1; files->header.data_len = cpu_to_le32(sizeof(*files) - header_len); files->opened_files = cpu_to_le64(atomic64_read(&m->opened_files)); files->total = cpu_to_le64(sum); items++; /* encode the pinned icaps metric */ icaps = (struct ceph_pinned_icaps *)(files + 1); icaps->header.type = cpu_to_le32(CLIENT_METRIC_TYPE_PINNED_ICAPS); icaps->header.ver = 1; icaps->header.compat = 1; icaps->header.data_len = cpu_to_le32(sizeof(*icaps) - header_len); icaps->pinned_icaps = cpu_to_le64(nr_caps); icaps->total = cpu_to_le64(sum); items++; /* encode the opened inodes metric */ inodes = (struct ceph_opened_inodes *)(icaps + 1); inodes->header.type = cpu_to_le32(CLIENT_METRIC_TYPE_OPENED_INODES); inodes->header.ver = 1; inodes->header.compat = 1; inodes->header.data_len = cpu_to_le32(sizeof(*inodes) - header_len); inodes->opened_inodes = cpu_to_le64(percpu_counter_sum(&m->opened_inodes)); inodes->total = cpu_to_le64(sum); items++; /* encode the read io size metric */ rsize = (struct ceph_read_io_size *)(inodes + 1); rsize->header.type = cpu_to_le32(CLIENT_METRIC_TYPE_READ_IO_SIZES); rsize->header.ver = 1; rsize->header.compat = 1; rsize->header.data_len = cpu_to_le32(sizeof(*rsize) - header_len); rsize->total_ops = cpu_to_le64(m->metric[METRIC_READ].total); rsize->total_size = cpu_to_le64(m->metric[METRIC_READ].size_sum); items++; /* encode the write io size metric */ wsize = (struct ceph_write_io_size *)(rsize + 1); wsize->header.type = cpu_to_le32(CLIENT_METRIC_TYPE_WRITE_IO_SIZES); wsize->header.ver = 1; wsize->header.compat = 1; wsize->header.data_len = cpu_to_le32(sizeof(*wsize) - header_len); wsize->total_ops = cpu_to_le64(m->metric[METRIC_WRITE].total); wsize->total_size = cpu_to_le64(m->metric[METRIC_WRITE].size_sum); items++; put_unaligned_le32(items, &head->num); msg->front.iov_len = len; msg->hdr.version = cpu_to_le16(1); msg->hdr.compat_version = cpu_to_le16(1); msg->hdr.front_len = cpu_to_le32(msg->front.iov_len); ceph_con_send(&s->s_con, msg); return true; } static void metric_get_session(struct ceph_mds_client *mdsc) { struct ceph_mds_session *s; int i; mutex_lock(&mdsc->mutex); for (i = 0; i < mdsc->max_sessions; i++) { s = __ceph_lookup_mds_session(mdsc, i); if (!s) continue; /* * Skip it if MDS doesn't support the metric collection, * or the MDS will close the session's socket connection * directly when it get this message. */ if (check_session_state(s) && test_bit(CEPHFS_FEATURE_METRIC_COLLECT, &s->s_features)) { mdsc->metric.session = s; break; } ceph_put_mds_session(s); } mutex_unlock(&mdsc->mutex); } static void metric_delayed_work(struct work_struct *work) { struct ceph_client_metric *m = container_of(work, struct ceph_client_metric, delayed_work.work); struct ceph_mds_client *mdsc = container_of(m, struct ceph_mds_client, metric); if (mdsc->stopping || disable_send_metrics) return; if (!m->session || !check_session_state(m->session)) { if (m->session) { ceph_put_mds_session(m->session); m->session = NULL; } metric_get_session(mdsc); } if (m->session) { ceph_mdsc_send_metrics(mdsc, m->session); metric_schedule_delayed(m); } } int ceph_metric_init(struct ceph_client_metric *m) { struct ceph_metric *metric; int ret, i; if (!m) return -EINVAL; atomic64_set(&m->total_dentries, 0); ret = percpu_counter_init(&m->d_lease_hit, 0, GFP_KERNEL); if (ret) return ret; ret = percpu_counter_init(&m->d_lease_mis, 0, GFP_KERNEL); if (ret) goto err_d_lease_mis; atomic64_set(&m->total_caps, 0); ret = percpu_counter_init(&m->i_caps_hit, 0, GFP_KERNEL); if (ret) goto err_i_caps_hit; ret = percpu_counter_init(&m->i_caps_mis, 0, GFP_KERNEL); if (ret) goto err_i_caps_mis; for (i = 0; i < METRIC_MAX; i++) { metric = &m->metric[i]; spin_lock_init(&metric->lock); metric->size_sum = 0; metric->size_min = U64_MAX; metric->size_max = 0; metric->total = 0; metric->latency_sum = 0; metric->latency_avg = 0; metric->latency_sq_sum = 0; metric->latency_min = KTIME_MAX; metric->latency_max = 0; } atomic64_set(&m->opened_files, 0); ret = percpu_counter_init(&m->opened_inodes, 0, GFP_KERNEL); if (ret) goto err_opened_inodes; ret = percpu_counter_init(&m->total_inodes, 0, GFP_KERNEL); if (ret) goto err_total_inodes; m->session = NULL; INIT_DELAYED_WORK(&m->delayed_work, metric_delayed_work); return 0; err_total_inodes: percpu_counter_destroy(&m->opened_inodes); err_opened_inodes: percpu_counter_destroy(&m->i_caps_mis); err_i_caps_mis: percpu_counter_destroy(&m->i_caps_hit); err_i_caps_hit: percpu_counter_destroy(&m->d_lease_mis); err_d_lease_mis: percpu_counter_destroy(&m->d_lease_hit); return ret; } void ceph_metric_destroy(struct ceph_client_metric *m) { if (!m) return; cancel_delayed_work_sync(&m->delayed_work); percpu_counter_destroy(&m->total_inodes); percpu_counter_destroy(&m->opened_inodes); percpu_counter_destroy(&m->i_caps_mis); percpu_counter_destroy(&m->i_caps_hit); percpu_counter_destroy(&m->d_lease_mis); percpu_counter_destroy(&m->d_lease_hit); ceph_put_mds_session(m->session); } #define METRIC_UPDATE_MIN_MAX(min, max, new) \ { \ if (unlikely(new < min)) \ min = new; \ if (unlikely(new > max)) \ max = new; \ } static inline void __update_mean_and_stdev(ktime_t total, ktime_t *lavg, ktime_t *sq_sump, ktime_t lat) { ktime_t avg; if (unlikely(total == 1)) { *lavg = lat; } else { /* the sq is (lat - old_avg) * (lat - new_avg) */ avg = *lavg + div64_s64(lat - *lavg, total); *sq_sump += (lat - *lavg)*(lat - avg); *lavg = avg; } } void ceph_update_metrics(struct ceph_metric *m, ktime_t r_start, ktime_t r_end, unsigned int size, int rc) { ktime_t lat = ktime_sub(r_end, r_start); ktime_t total; if (unlikely(rc < 0 && rc != -ENOENT && rc != -ETIMEDOUT)) return; spin_lock(&m->lock); total = ++m->total; m->size_sum += size; METRIC_UPDATE_MIN_MAX(m->size_min, m->size_max, size); m->latency_sum += lat; METRIC_UPDATE_MIN_MAX(m->latency_min, m->latency_max, lat); __update_mean_and_stdev(total, &m->latency_avg, &m->latency_sq_sum, lat); spin_unlock(&m->lock); } |
744 3136 3131 3136 3134 3139 3139 3138 2510 244 740 740 111 3139 3140 3135 3062 352 352 3140 3063 351 3139 3138 3140 3138 749 2650 2648 2650 781 782 747 747 748 400 749 399 748 747 747 400 748 746 748 749 744 749 741 344 741 14 129 401 544 542 542 542 543 544 400 358 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 | // SPDX-License-Identifier: GPL-2.0-only #define pr_fmt(fmt) "SMP alternatives: " fmt #include <linux/module.h> #include <linux/sched.h> #include <linux/perf_event.h> #include <linux/mutex.h> #include <linux/list.h> #include <linux/stringify.h> #include <linux/highmem.h> #include <linux/mm.h> #include <linux/vmalloc.h> #include <linux/memory.h> #include <linux/stop_machine.h> #include <linux/slab.h> #include <linux/kdebug.h> #include <linux/kprobes.h> #include <linux/mmu_context.h> #include <linux/bsearch.h> #include <linux/sync_core.h> #include <asm/text-patching.h> #include <asm/alternative.h> #include <asm/sections.h> #include <asm/mce.h> #include <asm/nmi.h> #include <asm/cacheflush.h> #include <asm/tlbflush.h> #include <asm/insn.h> #include <asm/io.h> #include <asm/fixmap.h> #include <asm/paravirt.h> #include <asm/asm-prototypes.h> #include <asm/cfi.h> int __read_mostly alternatives_patched; EXPORT_SYMBOL_GPL(alternatives_patched); #define MAX_PATCH_LEN (255-1) #define DA_ALL (~0) #define DA_ALT 0x01 #define DA_RET 0x02 #define DA_RETPOLINE 0x04 #define DA_ENDBR 0x08 #define DA_SMP 0x10 static unsigned int __initdata_or_module debug_alternative; static int __init debug_alt(char *str) { if (str && *str == '=') str++; if (!str || kstrtouint(str, 0, &debug_alternative)) debug_alternative = DA_ALL; return 1; } __setup("debug-alternative", debug_alt); static int noreplace_smp; static int __init setup_noreplace_smp(char *str) { noreplace_smp = 1; return 1; } __setup("noreplace-smp", setup_noreplace_smp); #define DPRINTK(type, fmt, args...) \ do { \ if (debug_alternative & DA_##type) \ printk(KERN_DEBUG pr_fmt(fmt) "\n", ##args); \ } while (0) #define DUMP_BYTES(type, buf, len, fmt, args...) \ do { \ if (unlikely(debug_alternative & DA_##type)) { \ int j; \ \ if (!(len)) \ break; \ \ printk(KERN_DEBUG pr_fmt(fmt), ##args); \ for (j = 0; j < (len) - 1; j++) \ printk(KERN_CONT "%02hhx ", buf[j]); \ printk(KERN_CONT "%02hhx\n", buf[j]); \ } \ } while (0) static const unsigned char x86nops[] = { BYTES_NOP1, BYTES_NOP2, BYTES_NOP3, BYTES_NOP4, BYTES_NOP5, BYTES_NOP6, BYTES_NOP7, BYTES_NOP8, #ifdef CONFIG_64BIT BYTES_NOP9, BYTES_NOP10, BYTES_NOP11, #endif }; const unsigned char * const x86_nops[ASM_NOP_MAX+1] = { NULL, x86nops, x86nops + 1, x86nops + 1 + 2, x86nops + 1 + 2 + 3, x86nops + 1 + 2 + 3 + 4, x86nops + 1 + 2 + 3 + 4 + 5, x86nops + 1 + 2 + 3 + 4 + 5 + 6, x86nops + 1 + 2 + 3 + 4 + 5 + 6 + 7, #ifdef CONFIG_64BIT x86nops + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8, x86nops + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8 + 9, x86nops + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8 + 9 + 10, #endif }; /* * Fill the buffer with a single effective instruction of size @len. * * In order not to issue an ORC stack depth tracking CFI entry (Call Frame Info) * for every single-byte NOP, try to generate the maximally available NOP of * size <= ASM_NOP_MAX such that only a single CFI entry is generated (vs one for * each single-byte NOPs). If @len to fill out is > ASM_NOP_MAX, pad with INT3 and * *jump* over instead of executing long and daft NOPs. */ static void __init_or_module add_nop(u8 *instr, unsigned int len) { u8 *target = instr + len; if (!len) return; if (len <= ASM_NOP_MAX) { memcpy(instr, x86_nops[len], len); return; } if (len < 128) { __text_gen_insn(instr, JMP8_INSN_OPCODE, instr, target, JMP8_INSN_SIZE); instr += JMP8_INSN_SIZE; } else { __text_gen_insn(instr, JMP32_INSN_OPCODE, instr, target, JMP32_INSN_SIZE); instr += JMP32_INSN_SIZE; } for (;instr < target; instr++) *instr = INT3_INSN_OPCODE; } extern s32 __retpoline_sites[], __retpoline_sites_end[]; extern s32 __return_sites[], __return_sites_end[]; extern s32 __cfi_sites[], __cfi_sites_end[]; extern s32 __ibt_endbr_seal[], __ibt_endbr_seal_end[]; extern s32 __smp_locks[], __smp_locks_end[]; void text_poke_early(void *addr, const void *opcode, size_t len); /* * Matches NOP and NOPL, not any of the other possible NOPs. */ static bool insn_is_nop(struct insn *insn) { /* Anything NOP, but no REP NOP */ if (insn->opcode.bytes[0] == 0x90 && (!insn->prefixes.nbytes || insn->prefixes.bytes[0] != 0xF3)) return true; /* NOPL */ if (insn->opcode.bytes[0] == 0x0F && insn->opcode.bytes[1] == 0x1F) return true; /* TODO: more nops */ return false; } /* * Find the offset of the first non-NOP instruction starting at @offset * but no further than @len. */ static int skip_nops(u8 *instr, int offset, int len) { struct insn insn; for (; offset < len; offset += insn.length) { if (insn_decode_kernel(&insn, &instr[offset])) break; if (!insn_is_nop(&insn)) break; } return offset; } /* * Optimize a sequence of NOPs, possibly preceded by an unconditional jump * to the end of the NOP sequence into a single NOP. */ static bool __init_or_module __optimize_nops(u8 *instr, size_t len, struct insn *insn, int *next, int *prev, int *target) { int i = *next - insn->length; switch (insn->opcode.bytes[0]) { case JMP8_INSN_OPCODE: case JMP32_INSN_OPCODE: *prev = i; *target = *next + insn->immediate.value; return false; } if (insn_is_nop(insn)) { int nop = i; *next = skip_nops(instr, *next, len); if (*target && *next == *target) nop = *prev; add_nop(instr + nop, *next - nop); DUMP_BYTES(ALT, instr, len, "%px: [%d:%d) optimized NOPs: ", instr, nop, *next); return true; } *target = 0; return false; } /* * "noinline" to cause control flow change and thus invalidate I$ and * cause refetch after modification. */ static void __init_or_module noinline optimize_nops(u8 *instr, size_t len) { int prev, target = 0; for (int next, i = 0; i < len; i = next) { struct insn insn; if (insn_decode_kernel(&insn, &instr[i])) return; next = i + insn.length; __optimize_nops(instr, len, &insn, &next, &prev, &target); } } static void __init_or_module noinline optimize_nops_inplace(u8 *instr, size_t len) { unsigned long flags; local_irq_save(flags); optimize_nops(instr, len); sync_core(); local_irq_restore(flags); } /* * In this context, "source" is where the instructions are placed in the * section .altinstr_replacement, for example during kernel build by the * toolchain. * "Destination" is where the instructions are being patched in by this * machinery. * * The source offset is: * * src_imm = target - src_next_ip (1) * * and the target offset is: * * dst_imm = target - dst_next_ip (2) * * so rework (1) as an expression for target like: * * target = src_imm + src_next_ip (1a) * * and substitute in (2) to get: * * dst_imm = (src_imm + src_next_ip) - dst_next_ip (3) * * Now, since the instruction stream is 'identical' at src and dst (it * is being copied after all) it can be stated that: * * src_next_ip = src + ip_offset * dst_next_ip = dst + ip_offset (4) * * Substitute (4) in (3) and observe ip_offset being cancelled out to * obtain: * * dst_imm = src_imm + (src + ip_offset) - (dst + ip_offset) * = src_imm + src - dst + ip_offset - ip_offset * = src_imm + src - dst (5) * * IOW, only the relative displacement of the code block matters. */ #define apply_reloc_n(n_, p_, d_) \ do { \ s32 v = *(s##n_ *)(p_); \ v += (d_); \ BUG_ON((v >> 31) != (v >> (n_-1))); \ *(s##n_ *)(p_) = (s##n_)v; \ } while (0) static __always_inline void apply_reloc(int n, void *ptr, uintptr_t diff) { switch (n) { case 1: apply_reloc_n(8, ptr, diff); break; case 2: apply_reloc_n(16, ptr, diff); break; case 4: apply_reloc_n(32, ptr, diff); break; default: BUG(); } } static __always_inline bool need_reloc(unsigned long offset, u8 *src, size_t src_len) { u8 *target = src + offset; /* * If the target is inside the patched block, it's relative to the * block itself and does not need relocation. */ return (target < src || target > src + src_len); } static void __init_or_module noinline apply_relocation(u8 *buf, size_t len, u8 *dest, u8 *src, size_t src_len) { int prev, target = 0; for (int next, i = 0; i < len; i = next) { struct insn insn; if (WARN_ON_ONCE(insn_decode_kernel(&insn, &buf[i]))) return; next = i + insn.length; if (__optimize_nops(buf, len, &insn, &next, &prev, &target)) continue; switch (insn.opcode.bytes[0]) { case 0x0f: if (insn.opcode.bytes[1] < 0x80 || insn.opcode.bytes[1] > 0x8f) break; fallthrough; /* Jcc.d32 */ case 0x70 ... 0x7f: /* Jcc.d8 */ case JMP8_INSN_OPCODE: case JMP32_INSN_OPCODE: case CALL_INSN_OPCODE: if (need_reloc(next + insn.immediate.value, src, src_len)) { apply_reloc(insn.immediate.nbytes, buf + i + insn_offset_immediate(&insn), src - dest); } /* * Where possible, convert JMP.d32 into JMP.d8. */ if (insn.opcode.bytes[0] == JMP32_INSN_OPCODE) { s32 imm = insn.immediate.value; imm += src - dest; imm += JMP32_INSN_SIZE - JMP8_INSN_SIZE; if ((imm >> 31) == (imm >> 7)) { buf[i+0] = JMP8_INSN_OPCODE; buf[i+1] = (s8)imm; memset(&buf[i+2], INT3_INSN_OPCODE, insn.length - 2); } } break; } if (insn_rip_relative(&insn)) { if (need_reloc(next + insn.displacement.value, src, src_len)) { apply_reloc(insn.displacement.nbytes, buf + i + insn_offset_displacement(&insn), src - dest); } } } } /* Low-level backend functions usable from alternative code replacements. */ DEFINE_ASM_FUNC(nop_func, "", .entry.text); EXPORT_SYMBOL_GPL(nop_func); noinstr void BUG_func(void) { BUG(); } EXPORT_SYMBOL(BUG_func); #define CALL_RIP_REL_OPCODE 0xff #define CALL_RIP_REL_MODRM 0x15 /* * Rewrite the "call BUG_func" replacement to point to the target of the * indirect pv_ops call "call *disp(%ip)". */ static int alt_replace_call(u8 *instr, u8 *insn_buff, struct alt_instr *a) { void *target, *bug = &BUG_func; s32 disp; if (a->replacementlen != 5 || insn_buff[0] != CALL_INSN_OPCODE) { pr_err("ALT_FLAG_DIRECT_CALL set for a non-call replacement instruction\n"); BUG(); } if (a->instrlen != 6 || instr[0] != CALL_RIP_REL_OPCODE || instr[1] != CALL_RIP_REL_MODRM) { pr_err("ALT_FLAG_DIRECT_CALL set for unrecognized indirect call\n"); BUG(); } /* Skip CALL_RIP_REL_OPCODE and CALL_RIP_REL_MODRM */ disp = *(s32 *)(instr + 2); #ifdef CONFIG_X86_64 /* ff 15 00 00 00 00 call *0x0(%rip) */ /* target address is stored at "next instruction + disp". */ target = *(void **)(instr + a->instrlen + disp); #else /* ff 15 00 00 00 00 call *0x0 */ /* target address is stored at disp. */ target = *(void **)disp; #endif if (!target) target = bug; /* (BUG_func - .) + (target - BUG_func) := target - . */ *(s32 *)(insn_buff + 1) += target - bug; if (target == &nop_func) return 0; return 5; } /* * Replace instructions with better alternatives for this CPU type. This runs * before SMP is initialized to avoid SMP problems with self modifying code. * This implies that asymmetric systems where APs have less capabilities than * the boot processor are not handled. Tough. Make sure you disable such * features by hand. * * Marked "noinline" to cause control flow change and thus insn cache * to refetch changed I$ lines. */ void __init_or_module noinline apply_alternatives(struct alt_instr *start, struct alt_instr *end) { struct alt_instr *a; u8 *instr, *replacement; u8 insn_buff[MAX_PATCH_LEN]; DPRINTK(ALT, "alt table %px, -> %px", start, end); /* * In the case CONFIG_X86_5LEVEL=y, KASAN_SHADOW_START is defined using * cpu_feature_enabled(X86_FEATURE_LA57) and is therefore patched here. * During the process, KASAN becomes confused seeing partial LA57 * conversion and triggers a false-positive out-of-bound report. * * Disable KASAN until the patching is complete. */ kasan_disable_current(); /* * The scan order should be from start to end. A later scanned * alternative code can overwrite previously scanned alternative code. * Some kernel functions (e.g. memcpy, memset, etc) use this order to * patch code. * * So be careful if you want to change the scan order to any other * order. */ for (a = start; a < end; a++) { int insn_buff_sz = 0; instr = (u8 *)&a->instr_offset + a->instr_offset; replacement = (u8 *)&a->repl_offset + a->repl_offset; BUG_ON(a->instrlen > sizeof(insn_buff)); BUG_ON(a->cpuid >= (NCAPINTS + NBUGINTS) * 32); /* * Patch if either: * - feature is present * - feature not present but ALT_FLAG_NOT is set to mean, * patch if feature is *NOT* present. */ if (!boot_cpu_has(a->cpuid) == !(a->flags & ALT_FLAG_NOT)) { optimize_nops_inplace(instr, a->instrlen); continue; } DPRINTK(ALT, "feat: %d*32+%d, old: (%pS (%px) len: %d), repl: (%px, len: %d) flags: 0x%x", a->cpuid >> 5, a->cpuid & 0x1f, instr, instr, a->instrlen, replacement, a->replacementlen, a->flags); memcpy(insn_buff, replacement, a->replacementlen); insn_buff_sz = a->replacementlen; if (a->flags & ALT_FLAG_DIRECT_CALL) { insn_buff_sz = alt_replace_call(instr, insn_buff, a); if (insn_buff_sz < 0) continue; } for (; insn_buff_sz < a->instrlen; insn_buff_sz++) insn_buff[insn_buff_sz] = 0x90; apply_relocation(insn_buff, a->instrlen, instr, replacement, a->replacementlen); DUMP_BYTES(ALT, instr, a->instrlen, "%px: old_insn: ", instr); DUMP_BYTES(ALT, replacement, a->replacementlen, "%px: rpl_insn: ", replacement); DUMP_BYTES(ALT, insn_buff, insn_buff_sz, "%px: final_insn: ", instr); text_poke_early(instr, insn_buff, insn_buff_sz); } kasan_enable_current(); } static inline bool is_jcc32(struct insn *insn) { /* Jcc.d32 second opcode byte is in the range: 0x80-0x8f */ return insn->opcode.bytes[0] == 0x0f && (insn->opcode.bytes[1] & 0xf0) == 0x80; } #if defined(CONFIG_RETPOLINE) && defined(CONFIG_OBJTOOL) /* * CALL/JMP *%\reg */ static int emit_indirect(int op, int reg, u8 *bytes) { int i = 0; u8 modrm; switch (op) { case CALL_INSN_OPCODE: modrm = 0x10; /* Reg = 2; CALL r/m */ break; case JMP32_INSN_OPCODE: modrm = 0x20; /* Reg = 4; JMP r/m */ break; default: WARN_ON_ONCE(1); return -1; } if (reg >= 8) { bytes[i++] = 0x41; /* REX.B prefix */ reg -= 8; } modrm |= 0xc0; /* Mod = 3 */ modrm += reg; bytes[i++] = 0xff; /* opcode */ bytes[i++] = modrm; return i; } static int emit_call_track_retpoline(void *addr, struct insn *insn, int reg, u8 *bytes) { u8 op = insn->opcode.bytes[0]; int i = 0; /* * Clang does 'weird' Jcc __x86_indirect_thunk_r11 conditional * tail-calls. Deal with them. */ if (is_jcc32(insn)) { bytes[i++] = op; op = insn->opcode.bytes[1]; goto clang_jcc; } if (insn->length == 6) bytes[i++] = 0x2e; /* CS-prefix */ switch (op) { case CALL_INSN_OPCODE: __text_gen_insn(bytes+i, op, addr+i, __x86_indirect_call_thunk_array[reg], CALL_INSN_SIZE); i += CALL_INSN_SIZE; break; case JMP32_INSN_OPCODE: clang_jcc: __text_gen_insn(bytes+i, op, addr+i, __x86_indirect_jump_thunk_array[reg], JMP32_INSN_SIZE); i += JMP32_INSN_SIZE; break; default: WARN(1, "%pS %px %*ph\n", addr, addr, 6, addr); return -1; } WARN_ON_ONCE(i != insn->length); return i; } /* * Rewrite the compiler generated retpoline thunk calls. * * For spectre_v2=off (!X86_FEATURE_RETPOLINE), rewrite them into immediate * indirect instructions, avoiding the extra indirection. * * For example, convert: * * CALL __x86_indirect_thunk_\reg * * into: * * CALL *%\reg * * It also tries to inline spectre_v2=retpoline,lfence when size permits. */ static int patch_retpoline(void *addr, struct insn *insn, u8 *bytes) { retpoline_thunk_t *target; int reg, ret, i = 0; u8 op, cc; target = addr + insn->length + insn->immediate.value; reg = target - __x86_indirect_thunk_array; if (WARN_ON_ONCE(reg & ~0xf)) return -1; /* If anyone ever does: CALL/JMP *%rsp, we're in deep trouble. */ BUG_ON(reg == 4); if (cpu_feature_enabled(X86_FEATURE_RETPOLINE) && !cpu_feature_enabled(X86_FEATURE_RETPOLINE_LFENCE)) { if (cpu_feature_enabled(X86_FEATURE_CALL_DEPTH)) return emit_call_track_retpoline(addr, insn, reg, bytes); return -1; } op = insn->opcode.bytes[0]; /* * Convert: * * Jcc.d32 __x86_indirect_thunk_\reg * * into: * * Jncc.d8 1f * [ LFENCE ] * JMP *%\reg * [ NOP ] * 1: */ if (is_jcc32(insn)) { cc = insn->opcode.bytes[1] & 0xf; cc ^= 1; /* invert condition */ bytes[i++] = 0x70 + cc; /* Jcc.d8 */ bytes[i++] = insn->length - 2; /* sizeof(Jcc.d8) == 2 */ /* Continue as if: JMP.d32 __x86_indirect_thunk_\reg */ op = JMP32_INSN_OPCODE; } /* * For RETPOLINE_LFENCE: prepend the indirect CALL/JMP with an LFENCE. */ if (cpu_feature_enabled(X86_FEATURE_RETPOLINE_LFENCE)) { bytes[i++] = 0x0f; bytes[i++] = 0xae; bytes[i++] = 0xe8; /* LFENCE */ } ret = emit_indirect(op, reg, bytes + i); if (ret < 0) return ret; i += ret; /* * The compiler is supposed to EMIT an INT3 after every unconditional * JMP instruction due to AMD BTC. However, if the compiler is too old * or SLS isn't enabled, we still need an INT3 after indirect JMPs * even on Intel. */ if (op == JMP32_INSN_OPCODE && i < insn->length) bytes[i++] = INT3_INSN_OPCODE; for (; i < insn->length;) bytes[i++] = BYTES_NOP1; return i; } /* * Generated by 'objtool --retpoline'. */ void __init_or_module noinline apply_retpolines(s32 *start, s32 *end) { s32 *s; for (s = start; s < end; s++) { void *addr = (void *)s + *s; struct insn insn; int len, ret; u8 bytes[16]; u8 op1, op2; ret = insn_decode_kernel(&insn, addr); if (WARN_ON_ONCE(ret < 0)) continue; op1 = insn.opcode.bytes[0]; op2 = insn.opcode.bytes[1]; switch (op1) { case CALL_INSN_OPCODE: case JMP32_INSN_OPCODE: break; case 0x0f: /* escape */ if (op2 >= 0x80 && op2 <= 0x8f) break; fallthrough; default: WARN_ON_ONCE(1); continue; } DPRINTK(RETPOLINE, "retpoline at: %pS (%px) len: %d to: %pS", addr, addr, insn.length, addr + insn.length + insn.immediate.value); len = patch_retpoline(addr, &insn, bytes); if (len == insn.length) { optimize_nops(bytes, len); DUMP_BYTES(RETPOLINE, ((u8*)addr), len, "%px: orig: ", addr); DUMP_BYTES(RETPOLINE, ((u8*)bytes), len, "%px: repl: ", addr); text_poke_early(addr, bytes, len); } } } #ifdef CONFIG_RETHUNK /* * Rewrite the compiler generated return thunk tail-calls. * * For example, convert: * * JMP __x86_return_thunk * * into: * * RET */ static int patch_return(void *addr, struct insn *insn, u8 *bytes) { int i = 0; /* Patch the custom return thunks... */ if (cpu_feature_enabled(X86_FEATURE_RETHUNK)) { i = JMP32_INSN_SIZE; __text_gen_insn(bytes, JMP32_INSN_OPCODE, addr, x86_return_thunk, i); } else { /* ... or patch them out if not needed. */ bytes[i++] = RET_INSN_OPCODE; } for (; i < insn->length;) bytes[i++] = INT3_INSN_OPCODE; return i; } void __init_or_module noinline apply_returns(s32 *start, s32 *end) { s32 *s; if (cpu_feature_enabled(X86_FEATURE_RETHUNK)) static_call_force_reinit(); for (s = start; s < end; s++) { void *dest = NULL, *addr = (void *)s + *s; struct insn insn; int len, ret; u8 bytes[16]; u8 op; ret = insn_decode_kernel(&insn, addr); if (WARN_ON_ONCE(ret < 0)) continue; op = insn.opcode.bytes[0]; if (op == JMP32_INSN_OPCODE) dest = addr + insn.length + insn.immediate.value; if (__static_call_fixup(addr, op, dest) || WARN_ONCE(dest != &__x86_return_thunk, "missing return thunk: %pS-%pS: %*ph", addr, dest, 5, addr)) continue; DPRINTK(RET, "return thunk at: %pS (%px) len: %d to: %pS", addr, addr, insn.length, addr + insn.length + insn.immediate.value); len = patch_return(addr, &insn, bytes); if (len == insn.length) { DUMP_BYTES(RET, ((u8*)addr), len, "%px: orig: ", addr); DUMP_BYTES(RET, ((u8*)bytes), len, "%px: repl: ", addr); text_poke_early(addr, bytes, len); } } } #else void __init_or_module noinline apply_returns(s32 *start, s32 *end) { } #endif /* CONFIG_RETHUNK */ #else /* !CONFIG_RETPOLINE || !CONFIG_OBJTOOL */ void __init_or_module noinline apply_retpolines(s32 *start, s32 *end) { } void __init_or_module noinline apply_returns(s32 *start, s32 *end) { } #endif /* CONFIG_RETPOLINE && CONFIG_OBJTOOL */ #ifdef CONFIG_X86_KERNEL_IBT static void poison_cfi(void *addr); static void __init_or_module poison_endbr(void *addr, bool warn) { u32 endbr, poison = gen_endbr_poison(); if (WARN_ON_ONCE(get_kernel_nofault(endbr, addr))) return; if (!is_endbr(endbr)) { WARN_ON_ONCE(warn); return; } DPRINTK(ENDBR, "ENDBR at: %pS (%px)", addr, addr); /* * When we have IBT, the lack of ENDBR will trigger #CP */ DUMP_BYTES(ENDBR, ((u8*)addr), 4, "%px: orig: ", addr); DUMP_BYTES(ENDBR, ((u8*)&poison), 4, "%px: repl: ", addr); text_poke_early(addr, &poison, 4); } /* * Generated by: objtool --ibt * * Seal the functions for indirect calls by clobbering the ENDBR instructions * and the kCFI hash value. */ void __init_or_module noinline apply_seal_endbr(s32 *start, s32 *end) { s32 *s; for (s = start; s < end; s++) { void *addr = (void *)s + *s; poison_endbr(addr, true); if (IS_ENABLED(CONFIG_FINEIBT)) poison_cfi(addr - 16); } } #else void __init_or_module apply_seal_endbr(s32 *start, s32 *end) { } #endif /* CONFIG_X86_KERNEL_IBT */ #ifdef CONFIG_FINEIBT #define __CFI_DEFAULT CFI_DEFAULT #elif defined(CONFIG_CFI_CLANG) #define __CFI_DEFAULT CFI_KCFI #else #define __CFI_DEFAULT CFI_OFF #endif enum cfi_mode cfi_mode __ro_after_init = __CFI_DEFAULT; #ifdef CONFIG_CFI_CLANG struct bpf_insn; /* Must match bpf_func_t / DEFINE_BPF_PROG_RUN() */ extern unsigned int __bpf_prog_runX(const void *ctx, const struct bpf_insn *insn); /* * Force a reference to the external symbol so the compiler generates * __kcfi_typid. */ __ADDRESSABLE(__bpf_prog_runX); /* u32 __ro_after_init cfi_bpf_hash = __kcfi_typeid___bpf_prog_runX; */ asm ( " .pushsection .data..ro_after_init,\"aw\",@progbits \n" " .type cfi_bpf_hash,@object \n" " .globl cfi_bpf_hash \n" " .p2align 2, 0x0 \n" "cfi_bpf_hash: \n" " .long __kcfi_typeid___bpf_prog_runX \n" " .size cfi_bpf_hash, 4 \n" " .popsection \n" ); /* Must match bpf_callback_t */ extern u64 __bpf_callback_fn(u64, u64, u64, u64, u64); __ADDRESSABLE(__bpf_callback_fn); /* u32 __ro_after_init cfi_bpf_subprog_hash = __kcfi_typeid___bpf_callback_fn; */ asm ( " .pushsection .data..ro_after_init,\"aw\",@progbits \n" " .type cfi_bpf_subprog_hash,@object \n" " .globl cfi_bpf_subprog_hash \n" " .p2align 2, 0x0 \n" "cfi_bpf_subprog_hash: \n" " .long __kcfi_typeid___bpf_callback_fn \n" " .size cfi_bpf_subprog_hash, 4 \n" " .popsection \n" ); u32 cfi_get_func_hash(void *func) { u32 hash; func -= cfi_get_offset(); switch (cfi_mode) { case CFI_FINEIBT: func += 7; break; case CFI_KCFI: func += 1; break; default: return 0; } if (get_kernel_nofault(hash, func)) return 0; return hash; } #endif #ifdef CONFIG_FINEIBT static bool cfi_rand __ro_after_init = true; static u32 cfi_seed __ro_after_init; /* * Re-hash the CFI hash with a boot-time seed while making sure the result is * not a valid ENDBR instruction. */ static u32 cfi_rehash(u32 hash) { hash ^= cfi_seed; while (unlikely(is_endbr(hash) || is_endbr(-hash))) { bool lsb = hash & 1; hash >>= 1; if (lsb) hash ^= 0x80200003; } return hash; } static __init int cfi_parse_cmdline(char *str) { if (!str) return -EINVAL; while (str) { char *next = strchr(str, ','); if (next) { *next = 0; next++; } if (!strcmp(str, "auto")) { cfi_mode = CFI_DEFAULT; } else if (!strcmp(str, "off")) { cfi_mode = CFI_OFF; cfi_rand = false; } else if (!strcmp(str, "kcfi")) { cfi_mode = CFI_KCFI; } else if (!strcmp(str, "fineibt")) { cfi_mode = CFI_FINEIBT; } else if (!strcmp(str, "norand")) { cfi_rand = false; } else { pr_err("Ignoring unknown cfi option (%s).", str); } str = next; } return 0; } early_param("cfi", cfi_parse_cmdline); /* * kCFI FineIBT * * __cfi_\func: __cfi_\func: * movl $0x12345678,%eax // 5 endbr64 // 4 * nop subl $0x12345678,%r10d // 7 * nop jz 1f // 2 * nop ud2 // 2 * nop 1: nop // 1 * nop * nop * nop * nop * nop * nop * nop * * * caller: caller: * movl $(-0x12345678),%r10d // 6 movl $0x12345678,%r10d // 6 * addl $-15(%r11),%r10d // 4 sub $16,%r11 // 4 * je 1f // 2 nop4 // 4 * ud2 // 2 * 1: call __x86_indirect_thunk_r11 // 5 call *%r11; nop2; // 5 * */ asm( ".pushsection .rodata \n" "fineibt_preamble_start: \n" " endbr64 \n" " subl $0x12345678, %r10d \n" " je fineibt_preamble_end \n" " ud2 \n" " nop \n" "fineibt_preamble_end: \n" ".popsection\n" ); extern u8 fineibt_preamble_start[]; extern u8 fineibt_preamble_end[]; #define fineibt_preamble_size (fineibt_preamble_end - fineibt_preamble_start) #define fineibt_preamble_hash 7 asm( ".pushsection .rodata \n" "fineibt_caller_start: \n" " movl $0x12345678, %r10d \n" " sub $16, %r11 \n" ASM_NOP4 "fineibt_caller_end: \n" ".popsection \n" ); extern u8 fineibt_caller_start[]; extern u8 fineibt_caller_end[]; #define fineibt_caller_size (fineibt_caller_end - fineibt_caller_start) #define fineibt_caller_hash 2 #define fineibt_caller_jmp (fineibt_caller_size - 2) static u32 decode_preamble_hash(void *addr) { u8 *p = addr; /* b8 78 56 34 12 mov $0x12345678,%eax */ if (p[0] == 0xb8) return *(u32 *)(addr + 1); return 0; /* invalid hash value */ } static u32 decode_caller_hash(void *addr) { u8 *p = addr; /* 41 ba 78 56 34 12 mov $0x12345678,%r10d */ if (p[0] == 0x41 && p[1] == 0xba) return -*(u32 *)(addr + 2); /* e8 0c 78 56 34 12 jmp.d8 +12 */ if (p[0] == JMP8_INSN_OPCODE && p[1] == fineibt_caller_jmp) return -*(u32 *)(addr + 2); return 0; /* invalid hash value */ } /* .retpoline_sites */ static int cfi_disable_callers(s32 *start, s32 *end) { /* * Disable kCFI by patching in a JMP.d8, this leaves the hash immediate * in tact for later usage. Also see decode_caller_hash() and * cfi_rewrite_callers(). */ const u8 jmp[] = { JMP8_INSN_OPCODE, fineibt_caller_jmp }; s32 *s; for (s = start; s < end; s++) { void *addr = (void *)s + *s; u32 hash; addr -= fineibt_caller_size; hash = decode_caller_hash(addr); if (!hash) /* nocfi callers */ continue; text_poke_early(addr, jmp, 2); } return 0; } static int cfi_enable_callers(s32 *start, s32 *end) { /* * Re-enable kCFI, undo what cfi_disable_callers() did. */ const u8 mov[] = { 0x41, 0xba }; s32 *s; for (s = start; s < end; s++) { void *addr = (void *)s + *s; u32 hash; addr -= fineibt_caller_size; hash = decode_caller_hash(addr); if (!hash) /* nocfi callers */ continue; text_poke_early(addr, mov, 2); } return 0; } /* .cfi_sites */ static int cfi_rand_preamble(s32 *start, s32 *end) { s32 *s; for (s = start; s < end; s++) { void *addr = (void *)s + *s; u32 hash; hash = decode_preamble_hash(addr); if (WARN(!hash, "no CFI hash found at: %pS %px %*ph\n", addr, addr, 5, addr)) return -EINVAL; hash = cfi_rehash(hash); text_poke_early(addr + 1, &hash, 4); } return 0; } static int cfi_rewrite_preamble(s32 *start, s32 *end) { s32 *s; for (s = start; s < end; s++) { void *addr = (void *)s + *s; u32 hash; hash = decode_preamble_hash(addr); if (WARN(!hash, "no CFI hash found at: %pS %px %*ph\n", addr, addr, 5, addr)) return -EINVAL; text_poke_early(addr, fineibt_preamble_start, fineibt_preamble_size); WARN_ON(*(u32 *)(addr + fineibt_preamble_hash) != 0x12345678); text_poke_early(addr + fineibt_preamble_hash, &hash, 4); } return 0; } static void cfi_rewrite_endbr(s32 *start, s32 *end) { s32 *s; for (s = start; s < end; s++) { void *addr = (void *)s + *s; poison_endbr(addr+16, false); } } /* .retpoline_sites */ static int cfi_rand_callers(s32 *start, s32 *end) { s32 *s; for (s = start; s < end; s++) { void *addr = (void *)s + *s; u32 hash; addr -= fineibt_caller_size; hash = decode_caller_hash(addr); if (hash) { hash = -cfi_rehash(hash); text_poke_early(addr + 2, &hash, 4); } } return 0; } static int cfi_rewrite_callers(s32 *start, s32 *end) { s32 *s; for (s = start; s < end; s++) { void *addr = (void *)s + *s; u32 hash; addr -= fineibt_caller_size; hash = decode_caller_hash(addr); if (hash) { text_poke_early(addr, fineibt_caller_start, fineibt_caller_size); WARN_ON(*(u32 *)(addr + fineibt_caller_hash) != 0x12345678); text_poke_early(addr + fineibt_caller_hash, &hash, 4); } /* rely on apply_retpolines() */ } return 0; } static void __apply_fineibt(s32 *start_retpoline, s32 *end_retpoline, s32 *start_cfi, s32 *end_cfi, bool builtin) { int ret; if (WARN_ONCE(fineibt_preamble_size != 16, "FineIBT preamble wrong size: %ld", fineibt_preamble_size)) return; if (cfi_mode == CFI_DEFAULT) { cfi_mode = CFI_KCFI; if (HAS_KERNEL_IBT && cpu_feature_enabled(X86_FEATURE_IBT)) cfi_mode = CFI_FINEIBT; } /* * Rewrite the callers to not use the __cfi_ stubs, such that we might * rewrite them. This disables all CFI. If this succeeds but any of the * later stages fails, we're without CFI. */ ret = cfi_disable_callers(start_retpoline, end_retpoline); if (ret) goto err; if (cfi_rand) { if (builtin) { cfi_seed = get_random_u32(); cfi_bpf_hash = cfi_rehash(cfi_bpf_hash); cfi_bpf_subprog_hash = cfi_rehash(cfi_bpf_subprog_hash); } ret = cfi_rand_preamble(start_cfi, end_cfi); if (ret) goto err; ret = cfi_rand_callers(start_retpoline, end_retpoline); if (ret) goto err; } switch (cfi_mode) { case CFI_OFF: if (builtin) pr_info("Disabling CFI\n"); return; case CFI_KCFI: ret = cfi_enable_callers(start_retpoline, end_retpoline); if (ret) goto err; if (builtin) pr_info("Using kCFI\n"); return; case CFI_FINEIBT: /* place the FineIBT preamble at func()-16 */ ret = cfi_rewrite_preamble(start_cfi, end_cfi); if (ret) goto err; /* rewrite the callers to target func()-16 */ ret = cfi_rewrite_callers(start_retpoline, end_retpoline); if (ret) goto err; /* now that nobody targets func()+0, remove ENDBR there */ cfi_rewrite_endbr(start_cfi, end_cfi); if (builtin) pr_info("Using FineIBT CFI\n"); return; default: break; } err: pr_err("Something went horribly wrong trying to rewrite the CFI implementation.\n"); } static inline void poison_hash(void *addr) { *(u32 *)addr = 0; } static void poison_cfi(void *addr) { switch (cfi_mode) { case CFI_FINEIBT: /* * __cfi_\func: * osp nopl (%rax) * subl $0, %r10d * jz 1f * ud2 * 1: nop */ poison_endbr(addr, false); poison_hash(addr + fineibt_preamble_hash); break; case CFI_KCFI: /* * __cfi_\func: * movl $0, %eax * .skip 11, 0x90 */ poison_hash(addr + 1); break; default: break; } } #else static void __apply_fineibt(s32 *start_retpoline, s32 *end_retpoline, s32 *start_cfi, s32 *end_cfi, bool builtin) { } #ifdef CONFIG_X86_KERNEL_IBT static void poison_cfi(void *addr) { } #endif #endif void apply_fineibt(s32 *start_retpoline, s32 *end_retpoline, s32 *start_cfi, s32 *end_cfi) { return __apply_fineibt(start_retpoline, end_retpoline, start_cfi, end_cfi, /* .builtin = */ false); } #ifdef CONFIG_SMP static void alternatives_smp_lock(const s32 *start, const s32 *end, u8 *text, u8 *text_end) { const s32 *poff; for (poff = start; poff < end; poff++) { u8 *ptr = (u8 *)poff + *poff; if (!*poff || ptr < text || ptr >= text_end) continue; /* turn DS segment override prefix into lock prefix */ if (*ptr == 0x3e) text_poke(ptr, ((unsigned char []){0xf0}), 1); } } static void alternatives_smp_unlock(const s32 *start, const s32 *end, u8 *text, u8 *text_end) { const s32 *poff; for (poff = start; poff < end; poff++) { u8 *ptr = (u8 *)poff + *poff; if (!*poff || ptr < text || ptr >= text_end) continue; /* turn lock prefix into DS segment override prefix */ if (*ptr == 0xf0) text_poke(ptr, ((unsigned char []){0x3E}), 1); } } struct smp_alt_module { /* what is this ??? */ struct module *mod; char *name; /* ptrs to lock prefixes */ const s32 *locks; const s32 *locks_end; /* .text segment, needed to avoid patching init code ;) */ u8 *text; u8 *text_end; struct list_head next; }; static LIST_HEAD(smp_alt_modules); static bool uniproc_patched = false; /* protected by text_mutex */ void __init_or_module alternatives_smp_module_add(struct module *mod, char *name, void *locks, void *locks_end, void *text, void *text_end) { struct smp_alt_module *smp; mutex_lock(&text_mutex); if (!uniproc_patched) goto unlock; if (num_possible_cpus() == 1) /* Don't bother remembering, we'll never have to undo it. */ goto smp_unlock; smp = kzalloc(sizeof(*smp), GFP_KERNEL); if (NULL == smp) /* we'll run the (safe but slow) SMP code then ... */ goto unlock; smp->mod = mod; smp->name = name; smp->locks = locks; smp->locks_end = locks_end; smp->text = text; smp->text_end = text_end; DPRINTK(SMP, "locks %p -> %p, text %p -> %p, name %s\n", smp->locks, smp->locks_end, smp->text, smp->text_end, smp->name); list_add_tail(&smp->next, &smp_alt_modules); smp_unlock: alternatives_smp_unlock(locks, locks_end, text, text_end); unlock: mutex_unlock(&text_mutex); } void __init_or_module alternatives_smp_module_del(struct module *mod) { struct smp_alt_module *item; mutex_lock(&text_mutex); list_for_each_entry(item, &smp_alt_modules, next) { if (mod != item->mod) continue; list_del(&item->next); kfree(item); break; } mutex_unlock(&text_mutex); } void alternatives_enable_smp(void) { struct smp_alt_module *mod; /* Why bother if there are no other CPUs? */ BUG_ON(num_possible_cpus() == 1); mutex_lock(&text_mutex); if (uniproc_patched) { pr_info("switching to SMP code\n"); BUG_ON(num_online_cpus() != 1); clear_cpu_cap(&boot_cpu_data, X86_FEATURE_UP); clear_cpu_cap(&cpu_data(0), X86_FEATURE_UP); list_for_each_entry(mod, &smp_alt_modules, next) alternatives_smp_lock(mod->locks, mod->locks_end, mod->text, mod->text_end); uniproc_patched = false; } mutex_unlock(&text_mutex); } /* * Return 1 if the address range is reserved for SMP-alternatives. * Must hold text_mutex. */ int alternatives_text_reserved(void *start, void *end) { struct smp_alt_module *mod; const s32 *poff; u8 *text_start = start; u8 *text_end = end; lockdep_assert_held(&text_mutex); list_for_each_entry(mod, &smp_alt_modules, next) { if (mod->text > text_end || mod->text_end < text_start) continue; for (poff = mod->locks; poff < mod->locks_end; poff++) { const u8 *ptr = (const u8 *)poff + *poff; if (text_start <= ptr && text_end > ptr) return 1; } } return 0; } #endif /* CONFIG_SMP */ /* * Self-test for the INT3 based CALL emulation code. * * This exercises int3_emulate_call() to make sure INT3 pt_regs are set up * properly and that there is a stack gap between the INT3 frame and the * previous context. Without this gap doing a virtual PUSH on the interrupted * stack would corrupt the INT3 IRET frame. * * See entry_{32,64}.S for more details. */ /* * We define the int3_magic() function in assembly to control the calling * convention such that we can 'call' it from assembly. */ extern void int3_magic(unsigned int *ptr); /* defined in asm */ asm ( " .pushsection .init.text, \"ax\", @progbits\n" " .type int3_magic, @function\n" "int3_magic:\n" ANNOTATE_NOENDBR " movl $1, (%" _ASM_ARG1 ")\n" ASM_RET " .size int3_magic, .-int3_magic\n" " .popsection\n" ); extern void int3_selftest_ip(void); /* defined in asm below */ static int __init int3_exception_notify(struct notifier_block *self, unsigned long val, void *data) { unsigned long selftest = (unsigned long)&int3_selftest_ip; struct die_args *args = data; struct pt_regs *regs = args->regs; OPTIMIZER_HIDE_VAR(selftest); if (!regs || user_mode(regs)) return NOTIFY_DONE; if (val != DIE_INT3) return NOTIFY_DONE; if (regs->ip - INT3_INSN_SIZE != selftest) return NOTIFY_DONE; int3_emulate_call(regs, (unsigned long)&int3_magic); return NOTIFY_STOP; } /* Must be noinline to ensure uniqueness of int3_selftest_ip. */ static noinline void __init int3_selftest(void) { static __initdata struct notifier_block int3_exception_nb = { .notifier_call = int3_exception_notify, .priority = INT_MAX-1, /* last */ }; unsigned int val = 0; BUG_ON(register_die_notifier(&int3_exception_nb)); /* * Basically: int3_magic(&val); but really complicated :-) * * INT3 padded with NOP to CALL_INSN_SIZE. The int3_exception_nb * notifier above will emulate CALL for us. */ asm volatile ("int3_selftest_ip:\n\t" ANNOTATE_NOENDBR " int3; nop; nop; nop; nop\n\t" : ASM_CALL_CONSTRAINT : __ASM_SEL_RAW(a, D) (&val) : "memory"); BUG_ON(val != 1); unregister_die_notifier(&int3_exception_nb); } static __initdata int __alt_reloc_selftest_addr; extern void __init __alt_reloc_selftest(void *arg); __visible noinline void __init __alt_reloc_selftest(void *arg) { WARN_ON(arg != &__alt_reloc_selftest_addr); } static noinline void __init alt_reloc_selftest(void) { /* * Tests apply_relocation(). * * This has a relative immediate (CALL) in a place other than the first * instruction and additionally on x86_64 we get a RIP-relative LEA: * * lea 0x0(%rip),%rdi # 5d0: R_X86_64_PC32 .init.data+0x5566c * call +0 # 5d5: R_X86_64_PLT32 __alt_reloc_selftest-0x4 * * Getting this wrong will either crash and burn or tickle the WARN * above. */ asm_inline volatile ( ALTERNATIVE("", "lea %[mem], %%" _ASM_ARG1 "; call __alt_reloc_selftest;", X86_FEATURE_ALWAYS) : /* output */ : [mem] "m" (__alt_reloc_selftest_addr) : _ASM_ARG1 ); } void __init alternative_instructions(void) { int3_selftest(); /* * The patching is not fully atomic, so try to avoid local * interruptions that might execute the to be patched code. * Other CPUs are not running. */ stop_nmi(); /* * Don't stop machine check exceptions while patching. * MCEs only happen when something got corrupted and in this * case we must do something about the corruption. * Ignoring it is worse than an unlikely patching race. * Also machine checks tend to be broadcast and if one CPU * goes into machine check the others follow quickly, so we don't * expect a machine check to cause undue problems during to code * patching. */ /* * Make sure to set (artificial) features depending on used paravirt * functions which can later influence alternative patching. */ paravirt_set_cap(); __apply_fineibt(__retpoline_sites, __retpoline_sites_end, __cfi_sites, __cfi_sites_end, true); /* * Rewrite the retpolines, must be done before alternatives since * those can rewrite the retpoline thunks. */ apply_retpolines(__retpoline_sites, __retpoline_sites_end); apply_returns(__return_sites, __return_sites_end); apply_alternatives(__alt_instructions, __alt_instructions_end); /* * Now all calls are established. Apply the call thunks if * required. */ callthunks_patch_builtin_calls(); /* * Seal all functions that do not have their address taken. */ apply_seal_endbr(__ibt_endbr_seal, __ibt_endbr_seal_end); #ifdef CONFIG_SMP /* Patch to UP if other cpus not imminent. */ if (!noreplace_smp && (num_present_cpus() == 1 || setup_max_cpus <= 1)) { uniproc_patched = true; alternatives_smp_module_add(NULL, "core kernel", __smp_locks, __smp_locks_end, _text, _etext); } if (!uniproc_patched || num_possible_cpus() == 1) { free_init_pages("SMP alternatives", (unsigned long)__smp_locks, (unsigned long)__smp_locks_end); } #endif restart_nmi(); alternatives_patched = 1; alt_reloc_selftest(); } /** * text_poke_early - Update instructions on a live kernel at boot time * @addr: address to modify * @opcode: source of the copy * @len: length to copy * * When you use this code to patch more than one byte of an instruction * you need to make sure that other CPUs cannot execute this code in parallel. * Also no thread must be currently preempted in the middle of these * instructions. And on the local CPU you need to be protected against NMI or * MCE handlers seeing an inconsistent instruction while you patch. */ void __init_or_module text_poke_early(void *addr, const void *opcode, size_t len) { unsigned long flags; if (boot_cpu_has(X86_FEATURE_NX) && is_module_text_address((unsigned long)addr)) { /* * Modules text is marked initially as non-executable, so the * code cannot be running and speculative code-fetches are * prevented. Just change the code. */ memcpy(addr, opcode, len); } else { local_irq_save(flags); memcpy(addr, opcode, len); sync_core(); local_irq_restore(flags); /* * Could also do a CLFLUSH here to speed up CPU recovery; but * that causes hangs on some VIA CPUs. */ } } typedef struct { struct mm_struct *mm; } temp_mm_state_t; /* * Using a temporary mm allows to set temporary mappings that are not accessible * by other CPUs. Such mappings are needed to perform sensitive memory writes * that override the kernel memory protections (e.g., W^X), without exposing the * temporary page-table mappings that are required for these write operations to * other CPUs. Using a temporary mm also allows to avoid TLB shootdowns when the * mapping is torn down. * * Context: The temporary mm needs to be used exclusively by a single core. To * harden security IRQs must be disabled while the temporary mm is * loaded, thereby preventing interrupt handler bugs from overriding * the kernel memory protection. */ static inline temp_mm_state_t use_temporary_mm(struct mm_struct *mm) { temp_mm_state_t temp_state; lockdep_assert_irqs_disabled(); /* * Make sure not to be in TLB lazy mode, as otherwise we'll end up * with a stale address space WITHOUT being in lazy mode after * restoring the previous mm. */ if (this_cpu_read(cpu_tlbstate_shared.is_lazy)) leave_mm(smp_processor_id()); temp_state.mm = this_cpu_read(cpu_tlbstate.loaded_mm); switch_mm_irqs_off(NULL, mm, current); /* * If breakpoints are enabled, disable them while the temporary mm is * used. Userspace might set up watchpoints on addresses that are used * in the temporary mm, which would lead to wrong signals being sent or * crashes. * * Note that breakpoints are not disabled selectively, which also causes * kernel breakpoints (e.g., perf's) to be disabled. This might be * undesirable, but still seems reasonable as the code that runs in the * temporary mm should be short. */ if (hw_breakpoint_active()) hw_breakpoint_disable(); return temp_state; } static inline void unuse_temporary_mm(temp_mm_state_t prev_state) { lockdep_assert_irqs_disabled(); switch_mm_irqs_off(NULL, prev_state.mm, current); /* * Restore the breakpoints if they were disabled before the temporary mm * was loaded. */ if (hw_breakpoint_active()) hw_breakpoint_restore(); } __ro_after_init struct mm_struct *poking_mm; __ro_after_init unsigned long poking_addr; static void text_poke_memcpy(void *dst, const void *src, size_t len) { memcpy(dst, src, len); } static void text_poke_memset(void *dst, const void *src, size_t len) { int c = *(const int *)src; memset(dst, c, len); } typedef void text_poke_f(void *dst, const void *src, size_t len); static void *__text_poke(text_poke_f func, void *addr, const void *src, size_t len) { bool cross_page_boundary = offset_in_page(addr) + len > PAGE_SIZE; struct page *pages[2] = {NULL}; temp_mm_state_t prev; unsigned long flags; pte_t pte, *ptep; spinlock_t *ptl; pgprot_t pgprot; /* * While boot memory allocator is running we cannot use struct pages as * they are not yet initialized. There is no way to recover. */ BUG_ON(!after_bootmem); if (!core_kernel_text((unsigned long)addr)) { pages[0] = vmalloc_to_page(addr); if (cross_page_boundary) pages[1] = vmalloc_to_page(addr + PAGE_SIZE); } else { pages[0] = virt_to_page(addr); WARN_ON(!PageReserved(pages[0])); if (cross_page_boundary) pages[1] = virt_to_page(addr + PAGE_SIZE); } /* * If something went wrong, crash and burn since recovery paths are not * implemented. */ BUG_ON(!pages[0] || (cross_page_boundary && !pages[1])); /* * Map the page without the global bit, as TLB flushing is done with * flush_tlb_mm_range(), which is intended for non-global PTEs. */ pgprot = __pgprot(pgprot_val(PAGE_KERNEL) & ~_PAGE_GLOBAL); /* * The lock is not really needed, but this allows to avoid open-coding. */ ptep = get_locked_pte(poking_mm, poking_addr, &ptl); /* * This must not fail; preallocated in poking_init(). */ VM_BUG_ON(!ptep); local_irq_save(flags); pte = mk_pte(pages[0], pgprot); set_pte_at(poking_mm, poking_addr, ptep, pte); if (cross_page_boundary) { pte = mk_pte(pages[1], pgprot); set_pte_at(poking_mm, poking_addr + PAGE_SIZE, ptep + 1, pte); } /* * Loading the temporary mm behaves as a compiler barrier, which * guarantees that the PTE will be set at the time memcpy() is done. */ prev = use_temporary_mm(poking_mm); kasan_disable_current(); func((u8 *)poking_addr + offset_in_page(addr), src, len); kasan_enable_current(); /* * Ensure that the PTE is only cleared after the instructions of memcpy * were issued by using a compiler barrier. */ barrier(); pte_clear(poking_mm, poking_addr, ptep); if (cross_page_boundary) pte_clear(poking_mm, poking_addr + PAGE_SIZE, ptep + 1); /* * Loading the previous page-table hierarchy requires a serializing * instruction that already allows the core to see the updated version. * Xen-PV is assumed to serialize execution in a similar manner. */ unuse_temporary_mm(prev); /* * Flushing the TLB might involve IPIs, which would require enabled * IRQs, but not if the mm is not used, as it is in this point. */ flush_tlb_mm_range(poking_mm, poking_addr, poking_addr + (cross_page_boundary ? 2 : 1) * PAGE_SIZE, PAGE_SHIFT, false); if (func == text_poke_memcpy) { /* * If the text does not match what we just wrote then something is * fundamentally screwy; there's nothing we can really do about that. */ BUG_ON(memcmp(addr, src, len)); } local_irq_restore(flags); pte_unmap_unlock(ptep, ptl); return addr; } /** * text_poke - Update instructions on a live kernel * @addr: address to modify * @opcode: source of the copy * @len: length to copy * * Only atomic text poke/set should be allowed when not doing early patching. * It means the size must be writable atomically and the address must be aligned * in a way that permits an atomic write. It also makes sure we fit on a single * page. * * Note that the caller must ensure that if the modified code is part of a * module, the module would not be removed during poking. This can be achieved * by registering a module notifier, and ordering module removal and patching * through a mutex. */ void *text_poke(void *addr, const void *opcode, size_t len) { lockdep_assert_held(&text_mutex); return __text_poke(text_poke_memcpy, addr, opcode, len); } /** * text_poke_kgdb - Update instructions on a live kernel by kgdb * @addr: address to modify * @opcode: source of the copy * @len: length to copy * * Only atomic text poke/set should be allowed when not doing early patching. * It means the size must be writable atomically and the address must be aligned * in a way that permits an atomic write. It also makes sure we fit on a single * page. * * Context: should only be used by kgdb, which ensures no other core is running, * despite the fact it does not hold the text_mutex. */ void *text_poke_kgdb(void *addr, const void *opcode, size_t len) { return __text_poke(text_poke_memcpy, addr, opcode, len); } void *text_poke_copy_locked(void *addr, const void *opcode, size_t len, bool core_ok) { unsigned long start = (unsigned long)addr; size_t patched = 0; if (WARN_ON_ONCE(!core_ok && core_kernel_text(start))) return NULL; while (patched < len) { unsigned long ptr = start + patched; size_t s; s = min_t(size_t, PAGE_SIZE * 2 - offset_in_page(ptr), len - patched); __text_poke(text_poke_memcpy, (void *)ptr, opcode + patched, s); patched += s; } return addr; } /** * text_poke_copy - Copy instructions into (an unused part of) RX memory * @addr: address to modify * @opcode: source of the copy * @len: length to copy, could be more than 2x PAGE_SIZE * * Not safe against concurrent execution; useful for JITs to dump * new code blocks into unused regions of RX memory. Can be used in * conjunction with synchronize_rcu_tasks() to wait for existing * execution to quiesce after having made sure no existing functions * pointers are live. */ void *text_poke_copy(void *addr, const void *opcode, size_t len) { mutex_lock(&text_mutex); addr = text_poke_copy_locked(addr, opcode, len, false); mutex_unlock(&text_mutex); return addr; } /** * text_poke_set - memset into (an unused part of) RX memory * @addr: address to modify * @c: the byte to fill the area with * @len: length to copy, could be more than 2x PAGE_SIZE * * This is useful to overwrite unused regions of RX memory with illegal * instructions. */ void *text_poke_set(void *addr, int c, size_t len) { unsigned long start = (unsigned long)addr; size_t patched = 0; if (WARN_ON_ONCE(core_kernel_text(start))) return NULL; mutex_lock(&text_mutex); while (patched < len) { unsigned long ptr = start + patched; size_t s; s = min_t(size_t, PAGE_SIZE * 2 - offset_in_page(ptr), len - patched); __text_poke(text_poke_memset, (void *)ptr, (void *)&c, s); patched += s; } mutex_unlock(&text_mutex); return addr; } static void do_sync_core(void *info) { sync_core(); } void text_poke_sync(void) { on_each_cpu(do_sync_core, NULL, 1); } /* * NOTE: crazy scheme to allow patching Jcc.d32 but not increase the size of * this thing. When len == 6 everything is prefixed with 0x0f and we map * opcode to Jcc.d8, using len to distinguish. */ struct text_poke_loc { /* addr := _stext + rel_addr */ s32 rel_addr; s32 disp; u8 len; u8 opcode; const u8 text[POKE_MAX_OPCODE_SIZE]; /* see text_poke_bp_batch() */ u8 old; }; struct bp_patching_desc { struct text_poke_loc *vec; int nr_entries; atomic_t refs; }; static struct bp_patching_desc bp_desc; static __always_inline struct bp_patching_desc *try_get_desc(void) { struct bp_patching_desc *desc = &bp_desc; if (!raw_atomic_inc_not_zero(&desc->refs)) return NULL; return desc; } static __always_inline void put_desc(void) { struct bp_patching_desc *desc = &bp_desc; smp_mb__before_atomic(); raw_atomic_dec(&desc->refs); } static __always_inline void *text_poke_addr(struct text_poke_loc *tp) { return _stext + tp->rel_addr; } static __always_inline int patch_cmp(const void *key, const void *elt) { struct text_poke_loc *tp = (struct text_poke_loc *) elt; if (key < text_poke_addr(tp)) return -1; if (key > text_poke_addr(tp)) return 1; return 0; } noinstr int poke_int3_handler(struct pt_regs *regs) { struct bp_patching_desc *desc; struct text_poke_loc *tp; int ret = 0; void *ip; if (user_mode(regs)) return 0; /* * Having observed our INT3 instruction, we now must observe * bp_desc with non-zero refcount: * * bp_desc.refs = 1 INT3 * WMB RMB * write INT3 if (bp_desc.refs != 0) */ smp_rmb(); desc = try_get_desc(); if (!desc) return 0; /* * Discount the INT3. See text_poke_bp_batch(). */ ip = (void *) regs->ip - INT3_INSN_SIZE; /* * Skip the binary search if there is a single member in the vector. */ if (unlikely(desc->nr_entries > 1)) { tp = __inline_bsearch(ip, desc->vec, desc->nr_entries, sizeof(struct text_poke_loc), patch_cmp); if (!tp) goto out_put; } else { tp = desc->vec; if (text_poke_addr(tp) != ip) goto out_put; } ip += tp->len; switch (tp->opcode) { case INT3_INSN_OPCODE: /* * Someone poked an explicit INT3, they'll want to handle it, * do not consume. */ goto out_put; case RET_INSN_OPCODE: int3_emulate_ret(regs); break; case CALL_INSN_OPCODE: int3_emulate_call(regs, (long)ip + tp->disp); break; case JMP32_INSN_OPCODE: case JMP8_INSN_OPCODE: int3_emulate_jmp(regs, (long)ip + tp->disp); break; case 0x70 ... 0x7f: /* Jcc */ int3_emulate_jcc(regs, tp->opcode & 0xf, (long)ip, tp->disp); break; default: BUG(); } ret = 1; out_put: put_desc(); return ret; } #define TP_VEC_MAX (PAGE_SIZE / sizeof(struct text_poke_loc)) static struct text_poke_loc tp_vec[TP_VEC_MAX]; static int tp_vec_nr; /** * text_poke_bp_batch() -- update instructions on live kernel on SMP * @tp: vector of instructions to patch * @nr_entries: number of entries in the vector * * Modify multi-byte instruction by using int3 breakpoint on SMP. * We completely avoid stop_machine() here, and achieve the * synchronization using int3 breakpoint. * * The way it is done: * - For each entry in the vector: * - add a int3 trap to the address that will be patched * - sync cores * - For each entry in the vector: * - update all but the first byte of the patched range * - sync cores * - For each entry in the vector: * - replace the first byte (int3) by the first byte of * replacing opcode * - sync cores */ static void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries) { unsigned char int3 = INT3_INSN_OPCODE; unsigned int i; int do_sync; lockdep_assert_held(&text_mutex); bp_desc.vec = tp; bp_desc.nr_entries = nr_entries; /* * Corresponds to the implicit memory barrier in try_get_desc() to * ensure reading a non-zero refcount provides up to date bp_desc data. */ atomic_set_release(&bp_desc.refs, 1); /* * Function tracing can enable thousands of places that need to be * updated. This can take quite some time, and with full kernel debugging * enabled, this could cause the softlockup watchdog to trigger. * This function gets called every 256 entries added to be patched. * Call cond_resched() here to make sure that other tasks can get scheduled * while processing all the functions being patched. */ cond_resched(); /* * Corresponding read barrier in int3 notifier for making sure the * nr_entries and handler are correctly ordered wrt. patching. */ smp_wmb(); /* * First step: add a int3 trap to the address that will be patched. */ for (i = 0; i < nr_entries; i++) { tp[i].old = *(u8 *)text_poke_addr(&tp[i]); text_poke(text_poke_addr(&tp[i]), &int3, INT3_INSN_SIZE); } text_poke_sync(); /* * Second step: update all but the first byte of the patched range. */ for (do_sync = 0, i = 0; i < nr_entries; i++) { u8 old[POKE_MAX_OPCODE_SIZE+1] = { tp[i].old, }; u8 _new[POKE_MAX_OPCODE_SIZE+1]; const u8 *new = tp[i].text; int len = tp[i].len; if (len - INT3_INSN_SIZE > 0) { memcpy(old + INT3_INSN_SIZE, text_poke_addr(&tp[i]) + INT3_INSN_SIZE, len - INT3_INSN_SIZE); if (len == 6) { _new[0] = 0x0f; memcpy(_new + 1, new, 5); new = _new; } text_poke(text_poke_addr(&tp[i]) + INT3_INSN_SIZE, new + INT3_INSN_SIZE, len - INT3_INSN_SIZE); do_sync++; } /* * Emit a perf event to record the text poke, primarily to * support Intel PT decoding which must walk the executable code * to reconstruct the trace. The flow up to here is: * - write INT3 byte * - IPI-SYNC * - write instruction tail * At this point the actual control flow will be through the * INT3 and handler and not hit the old or new instruction. * Intel PT outputs FUP/TIP packets for the INT3, so the flow * can still be decoded. Subsequently: * - emit RECORD_TEXT_POKE with the new instruction * - IPI-SYNC * - write first byte * - IPI-SYNC * So before the text poke event timestamp, the decoder will see * either the old instruction flow or FUP/TIP of INT3. After the * text poke event timestamp, the decoder will see either the * new instruction flow or FUP/TIP of INT3. Thus decoders can * use the timestamp as the point at which to modify the * executable code. * The old instruction is recorded so that the event can be * processed forwards or backwards. */ perf_event_text_poke(text_poke_addr(&tp[i]), old, len, new, len); } if (do_sync) { /* * According to Intel, this core syncing is very likely * not necessary and we'd be safe even without it. But * better safe than sorry (plus there's not only Intel). */ text_poke_sync(); } /* * Third step: replace the first byte (int3) by the first byte of * replacing opcode. */ for (do_sync = 0, i = 0; i < nr_entries; i++) { u8 byte = tp[i].text[0]; if (tp[i].len == 6) byte = 0x0f; if (byte == INT3_INSN_OPCODE) continue; text_poke(text_poke_addr(&tp[i]), &byte, INT3_INSN_SIZE); do_sync++; } if (do_sync) text_poke_sync(); /* * Remove and wait for refs to be zero. */ if (!atomic_dec_and_test(&bp_desc.refs)) atomic_cond_read_acquire(&bp_desc.refs, !VAL); } static void text_poke_loc_init(struct text_poke_loc *tp, void *addr, const void *opcode, size_t len, const void *emulate) { struct insn insn; int ret, i = 0; if (len == 6) i = 1; memcpy((void *)tp->text, opcode+i, len-i); if (!emulate) emulate = opcode; ret = insn_decode_kernel(&insn, emulate); BUG_ON(ret < 0); tp->rel_addr = addr - (void *)_stext; tp->len = len; tp->opcode = insn.opcode.bytes[0]; if (is_jcc32(&insn)) { /* * Map Jcc.d32 onto Jcc.d8 and use len to distinguish. */ tp->opcode = insn.opcode.bytes[1] - 0x10; } switch (tp->opcode) { case RET_INSN_OPCODE: case JMP32_INSN_OPCODE: case JMP8_INSN_OPCODE: /* * Control flow instructions without implied execution of the * next instruction can be padded with INT3. */ for (i = insn.length; i < len; i++) BUG_ON(tp->text[i] != INT3_INSN_OPCODE); break; default: BUG_ON(len != insn.length); } switch (tp->opcode) { case INT3_INSN_OPCODE: case RET_INSN_OPCODE: break; case CALL_INSN_OPCODE: case JMP32_INSN_OPCODE: case JMP8_INSN_OPCODE: case 0x70 ... 0x7f: /* Jcc */ tp->disp = insn.immediate.value; break; default: /* assume NOP */ switch (len) { case 2: /* NOP2 -- emulate as JMP8+0 */ BUG_ON(memcmp(emulate, x86_nops[len], len)); tp->opcode = JMP8_INSN_OPCODE; tp->disp = 0; break; case 5: /* NOP5 -- emulate as JMP32+0 */ BUG_ON(memcmp(emulate, x86_nops[len], len)); tp->opcode = JMP32_INSN_OPCODE; tp->disp = 0; break; default: /* unknown instruction */ BUG(); } break; } } /* * We hard rely on the tp_vec being ordered; ensure this is so by flushing * early if needed. */ static bool tp_order_fail(void *addr) { struct text_poke_loc *tp; if (!tp_vec_nr) return false; if (!addr) /* force */ return true; tp = &tp_vec[tp_vec_nr - 1]; if ((unsigned long)text_poke_addr(tp) > (unsigned long)addr) return true; return false; } static void text_poke_flush(void *addr) { if (tp_vec_nr == TP_VEC_MAX || tp_order_fail(addr)) { text_poke_bp_batch(tp_vec, tp_vec_nr); tp_vec_nr = 0; } } void text_poke_finish(void) { text_poke_flush(NULL); } void __ref text_poke_queue(void *addr, const void *opcode, size_t len, const void *emulate) { struct text_poke_loc *tp; text_poke_flush(addr); tp = &tp_vec[tp_vec_nr++]; text_poke_loc_init(tp, addr, opcode, len, emulate); } /** * text_poke_bp() -- update instructions on live kernel on SMP * @addr: address to patch * @opcode: opcode of new instruction * @len: length to copy * @emulate: instruction to be emulated * * Update a single instruction with the vector in the stack, avoiding * dynamically allocated memory. This function should be used when it is * not possible to allocate memory. */ void __ref text_poke_bp(void *addr, const void *opcode, size_t len, const void *emulate) { struct text_poke_loc tp; text_poke_loc_init(&tp, addr, opcode, len, emulate); text_poke_bp_batch(&tp, 1); } |
234 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 | /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * Cryptographic API for algorithms (i.e., low-level API). * * Copyright (c) 2006 Herbert Xu <herbert@gondor.apana.org.au> */ #ifndef _CRYPTO_ALGAPI_H #define _CRYPTO_ALGAPI_H #include <crypto/utils.h> #include <linux/align.h> #include <linux/cache.h> #include <linux/crypto.h> #include <linux/types.h> #include <linux/workqueue.h> /* * Maximum values for blocksize and alignmask, used to allocate * static buffers that are big enough for any combination of * algs and architectures. Ciphers have a lower maximum size. */ #define MAX_ALGAPI_BLOCKSIZE 160 #define MAX_ALGAPI_ALIGNMASK 127 #define MAX_CIPHER_BLOCKSIZE 16 #define MAX_CIPHER_ALIGNMASK 15 #ifdef ARCH_DMA_MINALIGN #define CRYPTO_DMA_ALIGN ARCH_DMA_MINALIGN #else #define CRYPTO_DMA_ALIGN CRYPTO_MINALIGN #endif #define CRYPTO_DMA_PADDING ((CRYPTO_DMA_ALIGN - 1) & ~(CRYPTO_MINALIGN - 1)) /* * Autoloaded crypto modules should only use a prefixed name to avoid allowing * arbitrary modules to be loaded. Loading from userspace may still need the * unprefixed names, so retains those aliases as well. * This uses __MODULE_INFO directly instead of MODULE_ALIAS because pre-4.3 * gcc (e.g. avr32 toolchain) uses __LINE__ for uniqueness, and this macro * expands twice on the same line. Instead, use a separate base name for the * alias. */ #define MODULE_ALIAS_CRYPTO(name) \ __MODULE_INFO(alias, alias_userspace, name); \ __MODULE_INFO(alias, alias_crypto, "crypto-" name) struct crypto_aead; struct crypto_instance; struct module; struct notifier_block; struct rtattr; struct scatterlist; struct seq_file; struct sk_buff; struct crypto_type { unsigned int (*ctxsize)(struct crypto_alg *alg, u32 type, u32 mask); unsigned int (*extsize)(struct crypto_alg *alg); int (*init_tfm)(struct crypto_tfm *tfm); void (*show)(struct seq_file *m, struct crypto_alg *alg); int (*report)(struct sk_buff *skb, struct crypto_alg *alg); void (*free)(struct crypto_instance *inst); #ifdef CONFIG_CRYPTO_STATS int (*report_stat)(struct sk_buff *skb, struct crypto_alg *alg); #endif unsigned int type; unsigned int maskclear; unsigned int maskset; unsigned int tfmsize; }; struct crypto_instance { struct crypto_alg alg; struct crypto_template *tmpl; union { /* Node in list of instances after registration. */ struct hlist_node list; /* List of attached spawns before registration. */ struct crypto_spawn *spawns; }; struct work_struct free_work; void *__ctx[] CRYPTO_MINALIGN_ATTR; }; struct crypto_template { struct list_head list; struct hlist_head instances; struct module *module; int (*create)(struct crypto_template *tmpl, struct rtattr **tb); char name[CRYPTO_MAX_ALG_NAME]; }; struct crypto_spawn { struct list_head list; struct crypto_alg *alg; union { /* Back pointer to instance after registration.*/ struct crypto_instance *inst; /* Spawn list pointer prior to registration. */ struct crypto_spawn *next; }; const struct crypto_type *frontend; u32 mask; bool dead; bool registered; }; struct crypto_queue { struct list_head list; struct list_head *backlog; unsigned int qlen; unsigned int max_qlen; }; struct scatter_walk { struct scatterlist *sg; unsigned int offset; }; struct crypto_attr_alg { char name[CRYPTO_MAX_ALG_NAME]; }; struct crypto_attr_type { u32 type; u32 mask; }; /* * Algorithm registration interface. */ int crypto_register_alg(struct crypto_alg *alg); void crypto_unregister_alg(struct crypto_alg *alg); int crypto_register_algs(struct crypto_alg *algs, int count); void crypto_unregister_algs(struct crypto_alg *algs, int count); void crypto_mod_put(struct crypto_alg *alg); int crypto_register_template(struct crypto_template *tmpl); int crypto_register_templates(struct crypto_template *tmpls, int count); void crypto_unregister_template(struct crypto_template *tmpl); void crypto_unregister_templates(struct crypto_template *tmpls, int count); struct crypto_template *crypto_lookup_template(const char *name); int crypto_register_instance(struct crypto_template *tmpl, struct crypto_instance *inst); void crypto_unregister_instance(struct crypto_instance *inst); int crypto_grab_spawn(struct crypto_spawn *spawn, struct crypto_instance *inst, const char *name, u32 type, u32 mask); void crypto_drop_spawn(struct crypto_spawn *spawn); struct crypto_tfm *crypto_spawn_tfm(struct crypto_spawn *spawn, u32 type, u32 mask); void *crypto_spawn_tfm2(struct crypto_spawn *spawn); struct crypto_attr_type *crypto_get_attr_type(struct rtattr **tb); int crypto_check_attr_type(struct rtattr **tb, u32 type, u32 *mask_ret); const char *crypto_attr_alg_name(struct rtattr *rta); int crypto_inst_setname(struct crypto_instance *inst, const char *name, struct crypto_alg *alg); void crypto_init_queue(struct crypto_queue *queue, unsigned int max_qlen); int crypto_enqueue_request(struct crypto_queue *queue, struct crypto_async_request *request); void crypto_enqueue_request_head(struct crypto_queue *queue, struct crypto_async_request *request); struct crypto_async_request *crypto_dequeue_request(struct crypto_queue *queue); static inline unsigned int crypto_queue_len(struct crypto_queue *queue) { return queue->qlen; } void crypto_inc(u8 *a, unsigned int size); static inline void *crypto_tfm_ctx(struct crypto_tfm *tfm) { return tfm->__crt_ctx; } static inline void *crypto_tfm_ctx_align(struct crypto_tfm *tfm, unsigned int align) { if (align <= crypto_tfm_ctx_alignment()) align = 1; return PTR_ALIGN(crypto_tfm_ctx(tfm), align); } static inline unsigned int crypto_dma_align(void) { return CRYPTO_DMA_ALIGN; } static inline unsigned int crypto_dma_padding(void) { return (crypto_dma_align() - 1) & ~(crypto_tfm_ctx_alignment() - 1); } static inline void *crypto_tfm_ctx_dma(struct crypto_tfm *tfm) { return crypto_tfm_ctx_align(tfm, crypto_dma_align()); } static inline struct crypto_instance *crypto_tfm_alg_instance( struct crypto_tfm *tfm) { return container_of(tfm->__crt_alg, struct crypto_instance, alg); } static inline void *crypto_instance_ctx(struct crypto_instance *inst) { return inst->__ctx; } static inline struct crypto_async_request *crypto_get_backlog( struct crypto_queue *queue) { return queue->backlog == &queue->list ? NULL : container_of(queue->backlog, struct crypto_async_request, list); } static inline u32 crypto_requires_off(struct crypto_attr_type *algt, u32 off) { return (algt->type ^ off) & algt->mask & off; } /* * When an algorithm uses another algorithm (e.g., if it's an instance of a * template), these are the flags that should always be set on the "outer" * algorithm if any "inner" algorithm has them set. */ #define CRYPTO_ALG_INHERITED_FLAGS \ (CRYPTO_ALG_ASYNC | CRYPTO_ALG_NEED_FALLBACK | \ CRYPTO_ALG_ALLOCATES_MEMORY) /* * Given the type and mask that specify the flags restrictions on a template * instance being created, return the mask that should be passed to * crypto_grab_*() (along with type=0) to honor any request the user made to * have any of the CRYPTO_ALG_INHERITED_FLAGS clear. */ static inline u32 crypto_algt_inherited_mask(struct crypto_attr_type *algt) { return crypto_requires_off(algt, CRYPTO_ALG_INHERITED_FLAGS); } int crypto_register_notifier(struct notifier_block *nb); int crypto_unregister_notifier(struct notifier_block *nb); /* Crypto notification events. */ enum { CRYPTO_MSG_ALG_REQUEST, CRYPTO_MSG_ALG_REGISTER, CRYPTO_MSG_ALG_LOADED, }; static inline void crypto_request_complete(struct crypto_async_request *req, int err) { req->complete(req->data, err); } static inline u32 crypto_tfm_alg_type(struct crypto_tfm *tfm) { return tfm->__crt_alg->cra_flags & CRYPTO_ALG_TYPE_MASK; } #endif /* _CRYPTO_ALGAPI_H */ |
160 99 14 161 9 2043 70 8 42 263 11 15 140 98 5 98 94 171 1 4 180 4 512 10 22 10388 45 24 139 330 125 3 126 327 26 21 21 217 79 88 90 110 4 4 430 8 435 4 101 125 9 4 4 6 13 9 78 21 12 18 27 53 53 53 53 9 30 100 69 34 175 191 86 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _NET_XFRM_H #define _NET_XFRM_H #include <linux/compiler.h> #include <linux/xfrm.h> #include <linux/spinlock.h> #include <linux/list.h> #include <linux/skbuff.h> #include <linux/socket.h> #include <linux/pfkeyv2.h> #include <linux/ipsec.h> #include <linux/in6.h> #include <linux/mutex.h> #include <linux/audit.h> #include <linux/slab.h> #include <linux/refcount.h> #include <linux/sockptr.h> #include <net/sock.h> #include <net/dst.h> #include <net/ip.h> #include <net/route.h> #include <net/ipv6.h> #include <net/ip6_fib.h> #include <net/flow.h> #include <net/gro_cells.h> #include <linux/interrupt.h> #ifdef CONFIG_XFRM_STATISTICS #include <net/snmp.h> #endif #define XFRM_PROTO_ESP 50 #define XFRM_PROTO_AH 51 #define XFRM_PROTO_COMP 108 #define XFRM_PROTO_IPIP 4 #define XFRM_PROTO_IPV6 41 #define XFRM_PROTO_ROUTING IPPROTO_ROUTING #define XFRM_PROTO_DSTOPTS IPPROTO_DSTOPTS #define XFRM_ALIGN4(len) (((len) + 3) & ~3) #define XFRM_ALIGN8(len) (((len) + 7) & ~7) #define MODULE_ALIAS_XFRM_MODE(family, encap) \ MODULE_ALIAS("xfrm-mode-" __stringify(family) "-" __stringify(encap)) #define MODULE_ALIAS_XFRM_TYPE(family, proto) \ MODULE_ALIAS("xfrm-type-" __stringify(family) "-" __stringify(proto)) #define MODULE_ALIAS_XFRM_OFFLOAD_TYPE(family, proto) \ MODULE_ALIAS("xfrm-offload-" __stringify(family) "-" __stringify(proto)) #ifdef CONFIG_XFRM_STATISTICS #define XFRM_INC_STATS(net, field) SNMP_INC_STATS((net)->mib.xfrm_statistics, field) #else #define XFRM_INC_STATS(net, field) ((void)(net)) #endif /* Organization of SPD aka "XFRM rules" ------------------------------------ Basic objects: - policy rule, struct xfrm_policy (=SPD entry) - bundle of transformations, struct dst_entry == struct xfrm_dst (=SA bundle) - instance of a transformer, struct xfrm_state (=SA) - template to clone xfrm_state, struct xfrm_tmpl SPD is plain linear list of xfrm_policy rules, ordered by priority. (To be compatible with existing pfkeyv2 implementations, many rules with priority of 0x7fffffff are allowed to exist and such rules are ordered in an unpredictable way, thanks to bsd folks.) Lookup is plain linear search until the first match with selector. If "action" is "block", then we prohibit the flow, otherwise: if "xfrms_nr" is zero, the flow passes untransformed. Otherwise, policy entry has list of up to XFRM_MAX_DEPTH transformations, described by templates xfrm_tmpl. Each template is resolved to a complete xfrm_state (see below) and we pack bundle of transformations to a dst_entry returned to requestor. dst -. xfrm .-> xfrm_state #1 |---. child .-> dst -. xfrm .-> xfrm_state #2 |---. child .-> dst -. xfrm .-> xfrm_state #3 |---. child .-> NULL Bundles are cached at xrfm_policy struct (field ->bundles). Resolution of xrfm_tmpl ----------------------- Template contains: 1. ->mode Mode: transport or tunnel 2. ->id.proto Protocol: AH/ESP/IPCOMP 3. ->id.daddr Remote tunnel endpoint, ignored for transport mode. Q: allow to resolve security gateway? 4. ->id.spi If not zero, static SPI. 5. ->saddr Local tunnel endpoint, ignored for transport mode. 6. ->algos List of allowed algos. Plain bitmask now. Q: ealgos, aalgos, calgos. What a mess... 7. ->share Sharing mode. Q: how to implement private sharing mode? To add struct sock* to flow id? Having this template we search through SAD searching for entries with appropriate mode/proto/algo, permitted by selector. If no appropriate entry found, it is requested from key manager. PROBLEMS: Q: How to find all the bundles referring to a physical path for PMTU discovery? Seems, dst should contain list of all parents... and enter to infinite locking hierarchy disaster. No! It is easier, we will not search for them, let them find us. We add genid to each dst plus pointer to genid of raw IP route, pmtu disc will update pmtu on raw IP route and increase its genid. dst_check() will see this for top level and trigger resyncing metrics. Plus, it will be made via sk->sk_dst_cache. Solved. */ struct xfrm_state_walk { struct list_head all; u8 state; u8 dying; u8 proto; u32 seq; struct xfrm_address_filter *filter; }; enum { XFRM_DEV_OFFLOAD_IN = 1, XFRM_DEV_OFFLOAD_OUT, XFRM_DEV_OFFLOAD_FWD, }; enum { XFRM_DEV_OFFLOAD_UNSPECIFIED, XFRM_DEV_OFFLOAD_CRYPTO, XFRM_DEV_OFFLOAD_PACKET, }; enum { XFRM_DEV_OFFLOAD_FLAG_ACQ = 1, }; struct xfrm_dev_offload { struct net_device *dev; netdevice_tracker dev_tracker; struct net_device *real_dev; unsigned long offload_handle; u8 dir : 2; u8 type : 2; u8 flags : 2; }; struct xfrm_mode { u8 encap; u8 family; u8 flags; }; /* Flags for xfrm_mode. */ enum { XFRM_MODE_FLAG_TUNNEL = 1, }; enum xfrm_replay_mode { XFRM_REPLAY_MODE_LEGACY, XFRM_REPLAY_MODE_BMP, XFRM_REPLAY_MODE_ESN, }; /* Full description of state of transformer. */ struct xfrm_state { possible_net_t xs_net; union { struct hlist_node gclist; struct hlist_node bydst; }; struct hlist_node bysrc; struct hlist_node byspi; struct hlist_node byseq; refcount_t refcnt; spinlock_t lock; struct xfrm_id id; struct xfrm_selector sel; struct xfrm_mark mark; u32 if_id; u32 tfcpad; u32 genid; /* Key manager bits */ struct xfrm_state_walk km; /* Parameters of this state. */ struct { u32 reqid; u8 mode; u8 replay_window; u8 aalgo, ealgo, calgo; u8 flags; u16 family; xfrm_address_t saddr; int header_len; int trailer_len; u32 extra_flags; struct xfrm_mark smark; } props; struct xfrm_lifetime_cfg lft; /* Data for transformer */ struct xfrm_algo_auth *aalg; struct xfrm_algo *ealg; struct xfrm_algo *calg; struct xfrm_algo_aead *aead; const char *geniv; /* mapping change rate limiting */ __be16 new_mapping_sport; u32 new_mapping; /* seconds */ u32 mapping_maxage; /* seconds for input SA */ /* Data for encapsulator */ struct xfrm_encap_tmpl *encap; struct sock __rcu *encap_sk; /* Data for care-of address */ xfrm_address_t *coaddr; /* IPComp needs an IPIP tunnel for handling uncompressed packets */ struct xfrm_state *tunnel; /* If a tunnel, number of users + 1 */ atomic_t tunnel_users; /* State for replay detection */ struct xfrm_replay_state replay; struct xfrm_replay_state_esn *replay_esn; /* Replay detection state at the time we sent the last notification */ struct xfrm_replay_state preplay; struct xfrm_replay_state_esn *preplay_esn; /* replay detection mode */ enum xfrm_replay_mode repl_mode; /* internal flag that only holds state for delayed aevent at the * moment */ u32 xflags; /* Replay detection notification settings */ u32 replay_maxage; u32 replay_maxdiff; /* Replay detection notification timer */ struct timer_list rtimer; /* Statistics */ struct xfrm_stats stats; struct xfrm_lifetime_cur curlft; struct hrtimer mtimer; struct xfrm_dev_offload xso; /* used to fix curlft->add_time when changing date */ long saved_tmo; /* Last used time */ time64_t lastused; struct page_frag xfrag; /* Reference to data common to all the instances of this * transformer. */ const struct xfrm_type *type; struct xfrm_mode inner_mode; struct xfrm_mode inner_mode_iaf; struct xfrm_mode outer_mode; const struct xfrm_type_offload *type_offload; /* Security context */ struct xfrm_sec_ctx *security; /* Private data of this transformer, format is opaque, * interpreted by xfrm_type methods. */ void *data; }; static inline struct net *xs_net(struct xfrm_state *x) { return read_pnet(&x->xs_net); } /* xflags - make enum if more show up */ #define XFRM_TIME_DEFER 1 #define XFRM_SOFT_EXPIRE 2 enum { XFRM_STATE_VOID, XFRM_STATE_ACQ, XFRM_STATE_VALID, XFRM_STATE_ERROR, XFRM_STATE_EXPIRED, XFRM_STATE_DEAD }; /* callback structure passed from either netlink or pfkey */ struct km_event { union { u32 hard; u32 proto; u32 byid; u32 aevent; u32 type; } data; u32 seq; u32 portid; u32 event; struct net *net; }; struct xfrm_if_decode_session_result { struct net *net; u32 if_id; }; struct xfrm_if_cb { bool (*decode_session)(struct sk_buff *skb, unsigned short family, struct xfrm_if_decode_session_result *res); }; void xfrm_if_register_cb(const struct xfrm_if_cb *ifcb); void xfrm_if_unregister_cb(void); struct net_device; struct xfrm_type; struct xfrm_dst; struct xfrm_policy_afinfo { struct dst_ops *dst_ops; struct dst_entry *(*dst_lookup)(struct net *net, int tos, int oif, const xfrm_address_t *saddr, const xfrm_address_t *daddr, u32 mark); int (*get_saddr)(struct net *net, int oif, xfrm_address_t *saddr, xfrm_address_t *daddr, u32 mark); int (*fill_dst)(struct xfrm_dst *xdst, struct net_device *dev, const struct flowi *fl); struct dst_entry *(*blackhole_route)(struct net *net, struct dst_entry *orig); }; int xfrm_policy_register_afinfo(const struct xfrm_policy_afinfo *afinfo, int family); void xfrm_policy_unregister_afinfo(const struct xfrm_policy_afinfo *afinfo); void km_policy_notify(struct xfrm_policy *xp, int dir, const struct km_event *c); void km_state_notify(struct xfrm_state *x, const struct km_event *c); struct xfrm_tmpl; int km_query(struct xfrm_state *x, struct xfrm_tmpl *t, struct xfrm_policy *pol); void km_state_expired(struct xfrm_state *x, int hard, u32 portid); int __xfrm_state_delete(struct xfrm_state *x); struct xfrm_state_afinfo { u8 family; u8 proto; const struct xfrm_type_offload *type_offload_esp; const struct xfrm_type *type_esp; const struct xfrm_type *type_ipip; const struct xfrm_type *type_ipip6; const struct xfrm_type *type_comp; const struct xfrm_type *type_ah; const struct xfrm_type *type_routing; const struct xfrm_type *type_dstopts; int (*output)(struct net *net, struct sock *sk, struct sk_buff *skb); int (*transport_finish)(struct sk_buff *skb, int async); void (*local_error)(struct sk_buff *skb, u32 mtu); }; int xfrm_state_register_afinfo(struct xfrm_state_afinfo *afinfo); int xfrm_state_unregister_afinfo(struct xfrm_state_afinfo *afinfo); struct xfrm_state_afinfo *xfrm_state_get_afinfo(unsigned int family); struct xfrm_state_afinfo *xfrm_state_afinfo_get_rcu(unsigned int family); struct xfrm_input_afinfo { u8 family; bool is_ipip; int (*callback)(struct sk_buff *skb, u8 protocol, int err); }; int xfrm_input_register_afinfo(const struct xfrm_input_afinfo *afinfo); int xfrm_input_unregister_afinfo(const struct xfrm_input_afinfo *afinfo); void xfrm_flush_gc(void); void xfrm_state_delete_tunnel(struct xfrm_state *x); struct xfrm_type { struct module *owner; u8 proto; u8 flags; #define XFRM_TYPE_NON_FRAGMENT 1 #define XFRM_TYPE_REPLAY_PROT 2 #define XFRM_TYPE_LOCAL_COADDR 4 #define XFRM_TYPE_REMOTE_COADDR 8 int (*init_state)(struct xfrm_state *x, struct netlink_ext_ack *extack); void (*destructor)(struct xfrm_state *); int (*input)(struct xfrm_state *, struct sk_buff *skb); int (*output)(struct xfrm_state *, struct sk_buff *pskb); int (*reject)(struct xfrm_state *, struct sk_buff *, const struct flowi *); }; int xfrm_register_type(const struct xfrm_type *type, unsigned short family); void xfrm_unregister_type(const struct xfrm_type *type, unsigned short family); struct xfrm_type_offload { struct module *owner; u8 proto; void (*encap)(struct xfrm_state *, struct sk_buff *pskb); int (*input_tail)(struct xfrm_state *x, struct sk_buff *skb); int (*xmit)(struct xfrm_state *, struct sk_buff *pskb, netdev_features_t features); }; int xfrm_register_type_offload(const struct xfrm_type_offload *type, unsigned short family); void xfrm_unregister_type_offload(const struct xfrm_type_offload *type, unsigned short family); static inline int xfrm_af2proto(unsigned int family) { switch(family) { case AF_INET: return IPPROTO_IPIP; case AF_INET6: return IPPROTO_IPV6; default: return 0; } } static inline const struct xfrm_mode *xfrm_ip2inner_mode(struct xfrm_state *x, int ipproto) { if ((ipproto == IPPROTO_IPIP && x->props.family == AF_INET) || (ipproto == IPPROTO_IPV6 && x->props.family == AF_INET6)) return &x->inner_mode; else return &x->inner_mode_iaf; } struct xfrm_tmpl { /* id in template is interpreted as: * daddr - destination of tunnel, may be zero for transport mode. * spi - zero to acquire spi. Not zero if spi is static, then * daddr must be fixed too. * proto - AH/ESP/IPCOMP */ struct xfrm_id id; /* Source address of tunnel. Ignored, if it is not a tunnel. */ xfrm_address_t saddr; unsigned short encap_family; u32 reqid; /* Mode: transport, tunnel etc. */ u8 mode; /* Sharing mode: unique, this session only, this user only etc. */ u8 share; /* May skip this transfomration if no SA is found */ u8 optional; /* Skip aalgos/ealgos/calgos checks. */ u8 allalgs; /* Bit mask of algos allowed for acquisition */ u32 aalgos; u32 ealgos; u32 calgos; }; #define XFRM_MAX_DEPTH 6 #define XFRM_MAX_OFFLOAD_DEPTH 1 struct xfrm_policy_walk_entry { struct list_head all; u8 dead; }; struct xfrm_policy_walk { struct xfrm_policy_walk_entry walk; u8 type; u32 seq; }; struct xfrm_policy_queue { struct sk_buff_head hold_queue; struct timer_list hold_timer; unsigned long timeout; }; struct xfrm_policy { possible_net_t xp_net; struct hlist_node bydst; struct hlist_node byidx; /* This lock only affects elements except for entry. */ rwlock_t lock; refcount_t refcnt; u32 pos; struct timer_list timer; atomic_t genid; u32 priority; u32 index; u32 if_id; struct xfrm_mark mark; struct xfrm_selector selector; struct xfrm_lifetime_cfg lft; struct xfrm_lifetime_cur curlft; struct xfrm_policy_walk_entry walk; struct xfrm_policy_queue polq; bool bydst_reinsert; u8 type; u8 action; u8 flags; u8 xfrm_nr; u16 family; struct xfrm_sec_ctx *security; struct xfrm_tmpl xfrm_vec[XFRM_MAX_DEPTH]; struct hlist_node bydst_inexact_list; struct rcu_head rcu; struct xfrm_dev_offload xdo; }; static inline struct net *xp_net(const struct xfrm_policy *xp) { return read_pnet(&xp->xp_net); } struct xfrm_kmaddress { xfrm_address_t local; xfrm_address_t remote; u32 reserved; u16 family; }; struct xfrm_migrate { xfrm_address_t old_daddr; xfrm_address_t old_saddr; xfrm_address_t new_daddr; xfrm_address_t new_saddr; u8 proto; u8 mode; u16 reserved; u32 reqid; u16 old_family; u16 new_family; }; #define XFRM_KM_TIMEOUT 30 /* what happened */ #define XFRM_REPLAY_UPDATE XFRM_AE_CR #define XFRM_REPLAY_TIMEOUT XFRM_AE_CE /* default aevent timeout in units of 100ms */ #define XFRM_AE_ETIME 10 /* Async Event timer multiplier */ #define XFRM_AE_ETH_M 10 /* default seq threshold size */ #define XFRM_AE_SEQT_SIZE 2 struct xfrm_mgr { struct list_head list; int (*notify)(struct xfrm_state *x, const struct km_event *c); int (*acquire)(struct xfrm_state *x, struct xfrm_tmpl *, struct xfrm_policy *xp); struct xfrm_policy *(*compile_policy)(struct sock *sk, int opt, u8 *data, int len, int *dir); int (*new_mapping)(struct xfrm_state *x, xfrm_address_t *ipaddr, __be16 sport); int (*notify_policy)(struct xfrm_policy *x, int dir, const struct km_event *c); int (*report)(struct net *net, u8 proto, struct xfrm_selector *sel, xfrm_address_t *addr); int (*migrate)(const struct xfrm_selector *sel, u8 dir, u8 type, const struct xfrm_migrate *m, int num_bundles, const struct xfrm_kmaddress *k, const struct xfrm_encap_tmpl *encap); bool (*is_alive)(const struct km_event *c); }; void xfrm_register_km(struct xfrm_mgr *km); void xfrm_unregister_km(struct xfrm_mgr *km); struct xfrm_tunnel_skb_cb { union { struct inet_skb_parm h4; struct inet6_skb_parm h6; } header; union { struct ip_tunnel *ip4; struct ip6_tnl *ip6; } tunnel; }; #define XFRM_TUNNEL_SKB_CB(__skb) ((struct xfrm_tunnel_skb_cb *)&((__skb)->cb[0])) /* * This structure is used for the duration where packets are being * transformed by IPsec. As soon as the packet leaves IPsec the * area beyond the generic IP part may be overwritten. */ struct xfrm_skb_cb { struct xfrm_tunnel_skb_cb header; /* Sequence number for replay protection. */ union { struct { __u32 low; __u32 hi; } output; struct { __be32 low; __be32 hi; } input; } seq; }; #define XFRM_SKB_CB(__skb) ((struct xfrm_skb_cb *)&((__skb)->cb[0])) /* * This structure is used by the afinfo prepare_input/prepare_output functions * to transmit header information to the mode input/output functions. */ struct xfrm_mode_skb_cb { struct xfrm_tunnel_skb_cb header; /* Copied from header for IPv4, always set to zero and DF for IPv6. */ __be16 id; __be16 frag_off; /* IP header length (excluding options or extension headers). */ u8 ihl; /* TOS for IPv4, class for IPv6. */ u8 tos; /* TTL for IPv4, hop limitfor IPv6. */ u8 ttl; /* Protocol for IPv4, NH for IPv6. */ u8 protocol; /* Option length for IPv4, zero for IPv6. */ u8 optlen; /* Used by IPv6 only, zero for IPv4. */ u8 flow_lbl[3]; }; #define XFRM_MODE_SKB_CB(__skb) ((struct xfrm_mode_skb_cb *)&((__skb)->cb[0])) /* * This structure is used by the input processing to locate the SPI and * related information. */ struct xfrm_spi_skb_cb { struct xfrm_tunnel_skb_cb header; unsigned int daddroff; unsigned int family; __be32 seq; }; #define XFRM_SPI_SKB_CB(__skb) ((struct xfrm_spi_skb_cb *)&((__skb)->cb[0])) #ifdef CONFIG_AUDITSYSCALL static inline struct audit_buffer *xfrm_audit_start(const char *op) { struct audit_buffer *audit_buf = NULL; if (audit_enabled == AUDIT_OFF) return NULL; audit_buf = audit_log_start(audit_context(), GFP_ATOMIC, AUDIT_MAC_IPSEC_EVENT); if (audit_buf == NULL) return NULL; audit_log_format(audit_buf, "op=%s", op); return audit_buf; } static inline void xfrm_audit_helper_usrinfo(bool task_valid, struct audit_buffer *audit_buf) { const unsigned int auid = from_kuid(&init_user_ns, task_valid ? audit_get_loginuid(current) : INVALID_UID); const unsigned int ses = task_valid ? audit_get_sessionid(current) : AUDIT_SID_UNSET; audit_log_format(audit_buf, " auid=%u ses=%u", auid, ses); audit_log_task_context(audit_buf); } void xfrm_audit_policy_add(struct xfrm_policy *xp, int result, bool task_valid); void xfrm_audit_policy_delete(struct xfrm_policy *xp, int result, bool task_valid); void xfrm_audit_state_add(struct xfrm_state *x, int result, bool task_valid); void xfrm_audit_state_delete(struct xfrm_state *x, int result, bool task_valid); void xfrm_audit_state_replay_overflow(struct xfrm_state *x, struct sk_buff *skb); void xfrm_audit_state_replay(struct xfrm_state *x, struct sk_buff *skb, __be32 net_seq); void xfrm_audit_state_notfound_simple(struct sk_buff *skb, u16 family); void xfrm_audit_state_notfound(struct sk_buff *skb, u16 family, __be32 net_spi, __be32 net_seq); void xfrm_audit_state_icvfail(struct xfrm_state *x, struct sk_buff *skb, u8 proto); #else static inline void xfrm_audit_policy_add(struct xfrm_policy *xp, int result, bool task_valid) { } static inline void xfrm_audit_policy_delete(struct xfrm_policy *xp, int result, bool task_valid) { } static inline void xfrm_audit_state_add(struct xfrm_state *x, int result, bool task_valid) { } static inline void xfrm_audit_state_delete(struct xfrm_state *x, int result, bool task_valid) { } static inline void xfrm_audit_state_replay_overflow(struct xfrm_state *x, struct sk_buff *skb) { } static inline void xfrm_audit_state_replay(struct xfrm_state *x, struct sk_buff *skb, __be32 net_seq) { } static inline void xfrm_audit_state_notfound_simple(struct sk_buff *skb, u16 family) { } static inline void xfrm_audit_state_notfound(struct sk_buff *skb, u16 family, __be32 net_spi, __be32 net_seq) { } static inline void xfrm_audit_state_icvfail(struct xfrm_state *x, struct sk_buff *skb, u8 proto) { } #endif /* CONFIG_AUDITSYSCALL */ static inline void xfrm_pol_hold(struct xfrm_policy *policy) { if (likely(policy != NULL)) refcount_inc(&policy->refcnt); } void xfrm_policy_destroy(struct xfrm_policy *policy); static inline void xfrm_pol_put(struct xfrm_policy *policy) { if (refcount_dec_and_test(&policy->refcnt)) xfrm_policy_destroy(policy); } static inline void xfrm_pols_put(struct xfrm_policy **pols, int npols) { int i; for (i = npols - 1; i >= 0; --i) xfrm_pol_put(pols[i]); } void __xfrm_state_destroy(struct xfrm_state *, bool); static inline void __xfrm_state_put(struct xfrm_state *x) { refcount_dec(&x->refcnt); } static inline void xfrm_state_put(struct xfrm_state *x) { if (refcount_dec_and_test(&x->refcnt)) __xfrm_state_destroy(x, false); } static inline void xfrm_state_put_sync(struct xfrm_state *x) { if (refcount_dec_and_test(&x->refcnt)) __xfrm_state_destroy(x, true); } static inline void xfrm_state_hold(struct xfrm_state *x) { refcount_inc(&x->refcnt); } static inline bool addr_match(const void *token1, const void *token2, unsigned int prefixlen) { const __be32 *a1 = token1; const __be32 *a2 = token2; unsigned int pdw; unsigned int pbi; pdw = prefixlen >> 5; /* num of whole u32 in prefix */ pbi = prefixlen & 0x1f; /* num of bits in incomplete u32 in prefix */ if (pdw) if (memcmp(a1, a2, pdw << 2)) return false; if (pbi) { __be32 mask; mask = htonl((0xffffffff) << (32 - pbi)); if ((a1[pdw] ^ a2[pdw]) & mask) return false; } return true; } static inline bool addr4_match(__be32 a1, __be32 a2, u8 prefixlen) { /* C99 6.5.7 (3): u32 << 32 is undefined behaviour */ if (sizeof(long) == 4 && prefixlen == 0) return true; return !((a1 ^ a2) & htonl(~0UL << (32 - prefixlen))); } static __inline__ __be16 xfrm_flowi_sport(const struct flowi *fl, const union flowi_uli *uli) { __be16 port; switch(fl->flowi_proto) { case IPPROTO_TCP: case IPPROTO_UDP: case IPPROTO_UDPLITE: case IPPROTO_SCTP: port = uli->ports.sport; break; case IPPROTO_ICMP: case IPPROTO_ICMPV6: port = htons(uli->icmpt.type); break; case IPPROTO_MH: port = htons(uli->mht.type); break; case IPPROTO_GRE: port = htons(ntohl(uli->gre_key) >> 16); break; default: port = 0; /*XXX*/ } return port; } static __inline__ __be16 xfrm_flowi_dport(const struct flowi *fl, const union flowi_uli *uli) { __be16 port; switch(fl->flowi_proto) { case IPPROTO_TCP: case IPPROTO_UDP: case IPPROTO_UDPLITE: case IPPROTO_SCTP: port = uli->ports.dport; break; case IPPROTO_ICMP: case IPPROTO_ICMPV6: port = htons(uli->icmpt.code); break; case IPPROTO_GRE: port = htons(ntohl(uli->gre_key) & 0xffff); break; default: port = 0; /*XXX*/ } return port; } bool xfrm_selector_match(const struct xfrm_selector *sel, const struct flowi *fl, unsigned short family); #ifdef CONFIG_SECURITY_NETWORK_XFRM /* If neither has a context --> match * Otherwise, both must have a context and the sids, doi, alg must match */ static inline bool xfrm_sec_ctx_match(struct xfrm_sec_ctx *s1, struct xfrm_sec_ctx *s2) { return ((!s1 && !s2) || (s1 && s2 && (s1->ctx_sid == s2->ctx_sid) && (s1->ctx_doi == s2->ctx_doi) && (s1->ctx_alg == s2->ctx_alg))); } #else static inline bool xfrm_sec_ctx_match(struct xfrm_sec_ctx *s1, struct xfrm_sec_ctx *s2) { return true; } #endif /* A struct encoding bundle of transformations to apply to some set of flow. * * xdst->child points to the next element of bundle. * dst->xfrm points to an instanse of transformer. * * Due to unfortunate limitations of current routing cache, which we * have no time to fix, it mirrors struct rtable and bound to the same * routing key, including saddr,daddr. However, we can have many of * bundles differing by session id. All the bundles grow from a parent * policy rule. */ struct xfrm_dst { union { struct dst_entry dst; struct rtable rt; struct rt6_info rt6; } u; struct dst_entry *route; struct dst_entry *child; struct dst_entry *path; struct xfrm_policy *pols[XFRM_POLICY_TYPE_MAX]; int num_pols, num_xfrms; u32 xfrm_genid; u32 policy_genid; u32 route_mtu_cached; u32 child_mtu_cached; u32 route_cookie; u32 path_cookie; }; static inline struct dst_entry *xfrm_dst_path(const struct dst_entry *dst) { #ifdef CONFIG_XFRM if (dst->xfrm || (dst->flags & DST_XFRM_QUEUE)) { const struct xfrm_dst *xdst = (const struct xfrm_dst *) dst; return xdst->path; } #endif return (struct dst_entry *) dst; } static inline struct dst_entry *xfrm_dst_child(const struct dst_entry *dst) { #ifdef CONFIG_XFRM if (dst->xfrm || (dst->flags & DST_XFRM_QUEUE)) { struct xfrm_dst *xdst = (struct xfrm_dst *) dst; return xdst->child; } #endif return NULL; } #ifdef CONFIG_XFRM static inline void xfrm_dst_set_child(struct xfrm_dst *xdst, struct dst_entry *child) { xdst->child = child; } static inline void xfrm_dst_destroy(struct xfrm_dst *xdst) { xfrm_pols_put(xdst->pols, xdst->num_pols); dst_release(xdst->route); if (likely(xdst->u.dst.xfrm)) xfrm_state_put(xdst->u.dst.xfrm); } #endif void xfrm_dst_ifdown(struct dst_entry *dst, struct net_device *dev); struct xfrm_if_parms { int link; /* ifindex of underlying L2 interface */ u32 if_id; /* interface identifyer */ bool collect_md; }; struct xfrm_if { struct xfrm_if __rcu *next; /* next interface in list */ struct net_device *dev; /* virtual device associated with interface */ struct net *net; /* netns for packet i/o */ struct xfrm_if_parms p; /* interface parms */ struct gro_cells gro_cells; }; struct xfrm_offload { /* Output sequence number for replay protection on offloading. */ struct { __u32 low; __u32 hi; } seq; __u32 flags; #define SA_DELETE_REQ 1 #define CRYPTO_DONE 2 #define CRYPTO_NEXT_DONE 4 #define CRYPTO_FALLBACK 8 #define XFRM_GSO_SEGMENT 16 #define XFRM_GRO 32 /* 64 is free */ #define XFRM_DEV_RESUME 128 #define XFRM_XMIT 256 __u32 status; #define CRYPTO_SUCCESS 1 #define CRYPTO_GENERIC_ERROR 2 #define CRYPTO_TRANSPORT_AH_AUTH_FAILED 4 #define CRYPTO_TRANSPORT_ESP_AUTH_FAILED 8 #define CRYPTO_TUNNEL_AH_AUTH_FAILED 16 #define CRYPTO_TUNNEL_ESP_AUTH_FAILED 32 #define CRYPTO_INVALID_PACKET_SYNTAX 64 #define CRYPTO_INVALID_PROTOCOL 128 __u8 proto; __u8 inner_ipproto; }; struct sec_path { int len; int olen; int verified_cnt; struct xfrm_state *xvec[XFRM_MAX_DEPTH]; struct xfrm_offload ovec[XFRM_MAX_OFFLOAD_DEPTH]; }; struct sec_path *secpath_set(struct sk_buff *skb); static inline void secpath_reset(struct sk_buff *skb) { #ifdef CONFIG_XFRM skb_ext_del(skb, SKB_EXT_SEC_PATH); #endif } static inline int xfrm_addr_any(const xfrm_address_t *addr, unsigned short family) { switch (family) { case AF_INET: return addr->a4 == 0; case AF_INET6: return ipv6_addr_any(&addr->in6); } return 0; } static inline int __xfrm4_state_addr_cmp(const struct xfrm_tmpl *tmpl, const struct xfrm_state *x) { return (tmpl->saddr.a4 && tmpl->saddr.a4 != x->props.saddr.a4); } static inline int __xfrm6_state_addr_cmp(const struct xfrm_tmpl *tmpl, const struct xfrm_state *x) { return (!ipv6_addr_any((struct in6_addr*)&tmpl->saddr) && !ipv6_addr_equal((struct in6_addr *)&tmpl->saddr, (struct in6_addr*)&x->props.saddr)); } static inline int xfrm_state_addr_cmp(const struct xfrm_tmpl *tmpl, const struct xfrm_state *x, unsigned short family) { switch (family) { case AF_INET: return __xfrm4_state_addr_cmp(tmpl, x); case AF_INET6: return __xfrm6_state_addr_cmp(tmpl, x); } return !0; } #ifdef CONFIG_XFRM static inline struct xfrm_state *xfrm_input_state(struct sk_buff *skb) { struct sec_path *sp = skb_sec_path(skb); return sp->xvec[sp->len - 1]; } #endif static inline struct xfrm_offload *xfrm_offload(struct sk_buff *skb) { #ifdef CONFIG_XFRM struct sec_path *sp = skb_sec_path(skb); if (!sp || !sp->olen || sp->len != sp->olen) return NULL; return &sp->ovec[sp->olen - 1]; #else return NULL; #endif } #ifdef CONFIG_XFRM int __xfrm_policy_check(struct sock *, int dir, struct sk_buff *skb, unsigned short family); static inline bool __xfrm_check_nopolicy(struct net *net, struct sk_buff *skb, int dir) { if (!net->xfrm.policy_count[dir] && !secpath_exists(skb)) return net->xfrm.policy_default[dir] == XFRM_USERPOLICY_ACCEPT; return false; } static inline bool __xfrm_check_dev_nopolicy(struct sk_buff *skb, int dir, unsigned short family) { if (dir != XFRM_POLICY_OUT && family == AF_INET) { /* same dst may be used for traffic originating from * devices with different policy settings. */ return IPCB(skb)->flags & IPSKB_NOPOLICY; } return skb_dst(skb) && (skb_dst(skb)->flags & DST_NOPOLICY); } static inline int __xfrm_policy_check2(struct sock *sk, int dir, struct sk_buff *skb, unsigned int family, int reverse) { struct net *net = dev_net(skb->dev); int ndir = dir | (reverse ? XFRM_POLICY_MASK + 1 : 0); struct xfrm_offload *xo = xfrm_offload(skb); struct xfrm_state *x; if (sk && sk->sk_policy[XFRM_POLICY_IN]) return __xfrm_policy_check(sk, ndir, skb, family); if (xo) { x = xfrm_input_state(skb); if (x->xso.type == XFRM_DEV_OFFLOAD_PACKET) return (xo->flags & CRYPTO_DONE) && (xo->status & CRYPTO_SUCCESS); } return __xfrm_check_nopolicy(net, skb, dir) || __xfrm_check_dev_nopolicy(skb, dir, family) || __xfrm_policy_check(sk, ndir, skb, family); } static inline int xfrm_policy_check(struct sock *sk, int dir, struct sk_buff *skb, unsigned short family) { return __xfrm_policy_check2(sk, dir, skb, family, 0); } static inline int xfrm4_policy_check(struct sock *sk, int dir, struct sk_buff *skb) { return xfrm_policy_check(sk, dir, skb, AF_INET); } static inline int xfrm6_policy_check(struct sock *sk, int dir, struct sk_buff *skb) { return xfrm_policy_check(sk, dir, skb, AF_INET6); } static inline int xfrm4_policy_check_reverse(struct sock *sk, int dir, struct sk_buff *skb) { return __xfrm_policy_check2(sk, dir, skb, AF_INET, 1); } static inline int xfrm6_policy_check_reverse(struct sock *sk, int dir, struct sk_buff *skb) { return __xfrm_policy_check2(sk, dir, skb, AF_INET6, 1); } int __xfrm_decode_session(struct net *net, struct sk_buff *skb, struct flowi *fl, unsigned int family, int reverse); static inline int xfrm_decode_session(struct net *net, struct sk_buff *skb, struct flowi *fl, unsigned int family) { return __xfrm_decode_session(net, skb, fl, family, 0); } static inline int xfrm_decode_session_reverse(struct net *net, struct sk_buff *skb, struct flowi *fl, unsigned int family) { return __xfrm_decode_session(net, skb, fl, family, 1); } int __xfrm_route_forward(struct sk_buff *skb, unsigned short family); static inline int xfrm_route_forward(struct sk_buff *skb, unsigned short family) { struct net *net = dev_net(skb->dev); if (!net->xfrm.policy_count[XFRM_POLICY_OUT] && net->xfrm.policy_default[XFRM_POLICY_OUT] == XFRM_USERPOLICY_ACCEPT) return true; return (skb_dst(skb)->flags & DST_NOXFRM) || __xfrm_route_forward(skb, family); } static inline int xfrm4_route_forward(struct sk_buff *skb) { return xfrm_route_forward(skb, AF_INET); } static inline int xfrm6_route_forward(struct sk_buff *skb) { return xfrm_route_forward(skb, AF_INET6); } int __xfrm_sk_clone_policy(struct sock *sk, const struct sock *osk); static inline int xfrm_sk_clone_policy(struct sock *sk, const struct sock *osk) { if (!sk_fullsock(osk)) return 0; sk->sk_policy[0] = NULL; sk->sk_policy[1] = NULL; if (unlikely(osk->sk_policy[0] || osk->sk_policy[1])) return __xfrm_sk_clone_policy(sk, osk); return 0; } int xfrm_policy_delete(struct xfrm_policy *pol, int dir); static inline void xfrm_sk_free_policy(struct sock *sk) { struct xfrm_policy *pol; pol = rcu_dereference_protected(sk->sk_policy[0], 1); if (unlikely(pol != NULL)) { xfrm_policy_delete(pol, XFRM_POLICY_MAX); sk->sk_policy[0] = NULL; } pol = rcu_dereference_protected(sk->sk_policy[1], 1); if (unlikely(pol != NULL)) { xfrm_policy_delete(pol, XFRM_POLICY_MAX+1); sk->sk_policy[1] = NULL; } } #else static inline void xfrm_sk_free_policy(struct sock *sk) {} static inline int xfrm_sk_clone_policy(struct sock *sk, const struct sock *osk) { return 0; } static inline int xfrm6_route_forward(struct sk_buff *skb) { return 1; } static inline int xfrm4_route_forward(struct sk_buff *skb) { return 1; } static inline int xfrm6_policy_check(struct sock *sk, int dir, struct sk_buff *skb) { return 1; } static inline int xfrm4_policy_check(struct sock *sk, int dir, struct sk_buff *skb) { return 1; } static inline int xfrm_policy_check(struct sock *sk, int dir, struct sk_buff *skb, unsigned short family) { return 1; } static inline int xfrm_decode_session_reverse(struct net *net, struct sk_buff *skb, struct flowi *fl, unsigned int family) { return -ENOSYS; } static inline int xfrm4_policy_check_reverse(struct sock *sk, int dir, struct sk_buff *skb) { return 1; } static inline int xfrm6_policy_check_reverse(struct sock *sk, int dir, struct sk_buff *skb) { return 1; } #endif static __inline__ xfrm_address_t *xfrm_flowi_daddr(const struct flowi *fl, unsigned short family) { switch (family){ case AF_INET: return (xfrm_address_t *)&fl->u.ip4.daddr; case AF_INET6: return (xfrm_address_t *)&fl->u.ip6.daddr; } return NULL; } static __inline__ xfrm_address_t *xfrm_flowi_saddr(const struct flowi *fl, unsigned short family) { switch (family){ case AF_INET: return (xfrm_address_t *)&fl->u.ip4.saddr; case AF_INET6: return (xfrm_address_t *)&fl->u.ip6.saddr; } return NULL; } static __inline__ void xfrm_flowi_addr_get(const struct flowi *fl, xfrm_address_t *saddr, xfrm_address_t *daddr, unsigned short family) { switch(family) { case AF_INET: memcpy(&saddr->a4, &fl->u.ip4.saddr, sizeof(saddr->a4)); memcpy(&daddr->a4, &fl->u.ip4.daddr, sizeof(daddr->a4)); break; case AF_INET6: saddr->in6 = fl->u.ip6.saddr; daddr->in6 = fl->u.ip6.daddr; break; } } static __inline__ int __xfrm4_state_addr_check(const struct xfrm_state *x, const xfrm_address_t *daddr, const xfrm_address_t *saddr) { if (daddr->a4 == x->id.daddr.a4 && (saddr->a4 == x->props.saddr.a4 || !saddr->a4 || !x->props.saddr.a4)) return 1; return 0; } static __inline__ int __xfrm6_state_addr_check(const struct xfrm_state *x, const xfrm_address_t *daddr, const xfrm_address_t *saddr) { if (ipv6_addr_equal((struct in6_addr *)daddr, (struct in6_addr *)&x->id.daddr) && (ipv6_addr_equal((struct in6_addr *)saddr, (struct in6_addr *)&x->props.saddr) || ipv6_addr_any((struct in6_addr *)saddr) || ipv6_addr_any((struct in6_addr *)&x->props.saddr))) return 1; return 0; } static __inline__ int xfrm_state_addr_check(const struct xfrm_state *x, const xfrm_address_t *daddr, const xfrm_address_t *saddr, unsigned short family) { switch (family) { case AF_INET: return __xfrm4_state_addr_check(x, daddr, saddr); case AF_INET6: return __xfrm6_state_addr_check(x, daddr, saddr); } return 0; } static __inline__ int xfrm_state_addr_flow_check(const struct xfrm_state *x, const struct flowi *fl, unsigned short family) { switch (family) { case AF_INET: return __xfrm4_state_addr_check(x, (const xfrm_address_t *)&fl->u.ip4.daddr, (const xfrm_address_t *)&fl->u.ip4.saddr); case AF_INET6: return __xfrm6_state_addr_check(x, (const xfrm_address_t *)&fl->u.ip6.daddr, (const xfrm_address_t *)&fl->u.ip6.saddr); } return 0; } static inline int xfrm_state_kern(const struct xfrm_state *x) { return atomic_read(&x->tunnel_users); } static inline bool xfrm_id_proto_valid(u8 proto) { switch (proto) { case IPPROTO_AH: case IPPROTO_ESP: case IPPROTO_COMP: #if IS_ENABLED(CONFIG_IPV6) case IPPROTO_ROUTING: case IPPROTO_DSTOPTS: #endif return true; default: return false; } } /* IPSEC_PROTO_ANY only matches 3 IPsec protocols, 0 could match all. */ static inline int xfrm_id_proto_match(u8 proto, u8 userproto) { return (!userproto || proto == userproto || (userproto == IPSEC_PROTO_ANY && (proto == IPPROTO_AH || proto == IPPROTO_ESP || proto == IPPROTO_COMP))); } /* * xfrm algorithm information */ struct xfrm_algo_aead_info { char *geniv; u16 icv_truncbits; }; struct xfrm_algo_auth_info { u16 icv_truncbits; u16 icv_fullbits; }; struct xfrm_algo_encr_info { char *geniv; u16 blockbits; u16 defkeybits; }; struct xfrm_algo_comp_info { u16 threshold; }; struct xfrm_algo_desc { char *name; char *compat; u8 available:1; u8 pfkey_supported:1; union { struct xfrm_algo_aead_info aead; struct xfrm_algo_auth_info auth; struct xfrm_algo_encr_info encr; struct xfrm_algo_comp_info comp; } uinfo; struct sadb_alg desc; }; /* XFRM protocol handlers. */ struct xfrm4_protocol { int (*handler)(struct sk_buff *skb); int (*input_handler)(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type); int (*cb_handler)(struct sk_buff *skb, int err); int (*err_handler)(struct sk_buff *skb, u32 info); struct xfrm4_protocol __rcu *next; int priority; }; struct xfrm6_protocol { int (*handler)(struct sk_buff *skb); int (*input_handler)(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type); int (*cb_handler)(struct sk_buff *skb, int err); int (*err_handler)(struct sk_buff *skb, struct inet6_skb_parm *opt, u8 type, u8 code, int offset, __be32 info); struct xfrm6_protocol __rcu *next; int priority; }; /* XFRM tunnel handlers. */ struct xfrm_tunnel { int (*handler)(struct sk_buff *skb); int (*cb_handler)(struct sk_buff *skb, int err); int (*err_handler)(struct sk_buff *skb, u32 info); struct xfrm_tunnel __rcu *next; int priority; }; struct xfrm6_tunnel { int (*handler)(struct sk_buff *skb); int (*cb_handler)(struct sk_buff *skb, int err); int (*err_handler)(struct sk_buff *skb, struct inet6_skb_parm *opt, u8 type, u8 code, int offset, __be32 info); struct xfrm6_tunnel __rcu *next; int priority; }; void xfrm_init(void); void xfrm4_init(void); int xfrm_state_init(struct net *net); void xfrm_state_fini(struct net *net); void xfrm4_state_init(void); void xfrm4_protocol_init(void); #ifdef CONFIG_XFRM int xfrm6_init(void); void xfrm6_fini(void); int xfrm6_state_init(void); void xfrm6_state_fini(void); int xfrm6_protocol_init(void); void xfrm6_protocol_fini(void); #else static inline int xfrm6_init(void) { return 0; } static inline void xfrm6_fini(void) { ; } #endif #ifdef CONFIG_XFRM_STATISTICS int xfrm_proc_init(struct net *net); void xfrm_proc_fini(struct net *net); #endif int xfrm_sysctl_init(struct net *net); #ifdef CONFIG_SYSCTL void xfrm_sysctl_fini(struct net *net); #else static inline void xfrm_sysctl_fini(struct net *net) { } #endif void xfrm_state_walk_init(struct xfrm_state_walk *walk, u8 proto, struct xfrm_address_filter *filter); int xfrm_state_walk(struct net *net, struct xfrm_state_walk *walk, int (*func)(struct xfrm_state *, int, void*), void *); void xfrm_state_walk_done(struct xfrm_state_walk *walk, struct net *net); struct xfrm_state *xfrm_state_alloc(struct net *net); void xfrm_state_free(struct xfrm_state *x); struct xfrm_state *xfrm_state_find(const xfrm_address_t *daddr, const xfrm_address_t *saddr, const struct flowi *fl, struct xfrm_tmpl *tmpl, struct xfrm_policy *pol, int *err, unsigned short family, u32 if_id); struct xfrm_state *xfrm_stateonly_find(struct net *net, u32 mark, u32 if_id, xfrm_address_t *daddr, xfrm_address_t *saddr, unsigned short family, u8 mode, u8 proto, u32 reqid); struct xfrm_state *xfrm_state_lookup_byspi(struct net *net, __be32 spi, unsigned short family); int xfrm_state_check_expire(struct xfrm_state *x); #ifdef CONFIG_XFRM_OFFLOAD static inline void xfrm_dev_state_update_curlft(struct xfrm_state *x) { struct xfrm_dev_offload *xdo = &x->xso; struct net_device *dev = xdo->dev; if (x->xso.type != XFRM_DEV_OFFLOAD_PACKET) return; if (dev && dev->xfrmdev_ops && dev->xfrmdev_ops->xdo_dev_state_update_curlft) dev->xfrmdev_ops->xdo_dev_state_update_curlft(x); } #else static inline void xfrm_dev_state_update_curlft(struct xfrm_state *x) {} #endif void xfrm_state_insert(struct xfrm_state *x); int xfrm_state_add(struct xfrm_state *x); int xfrm_state_update(struct xfrm_state *x); struct xfrm_state *xfrm_state_lookup(struct net *net, u32 mark, const xfrm_address_t *daddr, __be32 spi, u8 proto, unsigned short family); struct xfrm_state *xfrm_state_lookup_byaddr(struct net *net, u32 mark, const xfrm_address_t *daddr, const xfrm_address_t *saddr, u8 proto, unsigned short family); #ifdef CONFIG_XFRM_SUB_POLICY void xfrm_tmpl_sort(struct xfrm_tmpl **dst, struct xfrm_tmpl **src, int n, unsigned short family); void xfrm_state_sort(struct xfrm_state **dst, struct xfrm_state **src, int n, unsigned short family); #else static inline void xfrm_tmpl_sort(struct xfrm_tmpl **d, struct xfrm_tmpl **s, int n, unsigned short family) { } static inline void xfrm_state_sort(struct xfrm_state **d, struct xfrm_state **s, int n, unsigned short family) { } #endif struct xfrmk_sadinfo { u32 sadhcnt; /* current hash bkts */ u32 sadhmcnt; /* max allowed hash bkts */ u32 sadcnt; /* current running count */ }; struct xfrmk_spdinfo { u32 incnt; u32 outcnt; u32 fwdcnt; u32 inscnt; u32 outscnt; u32 fwdscnt; u32 spdhcnt; u32 spdhmcnt; }; struct xfrm_state *xfrm_find_acq_byseq(struct net *net, u32 mark, u32 seq); int xfrm_state_delete(struct xfrm_state *x); int xfrm_state_flush(struct net *net, u8 proto, bool task_valid, bool sync); int xfrm_dev_state_flush(struct net *net, struct net_device *dev, bool task_valid); int xfrm_dev_policy_flush(struct net *net, struct net_device *dev, bool task_valid); void xfrm_sad_getinfo(struct net *net, struct xfrmk_sadinfo *si); void xfrm_spd_getinfo(struct net *net, struct xfrmk_spdinfo *si); u32 xfrm_replay_seqhi(struct xfrm_state *x, __be32 net_seq); int xfrm_init_replay(struct xfrm_state *x, struct netlink_ext_ack *extack); u32 xfrm_state_mtu(struct xfrm_state *x, int mtu); int __xfrm_init_state(struct xfrm_state *x, bool init_replay, bool offload, struct netlink_ext_ack *extack); int xfrm_init_state(struct xfrm_state *x); int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type); int xfrm_input_resume(struct sk_buff *skb, int nexthdr); int xfrm_trans_queue_net(struct net *net, struct sk_buff *skb, int (*finish)(struct net *, struct sock *, struct sk_buff *)); int xfrm_trans_queue(struct sk_buff *skb, int (*finish)(struct net *, struct sock *, struct sk_buff *)); int xfrm_output_resume(struct sock *sk, struct sk_buff *skb, int err); int xfrm_output(struct sock *sk, struct sk_buff *skb); #if IS_ENABLED(CONFIG_NET_PKTGEN) int pktgen_xfrm_outer_mode_output(struct xfrm_state *x, struct sk_buff *skb); #endif void xfrm_local_error(struct sk_buff *skb, int mtu); int xfrm4_rcv_encap(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type); int xfrm4_transport_finish(struct sk_buff *skb, int async); int xfrm4_rcv(struct sk_buff *skb); static inline int xfrm4_rcv_spi(struct sk_buff *skb, int nexthdr, __be32 spi) { XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip4 = NULL; XFRM_SPI_SKB_CB(skb)->family = AF_INET; XFRM_SPI_SKB_CB(skb)->daddroff = offsetof(struct iphdr, daddr); return xfrm_input(skb, nexthdr, spi, 0); } int xfrm4_output(struct net *net, struct sock *sk, struct sk_buff *skb); int xfrm4_protocol_register(struct xfrm4_protocol *handler, unsigned char protocol); int xfrm4_protocol_deregister(struct xfrm4_protocol *handler, unsigned char protocol); int xfrm4_tunnel_register(struct xfrm_tunnel *handler, unsigned short family); int xfrm4_tunnel_deregister(struct xfrm_tunnel *handler, unsigned short family); void xfrm4_local_error(struct sk_buff *skb, u32 mtu); int xfrm6_rcv_spi(struct sk_buff *skb, int nexthdr, __be32 spi, struct ip6_tnl *t); int xfrm6_rcv_encap(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type); int xfrm6_transport_finish(struct sk_buff *skb, int async); int xfrm6_rcv_tnl(struct sk_buff *skb, struct ip6_tnl *t); int xfrm6_rcv(struct sk_buff *skb); int xfrm6_input_addr(struct sk_buff *skb, xfrm_address_t *daddr, xfrm_address_t *saddr, u8 proto); void xfrm6_local_error(struct sk_buff *skb, u32 mtu); int xfrm6_protocol_register(struct xfrm6_protocol *handler, unsigned char protocol); int xfrm6_protocol_deregister(struct xfrm6_protocol *handler, unsigned char protocol); int xfrm6_tunnel_register(struct xfrm6_tunnel *handler, unsigned short family); int xfrm6_tunnel_deregister(struct xfrm6_tunnel *handler, unsigned short family); __be32 xfrm6_tunnel_alloc_spi(struct net *net, xfrm_address_t *saddr); __be32 xfrm6_tunnel_spi_lookup(struct net *net, const xfrm_address_t *saddr); int xfrm6_output(struct net *net, struct sock *sk, struct sk_buff *skb); #ifdef CONFIG_XFRM void xfrm6_local_rxpmtu(struct sk_buff *skb, u32 mtu); int xfrm4_udp_encap_rcv(struct sock *sk, struct sk_buff *skb); int xfrm6_udp_encap_rcv(struct sock *sk, struct sk_buff *skb); struct sk_buff *xfrm4_gro_udp_encap_rcv(struct sock *sk, struct list_head *head, struct sk_buff *skb); struct sk_buff *xfrm6_gro_udp_encap_rcv(struct sock *sk, struct list_head *head, struct sk_buff *skb); int xfrm_user_policy(struct sock *sk, int optname, sockptr_t optval, int optlen); #else static inline int xfrm_user_policy(struct sock *sk, int optname, sockptr_t optval, int optlen) { return -ENOPROTOOPT; } #endif struct dst_entry *__xfrm_dst_lookup(struct net *net, int tos, int oif, const xfrm_address_t *saddr, const xfrm_address_t *daddr, int family, u32 mark); struct xfrm_policy *xfrm_policy_alloc(struct net *net, gfp_t gfp); void xfrm_policy_walk_init(struct xfrm_policy_walk *walk, u8 type); int xfrm_policy_walk(struct net *net, struct xfrm_policy_walk *walk, int (*func)(struct xfrm_policy *, int, int, void*), void *); void xfrm_policy_walk_done(struct xfrm_policy_walk *walk, struct net *net); int xfrm_policy_insert(int dir, struct xfrm_policy *policy, int excl); struct xfrm_policy *xfrm_policy_bysel_ctx(struct net *net, const struct xfrm_mark *mark, u32 if_id, u8 type, int dir, struct xfrm_selector *sel, struct xfrm_sec_ctx *ctx, int delete, int *err); struct xfrm_policy *xfrm_policy_byid(struct net *net, const struct xfrm_mark *mark, u32 if_id, u8 type, int dir, u32 id, int delete, int *err); int xfrm_policy_flush(struct net *net, u8 type, bool task_valid); void xfrm_policy_hash_rebuild(struct net *net); u32 xfrm_get_acqseq(void); int verify_spi_info(u8 proto, u32 min, u32 max, struct netlink_ext_ack *extack); int xfrm_alloc_spi(struct xfrm_state *x, u32 minspi, u32 maxspi, struct netlink_ext_ack *extack); struct xfrm_state *xfrm_find_acq(struct net *net, const struct xfrm_mark *mark, u8 mode, u32 reqid, u32 if_id, u8 proto, const xfrm_address_t *daddr, const xfrm_address_t *saddr, int create, unsigned short family); int xfrm_sk_policy_insert(struct sock *sk, int dir, struct xfrm_policy *pol); #ifdef CONFIG_XFRM_MIGRATE int km_migrate(const struct xfrm_selector *sel, u8 dir, u8 type, const struct xfrm_migrate *m, int num_bundles, const struct xfrm_kmaddress *k, const struct xfrm_encap_tmpl *encap); struct xfrm_state *xfrm_migrate_state_find(struct xfrm_migrate *m, struct net *net, u32 if_id); struct xfrm_state *xfrm_state_migrate(struct xfrm_state *x, struct xfrm_migrate *m, struct xfrm_encap_tmpl *encap); int xfrm_migrate(const struct xfrm_selector *sel, u8 dir, u8 type, struct xfrm_migrate *m, int num_bundles, struct xfrm_kmaddress *k, struct net *net, struct xfrm_encap_tmpl *encap, u32 if_id, struct netlink_ext_ack *extack); #endif int km_new_mapping(struct xfrm_state *x, xfrm_address_t *ipaddr, __be16 sport); void km_policy_expired(struct xfrm_policy *pol, int dir, int hard, u32 portid); int km_report(struct net *net, u8 proto, struct xfrm_selector *sel, xfrm_address_t *addr); void xfrm_input_init(void); int xfrm_parse_spi(struct sk_buff *skb, u8 nexthdr, __be32 *spi, __be32 *seq); void xfrm_probe_algs(void); int xfrm_count_pfkey_auth_supported(void); int xfrm_count_pfkey_enc_supported(void); struct xfrm_algo_desc *xfrm_aalg_get_byidx(unsigned int idx); struct xfrm_algo_desc *xfrm_ealg_get_byidx(unsigned int idx); struct xfrm_algo_desc *xfrm_aalg_get_byid(int alg_id); struct xfrm_algo_desc *xfrm_ealg_get_byid(int alg_id); struct xfrm_algo_desc *xfrm_calg_get_byid(int alg_id); struct xfrm_algo_desc *xfrm_aalg_get_byname(const char *name, int probe); struct xfrm_algo_desc *xfrm_ealg_get_byname(const char *name, int probe); struct xfrm_algo_desc *xfrm_calg_get_byname(const char *name, int probe); struct xfrm_algo_desc *xfrm_aead_get_byname(const char *name, int icv_len, int probe); static inline bool xfrm6_addr_equal(const xfrm_address_t *a, const xfrm_address_t *b) { return ipv6_addr_equal((const struct in6_addr *)a, (const struct in6_addr *)b); } static inline bool xfrm_addr_equal(const xfrm_address_t *a, const xfrm_address_t *b, sa_family_t family) { switch (family) { default: case AF_INET: return ((__force u32)a->a4 ^ (__force u32)b->a4) == 0; case AF_INET6: return xfrm6_addr_equal(a, b); } } static inline int xfrm_policy_id2dir(u32 index) { return index & 7; } #ifdef CONFIG_XFRM void xfrm_replay_advance(struct xfrm_state *x, __be32 net_seq); int xfrm_replay_check(struct xfrm_state *x, struct sk_buff *skb, __be32 net_seq); void xfrm_replay_notify(struct xfrm_state *x, int event); int xfrm_replay_overflow(struct xfrm_state *x, struct sk_buff *skb); int xfrm_replay_recheck(struct xfrm_state *x, struct sk_buff *skb, __be32 net_seq); static inline int xfrm_aevent_is_on(struct net *net) { struct sock *nlsk; int ret = 0; rcu_read_lock(); nlsk = rcu_dereference(net->xfrm.nlsk); if (nlsk) ret = netlink_has_listeners(nlsk, XFRMNLGRP_AEVENTS); rcu_read_unlock(); return ret; } static inline int xfrm_acquire_is_on(struct net *net) { struct sock *nlsk; int ret = 0; rcu_read_lock(); nlsk = rcu_dereference(net->xfrm.nlsk); if (nlsk) ret = netlink_has_listeners(nlsk, XFRMNLGRP_ACQUIRE); rcu_read_unlock(); return ret; } #endif static inline unsigned int aead_len(struct xfrm_algo_aead *alg) { return sizeof(*alg) + ((alg->alg_key_len + 7) / 8); } static inline unsigned int xfrm_alg_len(const struct xfrm_algo *alg) { return sizeof(*alg) + ((alg->alg_key_len + 7) / 8); } static inline unsigned int xfrm_alg_auth_len(const struct xfrm_algo_auth *alg) { return sizeof(*alg) + ((alg->alg_key_len + 7) / 8); } static inline unsigned int xfrm_replay_state_esn_len(struct xfrm_replay_state_esn *replay_esn) { return sizeof(*replay_esn) + replay_esn->bmp_len * sizeof(__u32); } #ifdef CONFIG_XFRM_MIGRATE static inline int xfrm_replay_clone(struct xfrm_state *x, struct xfrm_state *orig) { x->replay_esn = kmemdup(orig->replay_esn, xfrm_replay_state_esn_len(orig->replay_esn), GFP_KERNEL); if (!x->replay_esn) return -ENOMEM; x->preplay_esn = kmemdup(orig->preplay_esn, xfrm_replay_state_esn_len(orig->preplay_esn), GFP_KERNEL); if (!x->preplay_esn) return -ENOMEM; return 0; } static inline struct xfrm_algo_aead *xfrm_algo_aead_clone(struct xfrm_algo_aead *orig) { return kmemdup(orig, aead_len(orig), GFP_KERNEL); } static inline struct xfrm_algo *xfrm_algo_clone(struct xfrm_algo *orig) { return kmemdup(orig, xfrm_alg_len(orig), GFP_KERNEL); } static inline struct xfrm_algo_auth *xfrm_algo_auth_clone(struct xfrm_algo_auth *orig) { return kmemdup(orig, xfrm_alg_auth_len(orig), GFP_KERNEL); } static inline void xfrm_states_put(struct xfrm_state **states, int n) { int i; for (i = 0; i < n; i++) xfrm_state_put(*(states + i)); } static inline void xfrm_states_delete(struct xfrm_state **states, int n) { int i; for (i = 0; i < n; i++) xfrm_state_delete(*(states + i)); } #endif void __init xfrm_dev_init(void); #ifdef CONFIG_XFRM_OFFLOAD void xfrm_dev_resume(struct sk_buff *skb); void xfrm_dev_backlog(struct softnet_data *sd); struct sk_buff *validate_xmit_xfrm(struct sk_buff *skb, netdev_features_t features, bool *again); int xfrm_dev_state_add(struct net *net, struct xfrm_state *x, struct xfrm_user_offload *xuo, struct netlink_ext_ack *extack); int xfrm_dev_policy_add(struct net *net, struct xfrm_policy *xp, struct xfrm_user_offload *xuo, u8 dir, struct netlink_ext_ack *extack); bool xfrm_dev_offload_ok(struct sk_buff *skb, struct xfrm_state *x); static inline void xfrm_dev_state_advance_esn(struct xfrm_state *x) { struct xfrm_dev_offload *xso = &x->xso; if (xso->dev && xso->dev->xfrmdev_ops->xdo_dev_state_advance_esn) xso->dev->xfrmdev_ops->xdo_dev_state_advance_esn(x); } static inline bool xfrm_dst_offload_ok(struct dst_entry *dst) { struct xfrm_state *x = dst->xfrm; struct xfrm_dst *xdst; if (!x || !x->type_offload) return false; xdst = (struct xfrm_dst *) dst; if (!x->xso.offload_handle && !xdst->child->xfrm) return true; if (x->xso.offload_handle && (x->xso.dev == xfrm_dst_path(dst)->dev) && !xdst->child->xfrm) return true; return false; } static inline void xfrm_dev_state_delete(struct xfrm_state *x) { struct xfrm_dev_offload *xso = &x->xso; if (xso->dev) xso->dev->xfrmdev_ops->xdo_dev_state_delete(x); } static inline void xfrm_dev_state_free(struct xfrm_state *x) { struct xfrm_dev_offload *xso = &x->xso; struct net_device *dev = xso->dev; if (dev && dev->xfrmdev_ops) { if (dev->xfrmdev_ops->xdo_dev_state_free) dev->xfrmdev_ops->xdo_dev_state_free(x); xso->dev = NULL; xso->type = XFRM_DEV_OFFLOAD_UNSPECIFIED; netdev_put(dev, &xso->dev_tracker); } } static inline void xfrm_dev_policy_delete(struct xfrm_policy *x) { struct xfrm_dev_offload *xdo = &x->xdo; struct net_device *dev = xdo->dev; if (dev && dev->xfrmdev_ops && dev->xfrmdev_ops->xdo_dev_policy_delete) dev->xfrmdev_ops->xdo_dev_policy_delete(x); } static inline void xfrm_dev_policy_free(struct xfrm_policy *x) { struct xfrm_dev_offload *xdo = &x->xdo; struct net_device *dev = xdo->dev; if (dev && dev->xfrmdev_ops) { if (dev->xfrmdev_ops->xdo_dev_policy_free) dev->xfrmdev_ops->xdo_dev_policy_free(x); xdo->dev = NULL; netdev_put(dev, &xdo->dev_tracker); } } #else static inline void xfrm_dev_resume(struct sk_buff *skb) { } static inline void xfrm_dev_backlog(struct softnet_data *sd) { } static inline struct sk_buff *validate_xmit_xfrm(struct sk_buff *skb, netdev_features_t features, bool *again) { return skb; } static inline int xfrm_dev_state_add(struct net *net, struct xfrm_state *x, struct xfrm_user_offload *xuo, struct netlink_ext_ack *extack) { return 0; } static inline void xfrm_dev_state_delete(struct xfrm_state *x) { } static inline void xfrm_dev_state_free(struct xfrm_state *x) { } static inline int xfrm_dev_policy_add(struct net *net, struct xfrm_policy *xp, struct xfrm_user_offload *xuo, u8 dir, struct netlink_ext_ack *extack) { return 0; } static inline void xfrm_dev_policy_delete(struct xfrm_policy *x) { } static inline void xfrm_dev_policy_free(struct xfrm_policy *x) { } static inline bool xfrm_dev_offload_ok(struct sk_buff *skb, struct xfrm_state *x) { return false; } static inline void xfrm_dev_state_advance_esn(struct xfrm_state *x) { } static inline bool xfrm_dst_offload_ok(struct dst_entry *dst) { return false; } #endif static inline int xfrm_mark_get(struct nlattr **attrs, struct xfrm_mark *m) { if (attrs[XFRMA_MARK]) memcpy(m, nla_data(attrs[XFRMA_MARK]), sizeof(struct xfrm_mark)); else m->v = m->m = 0; return m->v & m->m; } static inline int xfrm_mark_put(struct sk_buff *skb, const struct xfrm_mark *m) { int ret = 0; if (m->m | m->v) ret = nla_put(skb, XFRMA_MARK, sizeof(struct xfrm_mark), m); return ret; } static inline __u32 xfrm_smark_get(__u32 mark, struct xfrm_state *x) { struct xfrm_mark *m = &x->props.smark; return (m->v & m->m) | (mark & ~m->m); } static inline int xfrm_if_id_put(struct sk_buff *skb, __u32 if_id) { int ret = 0; if (if_id) ret = nla_put_u32(skb, XFRMA_IF_ID, if_id); return ret; } static inline int xfrm_tunnel_check(struct sk_buff *skb, struct xfrm_state *x, unsigned int family) { bool tunnel = false; switch(family) { case AF_INET: if (XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip4) tunnel = true; break; case AF_INET6: if (XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip6) tunnel = true; break; } if (tunnel && !(x->outer_mode.flags & XFRM_MODE_FLAG_TUNNEL)) return -EINVAL; return 0; } extern const int xfrm_msg_min[XFRM_NR_MSGTYPES]; extern const struct nla_policy xfrma_policy[XFRMA_MAX+1]; struct xfrm_translator { /* Allocate frag_list and put compat translation there */ int (*alloc_compat)(struct sk_buff *skb, const struct nlmsghdr *src); /* Allocate nlmsg with 64-bit translaton of received 32-bit message */ struct nlmsghdr *(*rcv_msg_compat)(const struct nlmsghdr *nlh, int maxtype, const struct nla_policy *policy, struct netlink_ext_ack *extack); /* Translate 32-bit user_policy from sockptr */ int (*xlate_user_policy_sockptr)(u8 **pdata32, int optlen); struct module *owner; }; #if IS_ENABLED(CONFIG_XFRM_USER_COMPAT) extern int xfrm_register_translator(struct xfrm_translator *xtr); extern int xfrm_unregister_translator(struct xfrm_translator *xtr); extern struct xfrm_translator *xfrm_get_translator(void); extern void xfrm_put_translator(struct xfrm_translator *xtr); #else static inline struct xfrm_translator *xfrm_get_translator(void) { return NULL; } static inline void xfrm_put_translator(struct xfrm_translator *xtr) { } #endif #if IS_ENABLED(CONFIG_IPV6) static inline bool xfrm6_local_dontfrag(const struct sock *sk) { int proto; if (!sk || sk->sk_family != AF_INET6) return false; proto = sk->sk_protocol; if (proto == IPPROTO_UDP || proto == IPPROTO_RAW) return inet6_test_bit(DONTFRAG, sk); return false; } #endif #if (IS_BUILTIN(CONFIG_XFRM_INTERFACE) && IS_ENABLED(CONFIG_DEBUG_INFO_BTF)) || \ (IS_MODULE(CONFIG_XFRM_INTERFACE) && IS_ENABLED(CONFIG_DEBUG_INFO_BTF_MODULES)) extern struct metadata_dst __percpu *xfrm_bpf_md_dst; int register_xfrm_interface_bpf(void); #else static inline int register_xfrm_interface_bpf(void) { return 0; } #endif #if IS_ENABLED(CONFIG_DEBUG_INFO_BTF) int register_xfrm_state_bpf(void); #else static inline int register_xfrm_state_bpf(void) { return 0; } #endif #endif /* _NET_XFRM_H */ |
8 8 8 8 1 1 1 5 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 | /* SPDX-License-Identifier: GPL-2.0-only */ /* * Copyright (c) 2008, Intel Corporation. * * Author: Alexander Duyck <alexander.h.duyck@intel.com> */ #ifndef __NET_TC_SKBEDIT_H #define __NET_TC_SKBEDIT_H #include <net/act_api.h> #include <linux/tc_act/tc_skbedit.h> struct tcf_skbedit_params { u32 flags; u32 priority; u32 mark; u32 mask; u16 queue_mapping; u16 mapping_mod; u16 ptype; struct rcu_head rcu; }; struct tcf_skbedit { struct tc_action common; struct tcf_skbedit_params __rcu *params; }; #define to_skbedit(a) ((struct tcf_skbedit *)a) /* Return true iff action is the one identified by FLAG. */ static inline bool is_tcf_skbedit_with_flag(const struct tc_action *a, u32 flag) { #ifdef CONFIG_NET_CLS_ACT u32 flags; if (a->ops && a->ops->id == TCA_ID_SKBEDIT) { rcu_read_lock(); flags = rcu_dereference(to_skbedit(a)->params)->flags; rcu_read_unlock(); return flags == flag; } #endif return false; } /* Return true iff action is mark */ static inline bool is_tcf_skbedit_mark(const struct tc_action *a) { return is_tcf_skbedit_with_flag(a, SKBEDIT_F_MARK); } static inline u32 tcf_skbedit_mark(const struct tc_action *a) { u32 mark; rcu_read_lock(); mark = rcu_dereference(to_skbedit(a)->params)->mark; rcu_read_unlock(); return mark; } /* Return true iff action is ptype */ static inline bool is_tcf_skbedit_ptype(const struct tc_action *a) { return is_tcf_skbedit_with_flag(a, SKBEDIT_F_PTYPE); } static inline u32 tcf_skbedit_ptype(const struct tc_action *a) { u16 ptype; rcu_read_lock(); ptype = rcu_dereference(to_skbedit(a)->params)->ptype; rcu_read_unlock(); return ptype; } /* Return true iff action is priority */ static inline bool is_tcf_skbedit_priority(const struct tc_action *a) { return is_tcf_skbedit_with_flag(a, SKBEDIT_F_PRIORITY); } static inline u32 tcf_skbedit_priority(const struct tc_action *a) { u32 priority; rcu_read_lock(); priority = rcu_dereference(to_skbedit(a)->params)->priority; rcu_read_unlock(); return priority; } static inline u16 tcf_skbedit_rx_queue_mapping(const struct tc_action *a) { u16 rx_queue; rcu_read_lock(); rx_queue = rcu_dereference(to_skbedit(a)->params)->queue_mapping; rcu_read_unlock(); return rx_queue; } /* Return true iff action is queue_mapping */ static inline bool is_tcf_skbedit_queue_mapping(const struct tc_action *a) { return is_tcf_skbedit_with_flag(a, SKBEDIT_F_QUEUE_MAPPING); } /* Return true if action is on ingress traffic */ static inline bool is_tcf_skbedit_ingress(u32 flags) { return flags & TCA_ACT_FLAGS_AT_INGRESS; } static inline bool is_tcf_skbedit_tx_queue_mapping(const struct tc_action *a) { return is_tcf_skbedit_queue_mapping(a) && !is_tcf_skbedit_ingress(a->tcfa_flags); } static inline bool is_tcf_skbedit_rx_queue_mapping(const struct tc_action *a) { return is_tcf_skbedit_queue_mapping(a) && is_tcf_skbedit_ingress(a->tcfa_flags); } /* Return true iff action is inheritdsfield */ static inline bool is_tcf_skbedit_inheritdsfield(const struct tc_action *a) { return is_tcf_skbedit_with_flag(a, SKBEDIT_F_INHERITDSFIELD); } #endif /* __NET_TC_SKBEDIT_H */ |
6 2 1 1 1 1 1 4 1 1 1 1 1 5 3 1 1 6 2 1 1 1 1 5 1 1 1 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 | // SPDX-License-Identifier: GPL-2.0 #include <linux/kernel.h> #include <linux/errno.h> #include <linux/fs.h> #include <linux/file.h> #include <linux/mm.h> #include <linux/slab.h> #include <linux/namei.h> #include <linux/io_uring.h> #include <uapi/linux/io_uring.h> #include "../fs/internal.h" #include "io_uring.h" #include "fs.h" struct io_rename { struct file *file; int old_dfd; int new_dfd; struct filename *oldpath; struct filename *newpath; int flags; }; struct io_unlink { struct file *file; int dfd; int flags; struct filename *filename; }; struct io_mkdir { struct file *file; int dfd; umode_t mode; struct filename *filename; }; struct io_link { struct file *file; int old_dfd; int new_dfd; struct filename *oldpath; struct filename *newpath; int flags; }; int io_renameat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_rename *ren = io_kiocb_to_cmd(req, struct io_rename); const char __user *oldf, *newf; if (sqe->buf_index || sqe->splice_fd_in) return -EINVAL; if (unlikely(req->flags & REQ_F_FIXED_FILE)) return -EBADF; ren->old_dfd = READ_ONCE(sqe->fd); oldf = u64_to_user_ptr(READ_ONCE(sqe->addr)); newf = u64_to_user_ptr(READ_ONCE(sqe->addr2)); ren->new_dfd = READ_ONCE(sqe->len); ren->flags = READ_ONCE(sqe->rename_flags); ren->oldpath = getname(oldf); if (IS_ERR(ren->oldpath)) return PTR_ERR(ren->oldpath); ren->newpath = getname(newf); if (IS_ERR(ren->newpath)) { putname(ren->oldpath); return PTR_ERR(ren->newpath); } req->flags |= REQ_F_NEED_CLEANUP; req->flags |= REQ_F_FORCE_ASYNC; return 0; } int io_renameat(struct io_kiocb *req, unsigned int issue_flags) { struct io_rename *ren = io_kiocb_to_cmd(req, struct io_rename); int ret; WARN_ON_ONCE(issue_flags & IO_URING_F_NONBLOCK); ret = do_renameat2(ren->old_dfd, ren->oldpath, ren->new_dfd, ren->newpath, ren->flags); req->flags &= ~REQ_F_NEED_CLEANUP; io_req_set_res(req, ret, 0); return IOU_OK; } void io_renameat_cleanup(struct io_kiocb *req) { struct io_rename *ren = io_kiocb_to_cmd(req, struct io_rename); putname(ren->oldpath); putname(ren->newpath); } int io_unlinkat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_unlink *un = io_kiocb_to_cmd(req, struct io_unlink); const char __user *fname; if (sqe->off || sqe->len || sqe->buf_index || sqe->splice_fd_in) return -EINVAL; if (unlikely(req->flags & REQ_F_FIXED_FILE)) return -EBADF; un->dfd = READ_ONCE(sqe->fd); un->flags = READ_ONCE(sqe->unlink_flags); if (un->flags & ~AT_REMOVEDIR) return -EINVAL; fname = u64_to_user_ptr(READ_ONCE(sqe->addr)); un->filename = getname(fname); if (IS_ERR(un->filename)) return PTR_ERR(un->filename); req->flags |= REQ_F_NEED_CLEANUP; req->flags |= REQ_F_FORCE_ASYNC; return 0; } int io_unlinkat(struct io_kiocb *req, unsigned int issue_flags) { struct io_unlink *un = io_kiocb_to_cmd(req, struct io_unlink); int ret; WARN_ON_ONCE(issue_flags & IO_URING_F_NONBLOCK); if (un->flags & AT_REMOVEDIR) ret = do_rmdir(un->dfd, un->filename); else ret = do_unlinkat(un->dfd, un->filename); req->flags &= ~REQ_F_NEED_CLEANUP; io_req_set_res(req, ret, 0); return IOU_OK; } void io_unlinkat_cleanup(struct io_kiocb *req) { struct io_unlink *ul = io_kiocb_to_cmd(req, struct io_unlink); putname(ul->filename); } int io_mkdirat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_mkdir *mkd = io_kiocb_to_cmd(req, struct io_mkdir); const char __user *fname; if (sqe->off || sqe->rw_flags || sqe->buf_index || sqe->splice_fd_in) return -EINVAL; if (unlikely(req->flags & REQ_F_FIXED_FILE)) return -EBADF; mkd->dfd = READ_ONCE(sqe->fd); mkd->mode = READ_ONCE(sqe->len); fname = u64_to_user_ptr(READ_ONCE(sqe->addr)); mkd->filename = getname(fname); if (IS_ERR(mkd->filename)) return PTR_ERR(mkd->filename); req->flags |= REQ_F_NEED_CLEANUP; req->flags |= REQ_F_FORCE_ASYNC; return 0; } int io_mkdirat(struct io_kiocb *req, unsigned int issue_flags) { struct io_mkdir *mkd = io_kiocb_to_cmd(req, struct io_mkdir); int ret; WARN_ON_ONCE(issue_flags & IO_URING_F_NONBLOCK); ret = do_mkdirat(mkd->dfd, mkd->filename, mkd->mode); req->flags &= ~REQ_F_NEED_CLEANUP; io_req_set_res(req, ret, 0); return IOU_OK; } void io_mkdirat_cleanup(struct io_kiocb *req) { struct io_mkdir *md = io_kiocb_to_cmd(req, struct io_mkdir); putname(md->filename); } int io_symlinkat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_link *sl = io_kiocb_to_cmd(req, struct io_link); const char __user *oldpath, *newpath; if (sqe->len || sqe->rw_flags || sqe->buf_index || sqe->splice_fd_in) return -EINVAL; if (unlikely(req->flags & REQ_F_FIXED_FILE)) return -EBADF; sl->new_dfd = READ_ONCE(sqe->fd); oldpath = u64_to_user_ptr(READ_ONCE(sqe->addr)); newpath = u64_to_user_ptr(READ_ONCE(sqe->addr2)); sl->oldpath = getname(oldpath); if (IS_ERR(sl->oldpath)) return PTR_ERR(sl->oldpath); sl->newpath = getname(newpath); if (IS_ERR(sl->newpath)) { putname(sl->oldpath); return PTR_ERR(sl->newpath); } req->flags |= REQ_F_NEED_CLEANUP; req->flags |= REQ_F_FORCE_ASYNC; return 0; } int io_symlinkat(struct io_kiocb *req, unsigned int issue_flags) { struct io_link *sl = io_kiocb_to_cmd(req, struct io_link); int ret; WARN_ON_ONCE(issue_flags & IO_URING_F_NONBLOCK); ret = do_symlinkat(sl->oldpath, sl->new_dfd, sl->newpath); req->flags &= ~REQ_F_NEED_CLEANUP; io_req_set_res(req, ret, 0); return IOU_OK; } int io_linkat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_link *lnk = io_kiocb_to_cmd(req, struct io_link); const char __user *oldf, *newf; if (sqe->buf_index || sqe->splice_fd_in) return -EINVAL; if (unlikely(req->flags & REQ_F_FIXED_FILE)) return -EBADF; lnk->old_dfd = READ_ONCE(sqe->fd); lnk->new_dfd = READ_ONCE(sqe->len); oldf = u64_to_user_ptr(READ_ONCE(sqe->addr)); newf = u64_to_user_ptr(READ_ONCE(sqe->addr2)); lnk->flags = READ_ONCE(sqe->hardlink_flags); lnk->oldpath = getname_uflags(oldf, lnk->flags); if (IS_ERR(lnk->oldpath)) return PTR_ERR(lnk->oldpath); lnk->newpath = getname(newf); if (IS_ERR(lnk->newpath)) { putname(lnk->oldpath); return PTR_ERR(lnk->newpath); } req->flags |= REQ_F_NEED_CLEANUP; req->flags |= REQ_F_FORCE_ASYNC; return 0; } int io_linkat(struct io_kiocb *req, unsigned int issue_flags) { struct io_link *lnk = io_kiocb_to_cmd(req, struct io_link); int ret; WARN_ON_ONCE(issue_flags & IO_URING_F_NONBLOCK); ret = do_linkat(lnk->old_dfd, lnk->oldpath, lnk->new_dfd, lnk->newpath, lnk->flags); req->flags &= ~REQ_F_NEED_CLEANUP; io_req_set_res(req, ret, 0); return IOU_OK; } void io_link_cleanup(struct io_kiocb *req) { struct io_link *sl = io_kiocb_to_cmd(req, struct io_link); putname(sl->oldpath); putname(sl->newpath); } |
2 2 2 2 2 2 2 2 2 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 | // SPDX-License-Identifier: GPL-2.0 /* * net/tipc/crypto.c: TIPC crypto for key handling & packet en/decryption * * Copyright (c) 2019, Ericsson AB * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the names of the copyright holders nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * Alternatively, this software may be distributed under the terms of the * GNU General Public License ("GPL") version 2 as published by the Free * Software Foundation. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ #include <crypto/aead.h> #include <crypto/aes.h> #include <crypto/rng.h> #include "crypto.h" #include "msg.h" #include "bcast.h" #define TIPC_TX_GRACE_PERIOD msecs_to_jiffies(5000) /* 5s */ #define TIPC_TX_LASTING_TIME msecs_to_jiffies(10000) /* 10s */ #define TIPC_RX_ACTIVE_LIM msecs_to_jiffies(3000) /* 3s */ #define TIPC_RX_PASSIVE_LIM msecs_to_jiffies(15000) /* 15s */ #define TIPC_MAX_TFMS_DEF 10 #define TIPC_MAX_TFMS_LIM 1000 #define TIPC_REKEYING_INTV_DEF (60 * 24) /* default: 1 day */ /* * TIPC Key ids */ enum { KEY_MASTER = 0, KEY_MIN = KEY_MASTER, KEY_1 = 1, KEY_2, KEY_3, KEY_MAX = KEY_3, }; /* * TIPC Crypto statistics */ enum { STAT_OK, STAT_NOK, STAT_ASYNC, STAT_ASYNC_OK, STAT_ASYNC_NOK, STAT_BADKEYS, /* tx only */ STAT_BADMSGS = STAT_BADKEYS, /* rx only */ STAT_NOKEYS, STAT_SWITCHES, MAX_STATS, }; /* TIPC crypto statistics' header */ static const char *hstats[MAX_STATS] = {"ok", "nok", "async", "async_ok", "async_nok", "badmsgs", "nokeys", "switches"}; /* Max TFMs number per key */ int sysctl_tipc_max_tfms __read_mostly = TIPC_MAX_TFMS_DEF; /* Key exchange switch, default: on */ int sysctl_tipc_key_exchange_enabled __read_mostly = 1; /* * struct tipc_key - TIPC keys' status indicator * * 7 6 5 4 3 2 1 0 * +-----+-----+-----+-----+-----+-----+-----+-----+ * key: | (reserved)|passive idx| active idx|pending idx| * +-----+-----+-----+-----+-----+-----+-----+-----+ */ struct tipc_key { #define KEY_BITS (2) #define KEY_MASK ((1 << KEY_BITS) - 1) union { struct { #if defined(__LITTLE_ENDIAN_BITFIELD) u8 pending:2, active:2, passive:2, /* rx only */ reserved:2; #elif defined(__BIG_ENDIAN_BITFIELD) u8 reserved:2, passive:2, /* rx only */ active:2, pending:2; #else #error "Please fix <asm/byteorder.h>" #endif } __packed; u8 keys; }; }; /** * struct tipc_tfm - TIPC TFM structure to form a list of TFMs * @tfm: cipher handle/key * @list: linked list of TFMs */ struct tipc_tfm { struct crypto_aead *tfm; struct list_head list; }; /** * struct tipc_aead - TIPC AEAD key structure * @tfm_entry: per-cpu pointer to one entry in TFM list * @crypto: TIPC crypto owns this key * @cloned: reference to the source key in case cloning * @users: the number of the key users (TX/RX) * @salt: the key's SALT value * @authsize: authentication tag size (max = 16) * @mode: crypto mode is applied to the key * @hint: a hint for user key * @rcu: struct rcu_head * @key: the aead key * @gen: the key's generation * @seqno: the key seqno (cluster scope) * @refcnt: the key reference counter */ struct tipc_aead { #define TIPC_AEAD_HINT_LEN (5) struct tipc_tfm * __percpu *tfm_entry; struct tipc_crypto *crypto; struct tipc_aead *cloned; atomic_t users; u32 salt; u8 authsize; u8 mode; char hint[2 * TIPC_AEAD_HINT_LEN + 1]; struct rcu_head rcu; struct tipc_aead_key *key; u16 gen; atomic64_t seqno ____cacheline_aligned; refcount_t refcnt ____cacheline_aligned; } ____cacheline_aligned; /** * struct tipc_crypto_stats - TIPC Crypto statistics * @stat: array of crypto statistics */ struct tipc_crypto_stats { unsigned int stat[MAX_STATS]; }; /** * struct tipc_crypto - TIPC TX/RX crypto structure * @net: struct net * @node: TIPC node (RX) * @aead: array of pointers to AEAD keys for encryption/decryption * @peer_rx_active: replicated peer RX active key index * @key_gen: TX/RX key generation * @key: the key states * @skey_mode: session key's mode * @skey: received session key * @wq: common workqueue on TX crypto * @work: delayed work sched for TX/RX * @key_distr: key distributing state * @rekeying_intv: rekeying interval (in minutes) * @stats: the crypto statistics * @name: the crypto name * @sndnxt: the per-peer sndnxt (TX) * @timer1: general timer 1 (jiffies) * @timer2: general timer 2 (jiffies) * @working: the crypto is working or not * @key_master: flag indicates if master key exists * @legacy_user: flag indicates if a peer joins w/o master key (for bwd comp.) * @nokey: no key indication * @flags: combined flags field * @lock: tipc_key lock */ struct tipc_crypto { struct net *net; struct tipc_node *node; struct tipc_aead __rcu *aead[KEY_MAX + 1]; atomic_t peer_rx_active; u16 key_gen; struct tipc_key key; u8 skey_mode; struct tipc_aead_key *skey; struct workqueue_struct *wq; struct delayed_work work; #define KEY_DISTR_SCHED 1 #define KEY_DISTR_COMPL 2 atomic_t key_distr; u32 rekeying_intv; struct tipc_crypto_stats __percpu *stats; char name[48]; atomic64_t sndnxt ____cacheline_aligned; unsigned long timer1; unsigned long timer2; union { struct { u8 working:1; u8 key_master:1; u8 legacy_user:1; u8 nokey: 1; }; u8 flags; }; spinlock_t lock; /* crypto lock */ } ____cacheline_aligned; /* struct tipc_crypto_tx_ctx - TX context for callbacks */ struct tipc_crypto_tx_ctx { struct tipc_aead *aead; struct tipc_bearer *bearer; struct tipc_media_addr dst; }; /* struct tipc_crypto_rx_ctx - RX context for callbacks */ struct tipc_crypto_rx_ctx { struct tipc_aead *aead; struct tipc_bearer *bearer; }; static struct tipc_aead *tipc_aead_get(struct tipc_aead __rcu *aead); static inline void tipc_aead_put(struct tipc_aead *aead); static void tipc_aead_free(struct rcu_head *rp); static int tipc_aead_users(struct tipc_aead __rcu *aead); static void tipc_aead_users_inc(struct tipc_aead __rcu *aead, int lim); static void tipc_aead_users_dec(struct tipc_aead __rcu *aead, int lim); static void tipc_aead_users_set(struct tipc_aead __rcu *aead, int val); static struct crypto_aead *tipc_aead_tfm_next(struct tipc_aead *aead); static int tipc_aead_init(struct tipc_aead **aead, struct tipc_aead_key *ukey, u8 mode); static int tipc_aead_clone(struct tipc_aead **dst, struct tipc_aead *src); static void *tipc_aead_mem_alloc(struct crypto_aead *tfm, unsigned int crypto_ctx_size, u8 **iv, struct aead_request **req, struct scatterlist **sg, int nsg); static int tipc_aead_encrypt(struct tipc_aead *aead, struct sk_buff *skb, struct tipc_bearer *b, struct tipc_media_addr *dst, struct tipc_node *__dnode); static void tipc_aead_encrypt_done(void *data, int err); static int tipc_aead_decrypt(struct net *net, struct tipc_aead *aead, struct sk_buff *skb, struct tipc_bearer *b); static void tipc_aead_decrypt_done(void *data, int err); static inline int tipc_ehdr_size(struct tipc_ehdr *ehdr); static int tipc_ehdr_build(struct net *net, struct tipc_aead *aead, u8 tx_key, struct sk_buff *skb, struct tipc_crypto *__rx); static inline void tipc_crypto_key_set_state(struct tipc_crypto *c, u8 new_passive, u8 new_active, u8 new_pending); static int tipc_crypto_key_attach(struct tipc_crypto *c, struct tipc_aead *aead, u8 pos, bool master_key); static bool tipc_crypto_key_try_align(struct tipc_crypto *rx, u8 new_pending); static struct tipc_aead *tipc_crypto_key_pick_tx(struct tipc_crypto *tx, struct tipc_crypto *rx, struct sk_buff *skb, u8 tx_key); static void tipc_crypto_key_synch(struct tipc_crypto *rx, struct sk_buff *skb); static int tipc_crypto_key_revoke(struct net *net, u8 tx_key); static inline void tipc_crypto_clone_msg(struct net *net, struct sk_buff *_skb, struct tipc_bearer *b, struct tipc_media_addr *dst, struct tipc_node *__dnode, u8 type); static void tipc_crypto_rcv_complete(struct net *net, struct tipc_aead *aead, struct tipc_bearer *b, struct sk_buff **skb, int err); static void tipc_crypto_do_cmd(struct net *net, int cmd); static char *tipc_crypto_key_dump(struct tipc_crypto *c, char *buf); static char *tipc_key_change_dump(struct tipc_key old, struct tipc_key new, char *buf); static int tipc_crypto_key_xmit(struct net *net, struct tipc_aead_key *skey, u16 gen, u8 mode, u32 dnode); static bool tipc_crypto_key_rcv(struct tipc_crypto *rx, struct tipc_msg *hdr); static void tipc_crypto_work_tx(struct work_struct *work); static void tipc_crypto_work_rx(struct work_struct *work); static int tipc_aead_key_generate(struct tipc_aead_key *skey); #define is_tx(crypto) (!(crypto)->node) #define is_rx(crypto) (!is_tx(crypto)) #define key_next(cur) ((cur) % KEY_MAX + 1) #define tipc_aead_rcu_ptr(rcu_ptr, lock) \ rcu_dereference_protected((rcu_ptr), lockdep_is_held(lock)) #define tipc_aead_rcu_replace(rcu_ptr, ptr, lock) \ do { \ struct tipc_aead *__tmp = rcu_dereference_protected((rcu_ptr), \ lockdep_is_held(lock)); \ rcu_assign_pointer((rcu_ptr), (ptr)); \ tipc_aead_put(__tmp); \ } while (0) #define tipc_crypto_key_detach(rcu_ptr, lock) \ tipc_aead_rcu_replace((rcu_ptr), NULL, lock) /** * tipc_aead_key_validate - Validate a AEAD user key * @ukey: pointer to user key data * @info: netlink info pointer */ int tipc_aead_key_validate(struct tipc_aead_key *ukey, struct genl_info *info) { int keylen; /* Check if algorithm exists */ if (unlikely(!crypto_has_alg(ukey->alg_name, 0, 0))) { GENL_SET_ERR_MSG(info, "unable to load the algorithm (module existed?)"); return -ENODEV; } /* Currently, we only support the "gcm(aes)" cipher algorithm */ if (strcmp(ukey->alg_name, "gcm(aes)")) { GENL_SET_ERR_MSG(info, "not supported yet the algorithm"); return -ENOTSUPP; } /* Check if key size is correct */ keylen = ukey->keylen - TIPC_AES_GCM_SALT_SIZE; if (unlikely(keylen != TIPC_AES_GCM_KEY_SIZE_128 && keylen != TIPC_AES_GCM_KEY_SIZE_192 && keylen != TIPC_AES_GCM_KEY_SIZE_256)) { GENL_SET_ERR_MSG(info, "incorrect key length (20, 28 or 36 octets?)"); return -EKEYREJECTED; } return 0; } /** * tipc_aead_key_generate - Generate new session key * @skey: input/output key with new content * * Return: 0 in case of success, otherwise < 0 */ static int tipc_aead_key_generate(struct tipc_aead_key *skey) { int rc = 0; /* Fill the key's content with a random value via RNG cipher */ rc = crypto_get_default_rng(); if (likely(!rc)) { rc = crypto_rng_get_bytes(crypto_default_rng, skey->key, skey->keylen); crypto_put_default_rng(); } return rc; } static struct tipc_aead *tipc_aead_get(struct tipc_aead __rcu *aead) { struct tipc_aead *tmp; rcu_read_lock(); tmp = rcu_dereference(aead); if (unlikely(!tmp || !refcount_inc_not_zero(&tmp->refcnt))) tmp = NULL; rcu_read_unlock(); return tmp; } static inline void tipc_aead_put(struct tipc_aead *aead) { if (aead && refcount_dec_and_test(&aead->refcnt)) call_rcu(&aead->rcu, tipc_aead_free); } /** * tipc_aead_free - Release AEAD key incl. all the TFMs in the list * @rp: rcu head pointer */ static void tipc_aead_free(struct rcu_head *rp) { struct tipc_aead *aead = container_of(rp, struct tipc_aead, rcu); struct tipc_tfm *tfm_entry, *head, *tmp; if (aead->cloned) { tipc_aead_put(aead->cloned); } else { head = *get_cpu_ptr(aead->tfm_entry); put_cpu_ptr(aead->tfm_entry); list_for_each_entry_safe(tfm_entry, tmp, &head->list, list) { crypto_free_aead(tfm_entry->tfm); list_del(&tfm_entry->list); kfree(tfm_entry); } /* Free the head */ crypto_free_aead(head->tfm); list_del(&head->list); kfree(head); } free_percpu(aead->tfm_entry); kfree_sensitive(aead->key); kfree(aead); } static int tipc_aead_users(struct tipc_aead __rcu *aead) { struct tipc_aead *tmp; int users = 0; rcu_read_lock(); tmp = rcu_dereference(aead); if (tmp) users = atomic_read(&tmp->users); rcu_read_unlock(); return users; } static void tipc_aead_users_inc(struct tipc_aead __rcu *aead, int lim) { struct tipc_aead *tmp; rcu_read_lock(); tmp = rcu_dereference(aead); if (tmp) atomic_add_unless(&tmp->users, 1, lim); rcu_read_unlock(); } static void tipc_aead_users_dec(struct tipc_aead __rcu *aead, int lim) { struct tipc_aead *tmp; rcu_read_lock(); tmp = rcu_dereference(aead); if (tmp) atomic_add_unless(&rcu_dereference(aead)->users, -1, lim); rcu_read_unlock(); } static void tipc_aead_users_set(struct tipc_aead __rcu *aead, int val) { struct tipc_aead *tmp; int cur; rcu_read_lock(); tmp = rcu_dereference(aead); if (tmp) { do { cur = atomic_read(&tmp->users); if (cur == val) break; } while (atomic_cmpxchg(&tmp->users, cur, val) != cur); } rcu_read_unlock(); } /** * tipc_aead_tfm_next - Move TFM entry to the next one in list and return it * @aead: the AEAD key pointer */ static struct crypto_aead *tipc_aead_tfm_next(struct tipc_aead *aead) { struct tipc_tfm **tfm_entry; struct crypto_aead *tfm; tfm_entry = get_cpu_ptr(aead->tfm_entry); *tfm_entry = list_next_entry(*tfm_entry, list); tfm = (*tfm_entry)->tfm; put_cpu_ptr(tfm_entry); return tfm; } /** * tipc_aead_init - Initiate TIPC AEAD * @aead: returned new TIPC AEAD key handle pointer * @ukey: pointer to user key data * @mode: the key mode * * Allocate a (list of) new cipher transformation (TFM) with the specific user * key data if valid. The number of the allocated TFMs can be set via the sysfs * "net/tipc/max_tfms" first. * Also, all the other AEAD data are also initialized. * * Return: 0 if the initiation is successful, otherwise: < 0 */ static int tipc_aead_init(struct tipc_aead **aead, struct tipc_aead_key *ukey, u8 mode) { struct tipc_tfm *tfm_entry, *head; struct crypto_aead *tfm; struct tipc_aead *tmp; int keylen, err, cpu; int tfm_cnt = 0; if (unlikely(*aead)) return -EEXIST; /* Allocate a new AEAD */ tmp = kzalloc(sizeof(*tmp), GFP_ATOMIC); if (unlikely(!tmp)) return -ENOMEM; /* The key consists of two parts: [AES-KEY][SALT] */ keylen = ukey->keylen - TIPC_AES_GCM_SALT_SIZE; /* Allocate per-cpu TFM entry pointer */ tmp->tfm_entry = alloc_percpu(struct tipc_tfm *); if (!tmp->tfm_entry) { kfree_sensitive(tmp); return -ENOMEM; } /* Make a list of TFMs with the user key data */ do { tfm = crypto_alloc_aead(ukey->alg_name, 0, 0); if (IS_ERR(tfm)) { err = PTR_ERR(tfm); break; } if (unlikely(!tfm_cnt && crypto_aead_ivsize(tfm) != TIPC_AES_GCM_IV_SIZE)) { crypto_free_aead(tfm); err = -ENOTSUPP; break; } err = crypto_aead_setauthsize(tfm, TIPC_AES_GCM_TAG_SIZE); err |= crypto_aead_setkey(tfm, ukey->key, keylen); if (unlikely(err)) { crypto_free_aead(tfm); break; } tfm_entry = kmalloc(sizeof(*tfm_entry), GFP_KERNEL); if (unlikely(!tfm_entry)) { crypto_free_aead(tfm); err = -ENOMEM; break; } INIT_LIST_HEAD(&tfm_entry->list); tfm_entry->tfm = tfm; /* First entry? */ if (!tfm_cnt) { head = tfm_entry; for_each_possible_cpu(cpu) { *per_cpu_ptr(tmp->tfm_entry, cpu) = head; } } else { list_add_tail(&tfm_entry->list, &head->list); } } while (++tfm_cnt < sysctl_tipc_max_tfms); /* Not any TFM is allocated? */ if (!tfm_cnt) { free_percpu(tmp->tfm_entry); kfree_sensitive(tmp); return err; } /* Form a hex string of some last bytes as the key's hint */ bin2hex(tmp->hint, ukey->key + keylen - TIPC_AEAD_HINT_LEN, TIPC_AEAD_HINT_LEN); /* Initialize the other data */ tmp->mode = mode; tmp->cloned = NULL; tmp->authsize = TIPC_AES_GCM_TAG_SIZE; tmp->key = kmemdup(ukey, tipc_aead_key_size(ukey), GFP_KERNEL); if (!tmp->key) { tipc_aead_free(&tmp->rcu); return -ENOMEM; } memcpy(&tmp->salt, ukey->key + keylen, TIPC_AES_GCM_SALT_SIZE); atomic_set(&tmp->users, 0); atomic64_set(&tmp->seqno, 0); refcount_set(&tmp->refcnt, 1); *aead = tmp; return 0; } /** * tipc_aead_clone - Clone a TIPC AEAD key * @dst: dest key for the cloning * @src: source key to clone from * * Make a "copy" of the source AEAD key data to the dest, the TFMs list is * common for the keys. * A reference to the source is hold in the "cloned" pointer for the later * freeing purposes. * * Note: this must be done in cluster-key mode only! * Return: 0 in case of success, otherwise < 0 */ static int tipc_aead_clone(struct tipc_aead **dst, struct tipc_aead *src) { struct tipc_aead *aead; int cpu; if (!src) return -ENOKEY; if (src->mode != CLUSTER_KEY) return -EINVAL; if (unlikely(*dst)) return -EEXIST; aead = kzalloc(sizeof(*aead), GFP_ATOMIC); if (unlikely(!aead)) return -ENOMEM; aead->tfm_entry = alloc_percpu_gfp(struct tipc_tfm *, GFP_ATOMIC); if (unlikely(!aead->tfm_entry)) { kfree_sensitive(aead); return -ENOMEM; } for_each_possible_cpu(cpu) { *per_cpu_ptr(aead->tfm_entry, cpu) = *per_cpu_ptr(src->tfm_entry, cpu); } memcpy(aead->hint, src->hint, sizeof(src->hint)); aead->mode = src->mode; aead->salt = src->salt; aead->authsize = src->authsize; atomic_set(&aead->users, 0); atomic64_set(&aead->seqno, 0); refcount_set(&aead->refcnt, 1); WARN_ON(!refcount_inc_not_zero(&src->refcnt)); aead->cloned = src; *dst = aead; return 0; } /** * tipc_aead_mem_alloc - Allocate memory for AEAD request operations * @tfm: cipher handle to be registered with the request * @crypto_ctx_size: size of crypto context for callback * @iv: returned pointer to IV data * @req: returned pointer to AEAD request data * @sg: returned pointer to SG lists * @nsg: number of SG lists to be allocated * * Allocate memory to store the crypto context data, AEAD request, IV and SG * lists, the memory layout is as follows: * crypto_ctx || iv || aead_req || sg[] * * Return: the pointer to the memory areas in case of success, otherwise NULL */ static void *tipc_aead_mem_alloc(struct crypto_aead *tfm, unsigned int crypto_ctx_size, u8 **iv, struct aead_request **req, struct scatterlist **sg, int nsg) { unsigned int iv_size, req_size; unsigned int len; u8 *mem; iv_size = crypto_aead_ivsize(tfm); req_size = sizeof(**req) + crypto_aead_reqsize(tfm); len = crypto_ctx_size; len += iv_size; len += crypto_aead_alignmask(tfm) & ~(crypto_tfm_ctx_alignment() - 1); len = ALIGN(len, crypto_tfm_ctx_alignment()); len += req_size; len = ALIGN(len, __alignof__(struct scatterlist)); len += nsg * sizeof(**sg); mem = kmalloc(len, GFP_ATOMIC); if (!mem) return NULL; *iv = (u8 *)PTR_ALIGN(mem + crypto_ctx_size, crypto_aead_alignmask(tfm) + 1); *req = (struct aead_request *)PTR_ALIGN(*iv + iv_size, crypto_tfm_ctx_alignment()); *sg = (struct scatterlist *)PTR_ALIGN((u8 *)*req + req_size, __alignof__(struct scatterlist)); return (void *)mem; } /** * tipc_aead_encrypt - Encrypt a message * @aead: TIPC AEAD key for the message encryption * @skb: the input/output skb * @b: TIPC bearer where the message will be delivered after the encryption * @dst: the destination media address * @__dnode: TIPC dest node if "known" * * Return: * * 0 : if the encryption has completed * * -EINPROGRESS/-EBUSY : if a callback will be performed * * < 0 : the encryption has failed */ static int tipc_aead_encrypt(struct tipc_aead *aead, struct sk_buff *skb, struct tipc_bearer *b, struct tipc_media_addr *dst, struct tipc_node *__dnode) { struct crypto_aead *tfm = tipc_aead_tfm_next(aead); struct tipc_crypto_tx_ctx *tx_ctx; struct aead_request *req; struct sk_buff *trailer; struct scatterlist *sg; struct tipc_ehdr *ehdr; int ehsz, len, tailen, nsg, rc; void *ctx; u32 salt; u8 *iv; /* Make sure message len at least 4-byte aligned */ len = ALIGN(skb->len, 4); tailen = len - skb->len + aead->authsize; /* Expand skb tail for authentication tag: * As for simplicity, we'd have made sure skb having enough tailroom * for authentication tag @skb allocation. Even when skb is nonlinear * but there is no frag_list, it should be still fine! * Otherwise, we must cow it to be a writable buffer with the tailroom. */ SKB_LINEAR_ASSERT(skb); if (tailen > skb_tailroom(skb)) { pr_debug("TX(): skb tailroom is not enough: %d, requires: %d\n", skb_tailroom(skb), tailen); } nsg = skb_cow_data(skb, tailen, &trailer); if (unlikely(nsg < 0)) { pr_err("TX: skb_cow_data() returned %d\n", nsg); return nsg; } pskb_put(skb, trailer, tailen); /* Allocate memory for the AEAD operation */ ctx = tipc_aead_mem_alloc(tfm, sizeof(*tx_ctx), &iv, &req, &sg, nsg); if (unlikely(!ctx)) return -ENOMEM; TIPC_SKB_CB(skb)->crypto_ctx = ctx; /* Map skb to the sg lists */ sg_init_table(sg, nsg); rc = skb_to_sgvec(skb, sg, 0, skb->len); if (unlikely(rc < 0)) { pr_err("TX: skb_to_sgvec() returned %d, nsg %d!\n", rc, nsg); goto exit; } /* Prepare IV: [SALT (4 octets)][SEQNO (8 octets)] * In case we're in cluster-key mode, SALT is varied by xor-ing with * the source address (or w0 of id), otherwise with the dest address * if dest is known. */ ehdr = (struct tipc_ehdr *)skb->data; salt = aead->salt; if (aead->mode == CLUSTER_KEY) salt ^= __be32_to_cpu(ehdr->addr); else if (__dnode) salt ^= tipc_node_get_addr(__dnode); memcpy(iv, &salt, 4); memcpy(iv + 4, (u8 *)&ehdr->seqno, 8); /* Prepare request */ ehsz = tipc_ehdr_size(ehdr); aead_request_set_tfm(req, tfm); aead_request_set_ad(req, ehsz); aead_request_set_crypt(req, sg, sg, len - ehsz, iv); /* Set callback function & data */ aead_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG, tipc_aead_encrypt_done, skb); tx_ctx = (struct tipc_crypto_tx_ctx *)ctx; tx_ctx->aead = aead; tx_ctx->bearer = b; memcpy(&tx_ctx->dst, dst, sizeof(*dst)); /* Hold bearer */ if (unlikely(!tipc_bearer_hold(b))) { rc = -ENODEV; goto exit; } /* Now, do encrypt */ rc = crypto_aead_encrypt(req); if (rc == -EINPROGRESS || rc == -EBUSY) return rc; tipc_bearer_put(b); exit: kfree(ctx); TIPC_SKB_CB(skb)->crypto_ctx = NULL; return rc; } static void tipc_aead_encrypt_done(void *data, int err) { struct sk_buff *skb = data; struct tipc_crypto_tx_ctx *tx_ctx = TIPC_SKB_CB(skb)->crypto_ctx; struct tipc_bearer *b = tx_ctx->bearer; struct tipc_aead *aead = tx_ctx->aead; struct tipc_crypto *tx = aead->crypto; struct net *net = tx->net; switch (err) { case 0: this_cpu_inc(tx->stats->stat[STAT_ASYNC_OK]); rcu_read_lock(); if (likely(test_bit(0, &b->up))) b->media->send_msg(net, skb, b, &tx_ctx->dst); else kfree_skb(skb); rcu_read_unlock(); break; case -EINPROGRESS: return; default: this_cpu_inc(tx->stats->stat[STAT_ASYNC_NOK]); kfree_skb(skb); break; } kfree(tx_ctx); tipc_bearer_put(b); tipc_aead_put(aead); } /** * tipc_aead_decrypt - Decrypt an encrypted message * @net: struct net * @aead: TIPC AEAD for the message decryption * @skb: the input/output skb * @b: TIPC bearer where the message has been received * * Return: * * 0 : if the decryption has completed * * -EINPROGRESS/-EBUSY : if a callback will be performed * * < 0 : the decryption has failed */ static int tipc_aead_decrypt(struct net *net, struct tipc_aead *aead, struct sk_buff *skb, struct tipc_bearer *b) { struct tipc_crypto_rx_ctx *rx_ctx; struct aead_request *req; struct crypto_aead *tfm; struct sk_buff *unused; struct scatterlist *sg; struct tipc_ehdr *ehdr; int ehsz, nsg, rc; void *ctx; u32 salt; u8 *iv; if (unlikely(!aead)) return -ENOKEY; nsg = skb_cow_data(skb, 0, &unused); if (unlikely(nsg < 0)) { pr_err("RX: skb_cow_data() returned %d\n", nsg); return nsg; } /* Allocate memory for the AEAD operation */ tfm = tipc_aead_tfm_next(aead); ctx = tipc_aead_mem_alloc(tfm, sizeof(*rx_ctx), &iv, &req, &sg, nsg); if (unlikely(!ctx)) return -ENOMEM; TIPC_SKB_CB(skb)->crypto_ctx = ctx; /* Map skb to the sg lists */ sg_init_table(sg, nsg); rc = skb_to_sgvec(skb, sg, 0, skb->len); if (unlikely(rc < 0)) { pr_err("RX: skb_to_sgvec() returned %d, nsg %d\n", rc, nsg); goto exit; } /* Reconstruct IV: */ ehdr = (struct tipc_ehdr *)skb->data; salt = aead->salt; if (aead->mode == CLUSTER_KEY) salt ^= __be32_to_cpu(ehdr->addr); else if (ehdr->destined) salt ^= tipc_own_addr(net); memcpy(iv, &salt, 4); memcpy(iv + 4, (u8 *)&ehdr->seqno, 8); /* Prepare request */ ehsz = tipc_ehdr_size(ehdr); aead_request_set_tfm(req, tfm); aead_request_set_ad(req, ehsz); aead_request_set_crypt(req, sg, sg, skb->len - ehsz, iv); /* Set callback function & data */ aead_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG, tipc_aead_decrypt_done, skb); rx_ctx = (struct tipc_crypto_rx_ctx *)ctx; rx_ctx->aead = aead; rx_ctx->bearer = b; /* Hold bearer */ if (unlikely(!tipc_bearer_hold(b))) { rc = -ENODEV; goto exit; } /* Now, do decrypt */ rc = crypto_aead_decrypt(req); if (rc == -EINPROGRESS || rc == -EBUSY) return rc; tipc_bearer_put(b); exit: kfree(ctx); TIPC_SKB_CB(skb)->crypto_ctx = NULL; return rc; } static void tipc_aead_decrypt_done(void *data, int err) { struct sk_buff *skb = data; struct tipc_crypto_rx_ctx *rx_ctx = TIPC_SKB_CB(skb)->crypto_ctx; struct tipc_bearer *b = rx_ctx->bearer; struct tipc_aead *aead = rx_ctx->aead; struct tipc_crypto_stats __percpu *stats = aead->crypto->stats; struct net *net = aead->crypto->net; switch (err) { case 0: this_cpu_inc(stats->stat[STAT_ASYNC_OK]); break; case -EINPROGRESS: return; default: this_cpu_inc(stats->stat[STAT_ASYNC_NOK]); break; } kfree(rx_ctx); tipc_crypto_rcv_complete(net, aead, b, &skb, err); if (likely(skb)) { if (likely(test_bit(0, &b->up))) tipc_rcv(net, skb, b); else kfree_skb(skb); } tipc_bearer_put(b); } static inline int tipc_ehdr_size(struct tipc_ehdr *ehdr) { return (ehdr->user != LINK_CONFIG) ? EHDR_SIZE : EHDR_CFG_SIZE; } /** * tipc_ehdr_validate - Validate an encryption message * @skb: the message buffer * * Return: "true" if this is a valid encryption message, otherwise "false" */ bool tipc_ehdr_validate(struct sk_buff *skb) { struct tipc_ehdr *ehdr; int ehsz; if (unlikely(!pskb_may_pull(skb, EHDR_MIN_SIZE))) return false; ehdr = (struct tipc_ehdr *)skb->data; if (unlikely(ehdr->version != TIPC_EVERSION)) return false; ehsz = tipc_ehdr_size(ehdr); if (unlikely(!pskb_may_pull(skb, ehsz))) return false; if (unlikely(skb->len <= ehsz + TIPC_AES_GCM_TAG_SIZE)) return false; return true; } /** * tipc_ehdr_build - Build TIPC encryption message header * @net: struct net * @aead: TX AEAD key to be used for the message encryption * @tx_key: key id used for the message encryption * @skb: input/output message skb * @__rx: RX crypto handle if dest is "known" * * Return: the header size if the building is successful, otherwise < 0 */ static int tipc_ehdr_build(struct net *net, struct tipc_aead *aead, u8 tx_key, struct sk_buff *skb, struct tipc_crypto *__rx) { struct tipc_msg *hdr = buf_msg(skb); struct tipc_ehdr *ehdr; u32 user = msg_user(hdr); u64 seqno; int ehsz; /* Make room for encryption header */ ehsz = (user != LINK_CONFIG) ? EHDR_SIZE : EHDR_CFG_SIZE; WARN_ON(skb_headroom(skb) < ehsz); ehdr = (struct tipc_ehdr *)skb_push(skb, ehsz); /* Obtain a seqno first: * Use the key seqno (= cluster wise) if dest is unknown or we're in * cluster key mode, otherwise it's better for a per-peer seqno! */ if (!__rx || aead->mode == CLUSTER_KEY) seqno = atomic64_inc_return(&aead->seqno); else seqno = atomic64_inc_return(&__rx->sndnxt); /* Revoke the key if seqno is wrapped around */ if (unlikely(!seqno)) return tipc_crypto_key_revoke(net, tx_key); /* Word 1-2 */ ehdr->seqno = cpu_to_be64(seqno); /* Words 0, 3- */ ehdr->version = TIPC_EVERSION; ehdr->user = 0; ehdr->keepalive = 0; ehdr->tx_key = tx_key; ehdr->destined = (__rx) ? 1 : 0; ehdr->rx_key_active = (__rx) ? __rx->key.active : 0; ehdr->rx_nokey = (__rx) ? __rx->nokey : 0; ehdr->master_key = aead->crypto->key_master; ehdr->reserved_1 = 0; ehdr->reserved_2 = 0; switch (user) { case LINK_CONFIG: ehdr->user = LINK_CONFIG; memcpy(ehdr->id, tipc_own_id(net), NODE_ID_LEN); break; default: if (user == LINK_PROTOCOL && msg_type(hdr) == STATE_MSG) { ehdr->user = LINK_PROTOCOL; ehdr->keepalive = msg_is_keepalive(hdr); } ehdr->addr = hdr->hdr[3]; break; } return ehsz; } static inline void tipc_crypto_key_set_state(struct tipc_crypto *c, u8 new_passive, u8 new_active, u8 new_pending) { struct tipc_key old = c->key; char buf[32]; c->key.keys = ((new_passive & KEY_MASK) << (KEY_BITS * 2)) | ((new_active & KEY_MASK) << (KEY_BITS)) | ((new_pending & KEY_MASK)); pr_debug("%s: key changing %s ::%pS\n", c->name, tipc_key_change_dump(old, c->key, buf), __builtin_return_address(0)); } /** * tipc_crypto_key_init - Initiate a new user / AEAD key * @c: TIPC crypto to which new key is attached * @ukey: the user key * @mode: the key mode (CLUSTER_KEY or PER_NODE_KEY) * @master_key: specify this is a cluster master key * * A new TIPC AEAD key will be allocated and initiated with the specified user * key, then attached to the TIPC crypto. * * Return: new key id in case of success, otherwise: < 0 */ int tipc_crypto_key_init(struct tipc_crypto *c, struct tipc_aead_key *ukey, u8 mode, bool master_key) { struct tipc_aead *aead = NULL; int rc = 0; /* Initiate with the new user key */ rc = tipc_aead_init(&aead, ukey, mode); /* Attach it to the crypto */ if (likely(!rc)) { rc = tipc_crypto_key_attach(c, aead, 0, master_key); if (rc < 0) tipc_aead_free(&aead->rcu); } return rc; } /** * tipc_crypto_key_attach - Attach a new AEAD key to TIPC crypto * @c: TIPC crypto to which the new AEAD key is attached * @aead: the new AEAD key pointer * @pos: desired slot in the crypto key array, = 0 if any! * @master_key: specify this is a cluster master key * * Return: new key id in case of success, otherwise: -EBUSY */ static int tipc_crypto_key_attach(struct tipc_crypto *c, struct tipc_aead *aead, u8 pos, bool master_key) { struct tipc_key key; int rc = -EBUSY; u8 new_key; spin_lock_bh(&c->lock); key = c->key; if (master_key) { new_key = KEY_MASTER; goto attach; } if (key.active && key.passive) goto exit; if (key.pending) { if (tipc_aead_users(c->aead[key.pending]) > 0) goto exit; /* if (pos): ok with replacing, will be aligned when needed */ /* Replace it */ new_key = key.pending; } else { if (pos) { if (key.active && pos != key_next(key.active)) { key.passive = pos; new_key = pos; goto attach; } else if (!key.active && !key.passive) { key.pending = pos; new_key = pos; goto attach; } } key.pending = key_next(key.active ?: key.passive); new_key = key.pending; } attach: aead->crypto = c; aead->gen = (is_tx(c)) ? ++c->key_gen : c->key_gen; tipc_aead_rcu_replace(c->aead[new_key], aead, &c->lock); if (likely(c->key.keys != key.keys)) tipc_crypto_key_set_state(c, key.passive, key.active, key.pending); c->working = 1; c->nokey = 0; c->key_master |= master_key; rc = new_key; exit: spin_unlock_bh(&c->lock); return rc; } void tipc_crypto_key_flush(struct tipc_crypto *c) { struct tipc_crypto *tx, *rx; int k; spin_lock_bh(&c->lock); if (is_rx(c)) { /* Try to cancel pending work */ rx = c; tx = tipc_net(rx->net)->crypto_tx; if (cancel_delayed_work(&rx->work)) { kfree(rx->skey); rx->skey = NULL; atomic_xchg(&rx->key_distr, 0); tipc_node_put(rx->node); } /* RX stopping => decrease TX key users if any */ k = atomic_xchg(&rx->peer_rx_active, 0); if (k) { tipc_aead_users_dec(tx->aead[k], 0); /* Mark the point TX key users changed */ tx->timer1 = jiffies; } } c->flags = 0; tipc_crypto_key_set_state(c, 0, 0, 0); for (k = KEY_MIN; k <= KEY_MAX; k++) tipc_crypto_key_detach(c->aead[k], &c->lock); atomic64_set(&c->sndnxt, 0); spin_unlock_bh(&c->lock); } /** * tipc_crypto_key_try_align - Align RX keys if possible * @rx: RX crypto handle * @new_pending: new pending slot if aligned (= TX key from peer) * * Peer has used an unknown key slot, this only happens when peer has left and * rejoned, or we are newcomer. * That means, there must be no active key but a pending key at unaligned slot. * If so, we try to move the pending key to the new slot. * Note: A potential passive key can exist, it will be shifted correspondingly! * * Return: "true" if key is successfully aligned, otherwise "false" */ static bool tipc_crypto_key_try_align(struct tipc_crypto *rx, u8 new_pending) { struct tipc_aead *tmp1, *tmp2 = NULL; struct tipc_key key; bool aligned = false; u8 new_passive = 0; int x; spin_lock(&rx->lock); key = rx->key; if (key.pending == new_pending) { aligned = true; goto exit; } if (key.active) goto exit; if (!key.pending) goto exit; if (tipc_aead_users(rx->aead[key.pending]) > 0) goto exit; /* Try to "isolate" this pending key first */ tmp1 = tipc_aead_rcu_ptr(rx->aead[key.pending], &rx->lock); if (!refcount_dec_if_one(&tmp1->refcnt)) goto exit; rcu_assign_pointer(rx->aead[key.pending], NULL); /* Move passive key if any */ if (key.passive) { tmp2 = rcu_replace_pointer(rx->aead[key.passive], tmp2, lockdep_is_held(&rx->lock)); x = (key.passive - key.pending + new_pending) % KEY_MAX; new_passive = (x <= 0) ? x + KEY_MAX : x; } /* Re-allocate the key(s) */ tipc_crypto_key_set_state(rx, new_passive, 0, new_pending); rcu_assign_pointer(rx->aead[new_pending], tmp1); if (new_passive) rcu_assign_pointer(rx->aead[new_passive], tmp2); refcount_set(&tmp1->refcnt, 1); aligned = true; pr_info_ratelimited("%s: key[%d] -> key[%d]\n", rx->name, key.pending, new_pending); exit: spin_unlock(&rx->lock); return aligned; } /** * tipc_crypto_key_pick_tx - Pick one TX key for message decryption * @tx: TX crypto handle * @rx: RX crypto handle (can be NULL) * @skb: the message skb which will be decrypted later * @tx_key: peer TX key id * * This function looks up the existing TX keys and pick one which is suitable * for the message decryption, that must be a cluster key and not used before * on the same message (i.e. recursive). * * Return: the TX AEAD key handle in case of success, otherwise NULL */ static struct tipc_aead *tipc_crypto_key_pick_tx(struct tipc_crypto *tx, struct tipc_crypto *rx, struct sk_buff *skb, u8 tx_key) { struct tipc_skb_cb *skb_cb = TIPC_SKB_CB(skb); struct tipc_aead *aead = NULL; struct tipc_key key = tx->key; u8 k, i = 0; /* Initialize data if not yet */ if (!skb_cb->tx_clone_deferred) { skb_cb->tx_clone_deferred = 1; memset(&skb_cb->tx_clone_ctx, 0, sizeof(skb_cb->tx_clone_ctx)); } skb_cb->tx_clone_ctx.rx = rx; if (++skb_cb->tx_clone_ctx.recurs > 2) return NULL; /* Pick one TX key */ spin_lock(&tx->lock); if (tx_key == KEY_MASTER) { aead = tipc_aead_rcu_ptr(tx->aead[KEY_MASTER], &tx->lock); goto done; } do { k = (i == 0) ? key.pending : ((i == 1) ? key.active : key.passive); if (!k) continue; aead = tipc_aead_rcu_ptr(tx->aead[k], &tx->lock); if (!aead) continue; if (aead->mode != CLUSTER_KEY || aead == skb_cb->tx_clone_ctx.last) { aead = NULL; continue; } /* Ok, found one cluster key */ skb_cb->tx_clone_ctx.last = aead; WARN_ON(skb->next); skb->next = skb_clone(skb, GFP_ATOMIC); if (unlikely(!skb->next)) pr_warn("Failed to clone skb for next round if any\n"); break; } while (++i < 3); done: if (likely(aead)) WARN_ON(!refcount_inc_not_zero(&aead->refcnt)); spin_unlock(&tx->lock); return aead; } /** * tipc_crypto_key_synch: Synch own key data according to peer key status * @rx: RX crypto handle * @skb: TIPCv2 message buffer (incl. the ehdr from peer) * * This function updates the peer node related data as the peer RX active key * has changed, so the number of TX keys' users on this node are increased and * decreased correspondingly. * * It also considers if peer has no key, then we need to make own master key * (if any) taking over i.e. starting grace period and also trigger key * distributing process. * * The "per-peer" sndnxt is also reset when the peer key has switched. */ static void tipc_crypto_key_synch(struct tipc_crypto *rx, struct sk_buff *skb) { struct tipc_ehdr *ehdr = (struct tipc_ehdr *)skb_network_header(skb); struct tipc_crypto *tx = tipc_net(rx->net)->crypto_tx; struct tipc_msg *hdr = buf_msg(skb); u32 self = tipc_own_addr(rx->net); u8 cur, new; unsigned long delay; /* Update RX 'key_master' flag according to peer, also mark "legacy" if * a peer has no master key. */ rx->key_master = ehdr->master_key; if (!rx->key_master) tx->legacy_user = 1; /* For later cases, apply only if message is destined to this node */ if (!ehdr->destined || msg_short(hdr) || msg_destnode(hdr) != self) return; /* Case 1: Peer has no keys, let's make master key take over */ if (ehdr->rx_nokey) { /* Set or extend grace period */ tx->timer2 = jiffies; /* Schedule key distributing for the peer if not yet */ if (tx->key.keys && !atomic_cmpxchg(&rx->key_distr, 0, KEY_DISTR_SCHED)) { get_random_bytes(&delay, 2); delay %= 5; delay = msecs_to_jiffies(500 * ++delay); if (queue_delayed_work(tx->wq, &rx->work, delay)) tipc_node_get(rx->node); } } else { /* Cancel a pending key distributing if any */ atomic_xchg(&rx->key_distr, 0); } /* Case 2: Peer RX active key has changed, let's update own TX users */ cur = atomic_read(&rx->peer_rx_active); new = ehdr->rx_key_active; if (tx->key.keys && cur != new && atomic_cmpxchg(&rx->peer_rx_active, cur, new) == cur) { if (new) tipc_aead_users_inc(tx->aead[new], INT_MAX); if (cur) tipc_aead_users_dec(tx->aead[cur], 0); atomic64_set(&rx->sndnxt, 0); /* Mark the point TX key users changed */ tx->timer1 = jiffies; pr_debug("%s: key users changed %d-- %d++, peer %s\n", tx->name, cur, new, rx->name); } } static int tipc_crypto_key_revoke(struct net *net, u8 tx_key) { struct tipc_crypto *tx = tipc_net(net)->crypto_tx; struct tipc_key key; spin_lock_bh(&tx->lock); key = tx->key; WARN_ON(!key.active || tx_key != key.active); /* Free the active key */ tipc_crypto_key_set_state(tx, key.passive, 0, key.pending); tipc_crypto_key_detach(tx->aead[key.active], &tx->lock); spin_unlock_bh(&tx->lock); pr_warn("%s: key is revoked\n", tx->name); return -EKEYREVOKED; } int tipc_crypto_start(struct tipc_crypto **crypto, struct net *net, struct tipc_node *node) { struct tipc_crypto *c; if (*crypto) return -EEXIST; /* Allocate crypto */ c = kzalloc(sizeof(*c), GFP_ATOMIC); if (!c) return -ENOMEM; /* Allocate workqueue on TX */ if (!node) { c->wq = alloc_ordered_workqueue("tipc_crypto", 0); if (!c->wq) { kfree(c); return -ENOMEM; } } /* Allocate statistic structure */ c->stats = alloc_percpu_gfp(struct tipc_crypto_stats, GFP_ATOMIC); if (!c->stats) { if (c->wq) destroy_workqueue(c->wq); kfree_sensitive(c); return -ENOMEM; } c->flags = 0; c->net = net; c->node = node; get_random_bytes(&c->key_gen, 2); tipc_crypto_key_set_state(c, 0, 0, 0); atomic_set(&c->key_distr, 0); atomic_set(&c->peer_rx_active, 0); atomic64_set(&c->sndnxt, 0); c->timer1 = jiffies; c->timer2 = jiffies; c->rekeying_intv = TIPC_REKEYING_INTV_DEF; spin_lock_init(&c->lock); scnprintf(c->name, 48, "%s(%s)", (is_rx(c)) ? "RX" : "TX", (is_rx(c)) ? tipc_node_get_id_str(c->node) : tipc_own_id_string(c->net)); if (is_rx(c)) INIT_DELAYED_WORK(&c->work, tipc_crypto_work_rx); else INIT_DELAYED_WORK(&c->work, tipc_crypto_work_tx); *crypto = c; return 0; } void tipc_crypto_stop(struct tipc_crypto **crypto) { struct tipc_crypto *c = *crypto; u8 k; if (!c) return; /* Flush any queued works & destroy wq */ if (is_tx(c)) { c->rekeying_intv = 0; cancel_delayed_work_sync(&c->work); destroy_workqueue(c->wq); } /* Release AEAD keys */ rcu_read_lock(); for (k = KEY_MIN; k <= KEY_MAX; k++) tipc_aead_put(rcu_dereference(c->aead[k])); rcu_read_unlock(); pr_debug("%s: has been stopped\n", c->name); /* Free this crypto statistics */ free_percpu(c->stats); *crypto = NULL; kfree_sensitive(c); } void tipc_crypto_timeout(struct tipc_crypto *rx) { struct tipc_net *tn = tipc_net(rx->net); struct tipc_crypto *tx = tn->crypto_tx; struct tipc_key key; int cmd; /* TX pending: taking all users & stable -> active */ spin_lock(&tx->lock); key = tx->key; if (key.active && tipc_aead_users(tx->aead[key.active]) > 0) goto s1; if (!key.pending || tipc_aead_users(tx->aead[key.pending]) <= 0) goto s1; if (time_before(jiffies, tx->timer1 + TIPC_TX_LASTING_TIME)) goto s1; tipc_crypto_key_set_state(tx, key.passive, key.pending, 0); if (key.active) tipc_crypto_key_detach(tx->aead[key.active], &tx->lock); this_cpu_inc(tx->stats->stat[STAT_SWITCHES]); pr_info("%s: key[%d] is activated\n", tx->name, key.pending); s1: spin_unlock(&tx->lock); /* RX pending: having user -> active */ spin_lock(&rx->lock); key = rx->key; if (!key.pending || tipc_aead_users(rx->aead[key.pending]) <= 0) goto s2; if (key.active) key.passive = key.active; key.active = key.pending; rx->timer2 = jiffies; tipc_crypto_key_set_state(rx, key.passive, key.active, 0); this_cpu_inc(rx->stats->stat[STAT_SWITCHES]); pr_info("%s: key[%d] is activated\n", rx->name, key.pending); goto s5; s2: /* RX pending: not working -> remove */ if (!key.pending || tipc_aead_users(rx->aead[key.pending]) > -10) goto s3; tipc_crypto_key_set_state(rx, key.passive, key.active, 0); tipc_crypto_key_detach(rx->aead[key.pending], &rx->lock); pr_debug("%s: key[%d] is removed\n", rx->name, key.pending); goto s5; s3: /* RX active: timed out or no user -> pending */ if (!key.active) goto s4; if (time_before(jiffies, rx->timer1 + TIPC_RX_ACTIVE_LIM) && tipc_aead_users(rx->aead[key.active]) > 0) goto s4; if (key.pending) key.passive = key.active; else key.pending = key.active; rx->timer2 = jiffies; tipc_crypto_key_set_state(rx, key.passive, 0, key.pending); tipc_aead_users_set(rx->aead[key.pending], 0); pr_debug("%s: key[%d] is deactivated\n", rx->name, key.active); goto s5; s4: /* RX passive: outdated or not working -> free */ if (!key.passive) goto s5; if (time_before(jiffies, rx->timer2 + TIPC_RX_PASSIVE_LIM) && tipc_aead_users(rx->aead[key.passive]) > -10) goto s5; tipc_crypto_key_set_state(rx, 0, key.active, key.pending); tipc_crypto_key_detach(rx->aead[key.passive], &rx->lock); pr_debug("%s: key[%d] is freed\n", rx->name, key.passive); s5: spin_unlock(&rx->lock); /* Relax it here, the flag will be set again if it really is, but only * when we are not in grace period for safety! */ if (time_after(jiffies, tx->timer2 + TIPC_TX_GRACE_PERIOD)) tx->legacy_user = 0; /* Limit max_tfms & do debug commands if needed */ if (likely(sysctl_tipc_max_tfms <= TIPC_MAX_TFMS_LIM)) return; cmd = sysctl_tipc_max_tfms; sysctl_tipc_max_tfms = TIPC_MAX_TFMS_DEF; tipc_crypto_do_cmd(rx->net, cmd); } static inline void tipc_crypto_clone_msg(struct net *net, struct sk_buff *_skb, struct tipc_bearer *b, struct tipc_media_addr *dst, struct tipc_node *__dnode, u8 type) { struct sk_buff *skb; skb = skb_clone(_skb, GFP_ATOMIC); if (skb) { TIPC_SKB_CB(skb)->xmit_type = type; tipc_crypto_xmit(net, &skb, b, dst, __dnode); if (skb) b->media->send_msg(net, skb, b, dst); } } /** * tipc_crypto_xmit - Build & encrypt TIPC message for xmit * @net: struct net * @skb: input/output message skb pointer * @b: bearer used for xmit later * @dst: destination media address * @__dnode: destination node for reference if any * * First, build an encryption message header on the top of the message, then * encrypt the original TIPC message by using the pending, master or active * key with this preference order. * If the encryption is successful, the encrypted skb is returned directly or * via the callback. * Otherwise, the skb is freed! * * Return: * * 0 : the encryption has succeeded (or no encryption) * * -EINPROGRESS/-EBUSY : the encryption is ongoing, a callback will be made * * -ENOKEK : the encryption has failed due to no key * * -EKEYREVOKED : the encryption has failed due to key revoked * * -ENOMEM : the encryption has failed due to no memory * * < 0 : the encryption has failed due to other reasons */ int tipc_crypto_xmit(struct net *net, struct sk_buff **skb, struct tipc_bearer *b, struct tipc_media_addr *dst, struct tipc_node *__dnode) { struct tipc_crypto *__rx = tipc_node_crypto_rx(__dnode); struct tipc_crypto *tx = tipc_net(net)->crypto_tx; struct tipc_crypto_stats __percpu *stats = tx->stats; struct tipc_msg *hdr = buf_msg(*skb); struct tipc_key key = tx->key; struct tipc_aead *aead = NULL; u32 user = msg_user(hdr); u32 type = msg_type(hdr); int rc = -ENOKEY; u8 tx_key = 0; /* No encryption? */ if (!tx->working) return 0; /* Pending key if peer has active on it or probing time */ if (unlikely(key.pending)) { tx_key = key.pending; if (!tx->key_master && !key.active) goto encrypt; if (__rx && atomic_read(&__rx->peer_rx_active) == tx_key) goto encrypt; if (TIPC_SKB_CB(*skb)->xmit_type == SKB_PROBING) { pr_debug("%s: probing for key[%d]\n", tx->name, key.pending); goto encrypt; } if (user == LINK_CONFIG || user == LINK_PROTOCOL) tipc_crypto_clone_msg(net, *skb, b, dst, __dnode, SKB_PROBING); } /* Master key if this is a *vital* message or in grace period */ if (tx->key_master) { tx_key = KEY_MASTER; if (!key.active) goto encrypt; if (TIPC_SKB_CB(*skb)->xmit_type == SKB_GRACING) { pr_debug("%s: gracing for msg (%d %d)\n", tx->name, user, type); goto encrypt; } if (user == LINK_CONFIG || (user == LINK_PROTOCOL && type == RESET_MSG) || (user == MSG_CRYPTO && type == KEY_DISTR_MSG) || time_before(jiffies, tx->timer2 + TIPC_TX_GRACE_PERIOD)) { if (__rx && __rx->key_master && !atomic_read(&__rx->peer_rx_active)) goto encrypt; if (!__rx) { if (likely(!tx->legacy_user)) goto encrypt; tipc_crypto_clone_msg(net, *skb, b, dst, __dnode, SKB_GRACING); } } } /* Else, use the active key if any */ if (likely(key.active)) { tx_key = key.active; goto encrypt; } goto exit; encrypt: aead = tipc_aead_get(tx->aead[tx_key]); if (unlikely(!aead)) goto exit; rc = tipc_ehdr_build(net, aead, tx_key, *skb, __rx); if (likely(rc > 0)) rc = tipc_aead_encrypt(aead, *skb, b, dst, __dnode); exit: switch (rc) { case 0: this_cpu_inc(stats->stat[STAT_OK]); break; case -EINPROGRESS: case -EBUSY: this_cpu_inc(stats->stat[STAT_ASYNC]); *skb = NULL; return rc; default: this_cpu_inc(stats->stat[STAT_NOK]); if (rc == -ENOKEY) this_cpu_inc(stats->stat[STAT_NOKEYS]); else if (rc == -EKEYREVOKED) this_cpu_inc(stats->stat[STAT_BADKEYS]); kfree_skb(*skb); *skb = NULL; break; } tipc_aead_put(aead); return rc; } /** * tipc_crypto_rcv - Decrypt an encrypted TIPC message from peer * @net: struct net * @rx: RX crypto handle * @skb: input/output message skb pointer * @b: bearer where the message has been received * * If the decryption is successful, the decrypted skb is returned directly or * as the callback, the encryption header and auth tag will be trimed out * before forwarding to tipc_rcv() via the tipc_crypto_rcv_complete(). * Otherwise, the skb will be freed! * Note: RX key(s) can be re-aligned, or in case of no key suitable, TX * cluster key(s) can be taken for decryption (- recursive). * * Return: * * 0 : the decryption has successfully completed * * -EINPROGRESS/-EBUSY : the decryption is ongoing, a callback will be made * * -ENOKEY : the decryption has failed due to no key * * -EBADMSG : the decryption has failed due to bad message * * -ENOMEM : the decryption has failed due to no memory * * < 0 : the decryption has failed due to other reasons */ int tipc_crypto_rcv(struct net *net, struct tipc_crypto *rx, struct sk_buff **skb, struct tipc_bearer *b) { struct tipc_crypto *tx = tipc_net(net)->crypto_tx; struct tipc_crypto_stats __percpu *stats; struct tipc_aead *aead = NULL; struct tipc_key key; int rc = -ENOKEY; u8 tx_key, n; tx_key = ((struct tipc_ehdr *)(*skb)->data)->tx_key; /* New peer? * Let's try with TX key (i.e. cluster mode) & verify the skb first! */ if (unlikely(!rx || tx_key == KEY_MASTER)) goto pick_tx; /* Pick RX key according to TX key if any */ key = rx->key; if (tx_key == key.active || tx_key == key.pending || tx_key == key.passive) goto decrypt; /* Unknown key, let's try to align RX key(s) */ if (tipc_crypto_key_try_align(rx, tx_key)) goto decrypt; pick_tx: /* No key suitable? Try to pick one from TX... */ aead = tipc_crypto_key_pick_tx(tx, rx, *skb, tx_key); if (aead) goto decrypt; goto exit; decrypt: rcu_read_lock(); if (!aead) aead = tipc_aead_get(rx->aead[tx_key]); rc = tipc_aead_decrypt(net, aead, *skb, b); rcu_read_unlock(); exit: stats = ((rx) ?: tx)->stats; switch (rc) { case 0: this_cpu_inc(stats->stat[STAT_OK]); break; case -EINPROGRESS: case -EBUSY: this_cpu_inc(stats->stat[STAT_ASYNC]); *skb = NULL; return rc; default: this_cpu_inc(stats->stat[STAT_NOK]); if (rc == -ENOKEY) { kfree_skb(*skb); *skb = NULL; if (rx) { /* Mark rx->nokey only if we dont have a * pending received session key, nor a newer * one i.e. in the next slot. */ n = key_next(tx_key); rx->nokey = !(rx->skey || rcu_access_pointer(rx->aead[n])); pr_debug_ratelimited("%s: nokey %d, key %d/%x\n", rx->name, rx->nokey, tx_key, rx->key.keys); tipc_node_put(rx->node); } this_cpu_inc(stats->stat[STAT_NOKEYS]); return rc; } else if (rc == -EBADMSG) { this_cpu_inc(stats->stat[STAT_BADMSGS]); } break; } tipc_crypto_rcv_complete(net, aead, b, skb, rc); return rc; } static void tipc_crypto_rcv_complete(struct net *net, struct tipc_aead *aead, struct tipc_bearer *b, struct sk_buff **skb, int err) { struct tipc_skb_cb *skb_cb = TIPC_SKB_CB(*skb); struct tipc_crypto *rx = aead->crypto; struct tipc_aead *tmp = NULL; struct tipc_ehdr *ehdr; struct tipc_node *n; /* Is this completed by TX? */ if (unlikely(is_tx(aead->crypto))) { rx = skb_cb->tx_clone_ctx.rx; pr_debug("TX->RX(%s): err %d, aead %p, skb->next %p, flags %x\n", (rx) ? tipc_node_get_id_str(rx->node) : "-", err, aead, (*skb)->next, skb_cb->flags); pr_debug("skb_cb [recurs %d, last %p], tx->aead [%p %p %p]\n", skb_cb->tx_clone_ctx.recurs, skb_cb->tx_clone_ctx.last, aead->crypto->aead[1], aead->crypto->aead[2], aead->crypto->aead[3]); if (unlikely(err)) { if (err == -EBADMSG && (*skb)->next) tipc_rcv(net, (*skb)->next, b); goto free_skb; } if (likely((*skb)->next)) { kfree_skb((*skb)->next); (*skb)->next = NULL; } ehdr = (struct tipc_ehdr *)(*skb)->data; if (!rx) { WARN_ON(ehdr->user != LINK_CONFIG); n = tipc_node_create(net, 0, ehdr->id, 0xffffu, 0, true); rx = tipc_node_crypto_rx(n); if (unlikely(!rx)) goto free_skb; } /* Ignore cloning if it was TX master key */ if (ehdr->tx_key == KEY_MASTER) goto rcv; if (tipc_aead_clone(&tmp, aead) < 0) goto rcv; WARN_ON(!refcount_inc_not_zero(&tmp->refcnt)); if (tipc_crypto_key_attach(rx, tmp, ehdr->tx_key, false) < 0) { tipc_aead_free(&tmp->rcu); goto rcv; } tipc_aead_put(aead); aead = tmp; } if (unlikely(err)) { tipc_aead_users_dec((struct tipc_aead __force __rcu *)aead, INT_MIN); goto free_skb; } /* Set the RX key's user */ tipc_aead_users_set((struct tipc_aead __force __rcu *)aead, 1); /* Mark this point, RX works */ rx->timer1 = jiffies; rcv: /* Remove ehdr & auth. tag prior to tipc_rcv() */ ehdr = (struct tipc_ehdr *)(*skb)->data; /* Mark this point, RX passive still works */ if (rx->key.passive && ehdr->tx_key == rx->key.passive) rx->timer2 = jiffies; skb_reset_network_header(*skb); skb_pull(*skb, tipc_ehdr_size(ehdr)); if (pskb_trim(*skb, (*skb)->len - aead->authsize)) goto free_skb; /* Validate TIPCv2 message */ if (unlikely(!tipc_msg_validate(skb))) { pr_err_ratelimited("Packet dropped after decryption!\n"); goto free_skb; } /* Ok, everything's fine, try to synch own keys according to peers' */ tipc_crypto_key_synch(rx, *skb); /* Re-fetch skb cb as skb might be changed in tipc_msg_validate */ skb_cb = TIPC_SKB_CB(*skb); /* Mark skb decrypted */ skb_cb->decrypted = 1; /* Clear clone cxt if any */ if (likely(!skb_cb->tx_clone_deferred)) goto exit; skb_cb->tx_clone_deferred = 0; memset(&skb_cb->tx_clone_ctx, 0, sizeof(skb_cb->tx_clone_ctx)); goto exit; free_skb: kfree_skb(*skb); *skb = NULL; exit: tipc_aead_put(aead); if (rx) tipc_node_put(rx->node); } static void tipc_crypto_do_cmd(struct net *net, int cmd) { struct tipc_net *tn = tipc_net(net); struct tipc_crypto *tx = tn->crypto_tx, *rx; struct list_head *p; unsigned int stat; int i, j, cpu; char buf[200]; /* Currently only one command is supported */ switch (cmd) { case 0xfff1: goto print_stats; default: return; } print_stats: /* Print a header */ pr_info("\n=============== TIPC Crypto Statistics ===============\n\n"); /* Print key status */ pr_info("Key status:\n"); pr_info("TX(%7.7s)\n%s", tipc_own_id_string(net), tipc_crypto_key_dump(tx, buf)); rcu_read_lock(); for (p = tn->node_list.next; p != &tn->node_list; p = p->next) { rx = tipc_node_crypto_rx_by_list(p); pr_info("RX(%7.7s)\n%s", tipc_node_get_id_str(rx->node), tipc_crypto_key_dump(rx, buf)); } rcu_read_unlock(); /* Print crypto statistics */ for (i = 0, j = 0; i < MAX_STATS; i++) j += scnprintf(buf + j, 200 - j, "|%11s ", hstats[i]); pr_info("Counter %s", buf); memset(buf, '-', 115); buf[115] = '\0'; pr_info("%s\n", buf); j = scnprintf(buf, 200, "TX(%7.7s) ", tipc_own_id_string(net)); for_each_possible_cpu(cpu) { for (i = 0; i < MAX_STATS; i++) { stat = per_cpu_ptr(tx->stats, cpu)->stat[i]; j += scnprintf(buf + j, 200 - j, "|%11d ", stat); } pr_info("%s", buf); j = scnprintf(buf, 200, "%12s", " "); } rcu_read_lock(); for (p = tn->node_list.next; p != &tn->node_list; p = p->next) { rx = tipc_node_crypto_rx_by_list(p); j = scnprintf(buf, 200, "RX(%7.7s) ", tipc_node_get_id_str(rx->node)); for_each_possible_cpu(cpu) { for (i = 0; i < MAX_STATS; i++) { stat = per_cpu_ptr(rx->stats, cpu)->stat[i]; j += scnprintf(buf + j, 200 - j, "|%11d ", stat); } pr_info("%s", buf); j = scnprintf(buf, 200, "%12s", " "); } } rcu_read_unlock(); pr_info("\n======================== Done ========================\n"); } static char *tipc_crypto_key_dump(struct tipc_crypto *c, char *buf) { struct tipc_key key = c->key; struct tipc_aead *aead; int k, i = 0; char *s; for (k = KEY_MIN; k <= KEY_MAX; k++) { if (k == KEY_MASTER) { if (is_rx(c)) continue; if (time_before(jiffies, c->timer2 + TIPC_TX_GRACE_PERIOD)) s = "ACT"; else s = "PAS"; } else { if (k == key.passive) s = "PAS"; else if (k == key.active) s = "ACT"; else if (k == key.pending) s = "PEN"; else s = "-"; } i += scnprintf(buf + i, 200 - i, "\tKey%d: %s", k, s); rcu_read_lock(); aead = rcu_dereference(c->aead[k]); if (aead) i += scnprintf(buf + i, 200 - i, "{\"0x...%s\", \"%s\"}/%d:%d", aead->hint, (aead->mode == CLUSTER_KEY) ? "c" : "p", atomic_read(&aead->users), refcount_read(&aead->refcnt)); rcu_read_unlock(); i += scnprintf(buf + i, 200 - i, "\n"); } if (is_rx(c)) i += scnprintf(buf + i, 200 - i, "\tPeer RX active: %d\n", atomic_read(&c->peer_rx_active)); return buf; } static char *tipc_key_change_dump(struct tipc_key old, struct tipc_key new, char *buf) { struct tipc_key *key = &old; int k, i = 0; char *s; /* Output format: "[%s %s %s] -> [%s %s %s]", max len = 32 */ again: i += scnprintf(buf + i, 32 - i, "["); for (k = KEY_1; k <= KEY_3; k++) { if (k == key->passive) s = "pas"; else if (k == key->active) s = "act"; else if (k == key->pending) s = "pen"; else s = "-"; i += scnprintf(buf + i, 32 - i, (k != KEY_3) ? "%s " : "%s", s); } if (key != &new) { i += scnprintf(buf + i, 32 - i, "] -> "); key = &new; goto again; } i += scnprintf(buf + i, 32 - i, "]"); return buf; } /** * tipc_crypto_msg_rcv - Common 'MSG_CRYPTO' processing point * @net: the struct net * @skb: the receiving message buffer */ void tipc_crypto_msg_rcv(struct net *net, struct sk_buff *skb) { struct tipc_crypto *rx; struct tipc_msg *hdr; if (unlikely(skb_linearize(skb))) goto exit; hdr = buf_msg(skb); rx = tipc_node_crypto_rx_by_addr(net, msg_prevnode(hdr)); if (unlikely(!rx)) goto exit; switch (msg_type(hdr)) { case KEY_DISTR_MSG: if (tipc_crypto_key_rcv(rx, hdr)) goto exit; break; default: break; } tipc_node_put(rx->node); exit: kfree_skb(skb); } /** * tipc_crypto_key_distr - Distribute a TX key * @tx: the TX crypto * @key: the key's index * @dest: the destination tipc node, = NULL if distributing to all nodes * * Return: 0 in case of success, otherwise < 0 */ int tipc_crypto_key_distr(struct tipc_crypto *tx, u8 key, struct tipc_node *dest) { struct tipc_aead *aead; u32 dnode = tipc_node_get_addr(dest); int rc = -ENOKEY; if (!sysctl_tipc_key_exchange_enabled) return 0; if (key) { rcu_read_lock(); aead = tipc_aead_get(tx->aead[key]); if (likely(aead)) { rc = tipc_crypto_key_xmit(tx->net, aead->key, aead->gen, aead->mode, dnode); tipc_aead_put(aead); } rcu_read_unlock(); } return rc; } /** * tipc_crypto_key_xmit - Send a session key * @net: the struct net * @skey: the session key to be sent * @gen: the key's generation * @mode: the key's mode * @dnode: the destination node address, = 0 if broadcasting to all nodes * * The session key 'skey' is packed in a TIPC v2 'MSG_CRYPTO/KEY_DISTR_MSG' * as its data section, then xmit-ed through the uc/bc link. * * Return: 0 in case of success, otherwise < 0 */ static int tipc_crypto_key_xmit(struct net *net, struct tipc_aead_key *skey, u16 gen, u8 mode, u32 dnode) { struct sk_buff_head pkts; struct tipc_msg *hdr; struct sk_buff *skb; u16 size, cong_link_cnt; u8 *data; int rc; size = tipc_aead_key_size(skey); skb = tipc_buf_acquire(INT_H_SIZE + size, GFP_ATOMIC); if (!skb) return -ENOMEM; hdr = buf_msg(skb); tipc_msg_init(tipc_own_addr(net), hdr, MSG_CRYPTO, KEY_DISTR_MSG, INT_H_SIZE, dnode); msg_set_size(hdr, INT_H_SIZE + size); msg_set_key_gen(hdr, gen); msg_set_key_mode(hdr, mode); data = msg_data(hdr); *((__be32 *)(data + TIPC_AEAD_ALG_NAME)) = htonl(skey->keylen); memcpy(data, skey->alg_name, TIPC_AEAD_ALG_NAME); memcpy(data + TIPC_AEAD_ALG_NAME + sizeof(__be32), skey->key, skey->keylen); __skb_queue_head_init(&pkts); __skb_queue_tail(&pkts, skb); if (dnode) rc = tipc_node_xmit(net, &pkts, dnode, 0); else rc = tipc_bcast_xmit(net, &pkts, &cong_link_cnt); return rc; } /** * tipc_crypto_key_rcv - Receive a session key * @rx: the RX crypto * @hdr: the TIPC v2 message incl. the receiving session key in its data * * This function retrieves the session key in the message from peer, then * schedules a RX work to attach the key to the corresponding RX crypto. * * Return: "true" if the key has been scheduled for attaching, otherwise * "false". */ static bool tipc_crypto_key_rcv(struct tipc_crypto *rx, struct tipc_msg *hdr) { struct tipc_crypto *tx = tipc_net(rx->net)->crypto_tx; struct tipc_aead_key *skey = NULL; u16 key_gen = msg_key_gen(hdr); u32 size = msg_data_sz(hdr); u8 *data = msg_data(hdr); unsigned int keylen; /* Verify whether the size can exist in the packet */ if (unlikely(size < sizeof(struct tipc_aead_key) + TIPC_AEAD_KEYLEN_MIN)) { pr_debug("%s: message data size is too small\n", rx->name); goto exit; } keylen = ntohl(*((__be32 *)(data + TIPC_AEAD_ALG_NAME))); /* Verify the supplied size values */ if (unlikely(size != keylen + sizeof(struct tipc_aead_key) || keylen > TIPC_AEAD_KEY_SIZE_MAX)) { pr_debug("%s: invalid MSG_CRYPTO key size\n", rx->name); goto exit; } spin_lock(&rx->lock); if (unlikely(rx->skey || (key_gen == rx->key_gen && rx->key.keys))) { pr_err("%s: key existed <%p>, gen %d vs %d\n", rx->name, rx->skey, key_gen, rx->key_gen); goto exit_unlock; } /* Allocate memory for the key */ skey = kmalloc(size, GFP_ATOMIC); if (unlikely(!skey)) { pr_err("%s: unable to allocate memory for skey\n", rx->name); goto exit_unlock; } /* Copy key from msg data */ skey->keylen = keylen; memcpy(skey->alg_name, data, TIPC_AEAD_ALG_NAME); memcpy(skey->key, data + TIPC_AEAD_ALG_NAME + sizeof(__be32), skey->keylen); rx->key_gen = key_gen; rx->skey_mode = msg_key_mode(hdr); rx->skey = skey; rx->nokey = 0; mb(); /* for nokey flag */ exit_unlock: spin_unlock(&rx->lock); exit: /* Schedule the key attaching on this crypto */ if (likely(skey && queue_delayed_work(tx->wq, &rx->work, 0))) return true; return false; } /** * tipc_crypto_work_rx - Scheduled RX works handler * @work: the struct RX work * * The function processes the previous scheduled works i.e. distributing TX key * or attaching a received session key on RX crypto. */ static void tipc_crypto_work_rx(struct work_struct *work) { struct delayed_work *dwork = to_delayed_work(work); struct tipc_crypto *rx = container_of(dwork, struct tipc_crypto, work); struct tipc_crypto *tx = tipc_net(rx->net)->crypto_tx; unsigned long delay = msecs_to_jiffies(5000); bool resched = false; u8 key; int rc; /* Case 1: Distribute TX key to peer if scheduled */ if (atomic_cmpxchg(&rx->key_distr, KEY_DISTR_SCHED, KEY_DISTR_COMPL) == KEY_DISTR_SCHED) { /* Always pick the newest one for distributing */ key = tx->key.pending ?: tx->key.active; rc = tipc_crypto_key_distr(tx, key, rx->node); if (unlikely(rc)) pr_warn("%s: unable to distr key[%d] to %s, err %d\n", tx->name, key, tipc_node_get_id_str(rx->node), rc); /* Sched for key_distr releasing */ resched = true; } else { atomic_cmpxchg(&rx->key_distr, KEY_DISTR_COMPL, 0); } /* Case 2: Attach a pending received session key from peer if any */ if (rx->skey) { rc = tipc_crypto_key_init(rx, rx->skey, rx->skey_mode, false); if (unlikely(rc < 0)) pr_warn("%s: unable to attach received skey, err %d\n", rx->name, rc); switch (rc) { case -EBUSY: case -ENOMEM: /* Resched the key attaching */ resched = true; break; default: synchronize_rcu(); kfree(rx->skey); rx->skey = NULL; break; } } if (resched && queue_delayed_work(tx->wq, &rx->work, delay)) return; tipc_node_put(rx->node); } /** * tipc_crypto_rekeying_sched - (Re)schedule rekeying w/o new interval * @tx: TX crypto * @changed: if the rekeying needs to be rescheduled with new interval * @new_intv: new rekeying interval (when "changed" = true) */ void tipc_crypto_rekeying_sched(struct tipc_crypto *tx, bool changed, u32 new_intv) { unsigned long delay; bool now = false; if (changed) { if (new_intv == TIPC_REKEYING_NOW) now = true; else tx->rekeying_intv = new_intv; cancel_delayed_work_sync(&tx->work); } if (tx->rekeying_intv || now) { delay = (now) ? 0 : tx->rekeying_intv * 60 * 1000; queue_delayed_work(tx->wq, &tx->work, msecs_to_jiffies(delay)); } } /** * tipc_crypto_work_tx - Scheduled TX works handler * @work: the struct TX work * * The function processes the previous scheduled work, i.e. key rekeying, by * generating a new session key based on current one, then attaching it to the * TX crypto and finally distributing it to peers. It also re-schedules the * rekeying if needed. */ static void tipc_crypto_work_tx(struct work_struct *work) { struct delayed_work *dwork = to_delayed_work(work); struct tipc_crypto *tx = container_of(dwork, struct tipc_crypto, work); struct tipc_aead_key *skey = NULL; struct tipc_key key = tx->key; struct tipc_aead *aead; int rc = -ENOMEM; if (unlikely(key.pending)) goto resched; /* Take current key as a template */ rcu_read_lock(); aead = rcu_dereference(tx->aead[key.active ?: KEY_MASTER]); if (unlikely(!aead)) { rcu_read_unlock(); /* At least one key should exist for securing */ return; } /* Lets duplicate it first */ skey = kmemdup(aead->key, tipc_aead_key_size(aead->key), GFP_ATOMIC); rcu_read_unlock(); /* Now, generate new key, initiate & distribute it */ if (likely(skey)) { rc = tipc_aead_key_generate(skey) ?: tipc_crypto_key_init(tx, skey, PER_NODE_KEY, false); if (likely(rc > 0)) rc = tipc_crypto_key_distr(tx, rc, NULL); kfree_sensitive(skey); } if (unlikely(rc)) pr_warn_ratelimited("%s: rekeying returns %d\n", tx->name, rc); resched: /* Re-schedule rekeying if any */ tipc_crypto_rekeying_sched(tx, false, 0); } |
11 1 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 | // SPDX-License-Identifier: GPL-2.0-only #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/types.h> #include <linux/module.h> #include <net/ip.h> #include <linux/ipv6.h> #include <linux/icmp.h> #include <net/ipv6.h> #include <net/tcp.h> #include <net/udp.h> #include <linux/netfilter/x_tables.h> #include <linux/netfilter/xt_tcpudp.h> #include <linux/netfilter_ipv4/ip_tables.h> #include <linux/netfilter_ipv6/ip6_tables.h> MODULE_DESCRIPTION("Xtables: TCP, UDP and UDP-Lite match"); MODULE_LICENSE("GPL"); MODULE_ALIAS("xt_tcp"); MODULE_ALIAS("xt_udp"); MODULE_ALIAS("ipt_udp"); MODULE_ALIAS("ipt_tcp"); MODULE_ALIAS("ip6t_udp"); MODULE_ALIAS("ip6t_tcp"); MODULE_ALIAS("ipt_icmp"); MODULE_ALIAS("ip6t_icmp6"); /* Returns 1 if the port is matched by the range, 0 otherwise */ static inline bool port_match(u_int16_t min, u_int16_t max, u_int16_t port, bool invert) { return (port >= min && port <= max) ^ invert; } static bool tcp_find_option(u_int8_t option, const struct sk_buff *skb, unsigned int protoff, unsigned int optlen, bool invert, bool *hotdrop) { /* tcp.doff is only 4 bits, ie. max 15 * 4 bytes */ const u_int8_t *op; u_int8_t _opt[60 - sizeof(struct tcphdr)]; unsigned int i; pr_debug("finding option\n"); if (!optlen) return invert; /* If we don't have the whole header, drop packet. */ op = skb_header_pointer(skb, protoff + sizeof(struct tcphdr), optlen, _opt); if (op == NULL) { *hotdrop = true; return false; } for (i = 0; i < optlen; ) { if (op[i] == option) return !invert; if (op[i] < 2) i++; else i += op[i+1]?:1; } return invert; } static bool tcp_mt(const struct sk_buff *skb, struct xt_action_param *par) { const struct tcphdr *th; struct tcphdr _tcph; const struct xt_tcp *tcpinfo = par->matchinfo; if (par->fragoff != 0) { /* To quote Alan: Don't allow a fragment of TCP 8 bytes in. Nobody normal causes this. Its a cracker trying to break in by doing a flag overwrite to pass the direction checks. */ if (par->fragoff == 1) { pr_debug("Dropping evil TCP offset=1 frag.\n"); par->hotdrop = true; } /* Must not be a fragment. */ return false; } th = skb_header_pointer(skb, par->thoff, sizeof(_tcph), &_tcph); if (th == NULL) { /* We've been asked to examine this packet, and we can't. Hence, no choice but to drop. */ pr_debug("Dropping evil TCP offset=0 tinygram.\n"); par->hotdrop = true; return false; } if (!port_match(tcpinfo->spts[0], tcpinfo->spts[1], ntohs(th->source), !!(tcpinfo->invflags & XT_TCP_INV_SRCPT))) return false; if (!port_match(tcpinfo->dpts[0], tcpinfo->dpts[1], ntohs(th->dest), !!(tcpinfo->invflags & XT_TCP_INV_DSTPT))) return false; if (!NF_INVF(tcpinfo, XT_TCP_INV_FLAGS, (((unsigned char *)th)[13] & tcpinfo->flg_mask) == tcpinfo->flg_cmp)) return false; if (tcpinfo->option) { if (th->doff * 4 < sizeof(_tcph)) { par->hotdrop = true; return false; } if (!tcp_find_option(tcpinfo->option, skb, par->thoff, th->doff*4 - sizeof(_tcph), tcpinfo->invflags & XT_TCP_INV_OPTION, &par->hotdrop)) return false; } return true; } static int tcp_mt_check(const struct xt_mtchk_param *par) { const struct xt_tcp *tcpinfo = par->matchinfo; /* Must specify no unknown invflags */ return (tcpinfo->invflags & ~XT_TCP_INV_MASK) ? -EINVAL : 0; } static bool udp_mt(const struct sk_buff *skb, struct xt_action_param *par) { const struct udphdr *uh; struct udphdr _udph; const struct xt_udp *udpinfo = par->matchinfo; /* Must not be a fragment. */ if (par->fragoff != 0) return false; uh = skb_header_pointer(skb, par->thoff, sizeof(_udph), &_udph); if (uh == NULL) { /* We've been asked to examine this packet, and we can't. Hence, no choice but to drop. */ pr_debug("Dropping evil UDP tinygram.\n"); par->hotdrop = true; return false; } return port_match(udpinfo->spts[0], udpinfo->spts[1], ntohs(uh->source), !!(udpinfo->invflags & XT_UDP_INV_SRCPT)) && port_match(udpinfo->dpts[0], udpinfo->dpts[1], ntohs(uh->dest), !!(udpinfo->invflags & XT_UDP_INV_DSTPT)); } static int udp_mt_check(const struct xt_mtchk_param *par) { const struct xt_udp *udpinfo = par->matchinfo; /* Must specify no unknown invflags */ return (udpinfo->invflags & ~XT_UDP_INV_MASK) ? -EINVAL : 0; } /* Returns 1 if the type and code is matched by the range, 0 otherwise */ static bool type_code_in_range(u8 test_type, u8 min_code, u8 max_code, u8 type, u8 code) { return type == test_type && code >= min_code && code <= max_code; } static bool icmp_type_code_match(u8 test_type, u8 min_code, u8 max_code, u8 type, u8 code, bool invert) { return (test_type == 0xFF || type_code_in_range(test_type, min_code, max_code, type, code)) ^ invert; } static bool icmp6_type_code_match(u8 test_type, u8 min_code, u8 max_code, u8 type, u8 code, bool invert) { return type_code_in_range(test_type, min_code, max_code, type, code) ^ invert; } static bool icmp_match(const struct sk_buff *skb, struct xt_action_param *par) { const struct icmphdr *ic; struct icmphdr _icmph; const struct ipt_icmp *icmpinfo = par->matchinfo; /* Must not be a fragment. */ if (par->fragoff != 0) return false; ic = skb_header_pointer(skb, par->thoff, sizeof(_icmph), &_icmph); if (!ic) { /* We've been asked to examine this packet, and we * can't. Hence, no choice but to drop. */ par->hotdrop = true; return false; } return icmp_type_code_match(icmpinfo->type, icmpinfo->code[0], icmpinfo->code[1], ic->type, ic->code, !!(icmpinfo->invflags & IPT_ICMP_INV)); } static bool icmp6_match(const struct sk_buff *skb, struct xt_action_param *par) { const struct icmp6hdr *ic; struct icmp6hdr _icmph; const struct ip6t_icmp *icmpinfo = par->matchinfo; /* Must not be a fragment. */ if (par->fragoff != 0) return false; ic = skb_header_pointer(skb, par->thoff, sizeof(_icmph), &_icmph); if (!ic) { /* We've been asked to examine this packet, and we * can't. Hence, no choice but to drop. */ par->hotdrop = true; return false; } return icmp6_type_code_match(icmpinfo->type, icmpinfo->code[0], icmpinfo->code[1], ic->icmp6_type, ic->icmp6_code, !!(icmpinfo->invflags & IP6T_ICMP_INV)); } static int icmp_checkentry(const struct xt_mtchk_param *par) { const struct ipt_icmp *icmpinfo = par->matchinfo; return (icmpinfo->invflags & ~IPT_ICMP_INV) ? -EINVAL : 0; } static int icmp6_checkentry(const struct xt_mtchk_param *par) { const struct ip6t_icmp *icmpinfo = par->matchinfo; return (icmpinfo->invflags & ~IP6T_ICMP_INV) ? -EINVAL : 0; } static struct xt_match tcpudp_mt_reg[] __read_mostly = { { .name = "tcp", .family = NFPROTO_IPV4, .checkentry = tcp_mt_check, .match = tcp_mt, .matchsize = sizeof(struct xt_tcp), .proto = IPPROTO_TCP, .me = THIS_MODULE, }, { .name = "tcp", .family = NFPROTO_IPV6, .checkentry = tcp_mt_check, .match = tcp_mt, .matchsize = sizeof(struct xt_tcp), .proto = IPPROTO_TCP, .me = THIS_MODULE, }, { .name = "udp", .family = NFPROTO_IPV4, .checkentry = udp_mt_check, .match = udp_mt, .matchsize = sizeof(struct xt_udp), .proto = IPPROTO_UDP, .me = THIS_MODULE, }, { .name = "udp", .family = NFPROTO_IPV6, .checkentry = udp_mt_check, .match = udp_mt, .matchsize = sizeof(struct xt_udp), .proto = IPPROTO_UDP, .me = THIS_MODULE, }, { .name = "udplite", .family = NFPROTO_IPV4, .checkentry = udp_mt_check, .match = udp_mt, .matchsize = sizeof(struct xt_udp), .proto = IPPROTO_UDPLITE, .me = THIS_MODULE, }, { .name = "udplite", .family = NFPROTO_IPV6, .checkentry = udp_mt_check, .match = udp_mt, .matchsize = sizeof(struct xt_udp), .proto = IPPROTO_UDPLITE, .me = THIS_MODULE, }, { .name = "icmp", .match = icmp_match, .matchsize = sizeof(struct ipt_icmp), .checkentry = icmp_checkentry, .proto = IPPROTO_ICMP, .family = NFPROTO_IPV4, .me = THIS_MODULE, }, { .name = "icmp6", .match = icmp6_match, .matchsize = sizeof(struct ip6t_icmp), .checkentry = icmp6_checkentry, .proto = IPPROTO_ICMPV6, .family = NFPROTO_IPV6, .me = THIS_MODULE, }, }; static int __init tcpudp_mt_init(void) { return xt_register_matches(tcpudp_mt_reg, ARRAY_SIZE(tcpudp_mt_reg)); } static void __exit tcpudp_mt_exit(void) { xt_unregister_matches(tcpudp_mt_reg, ARRAY_SIZE(tcpudp_mt_reg)); } module_init(tcpudp_mt_init); module_exit(tcpudp_mt_exit); |
27229 628 27238 15053 15056 15073 15059 25623 25622 25617 25667 25632 60 60 128 128 60 60 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 | // SPDX-License-Identifier: GPL-2.0 #include <linux/debugfs.h> #include <linux/mm.h> #include <linux/slab.h> #include <linux/uaccess.h> #include <linux/memblock.h> #include <linux/stacktrace.h> #include <linux/page_owner.h> #include <linux/jump_label.h> #include <linux/migrate.h> #include <linux/stackdepot.h> #include <linux/seq_file.h> #include <linux/memcontrol.h> #include <linux/sched/clock.h> #include "internal.h" /* * TODO: teach PAGE_OWNER_STACK_DEPTH (__dump_page_owner and save_stack) * to use off stack temporal storage */ #define PAGE_OWNER_STACK_DEPTH (16) struct page_owner { unsigned short order; short last_migrate_reason; gfp_t gfp_mask; depot_stack_handle_t handle; depot_stack_handle_t free_handle; u64 ts_nsec; u64 free_ts_nsec; char comm[TASK_COMM_LEN]; pid_t pid; pid_t tgid; pid_t free_pid; pid_t free_tgid; }; static bool page_owner_enabled __initdata; DEFINE_STATIC_KEY_FALSE(page_owner_inited); static depot_stack_handle_t dummy_handle; static depot_stack_handle_t failure_handle; static depot_stack_handle_t early_handle; static void init_early_allocated_pages(void); static int __init early_page_owner_param(char *buf) { int ret = kstrtobool(buf, &page_owner_enabled); if (page_owner_enabled) stack_depot_request_early_init(); return ret; } early_param("page_owner", early_page_owner_param); static __init bool need_page_owner(void) { return page_owner_enabled; } static __always_inline depot_stack_handle_t create_dummy_stack(void) { unsigned long entries[4]; unsigned int nr_entries; nr_entries = stack_trace_save(entries, ARRAY_SIZE(entries), 0); return stack_depot_save(entries, nr_entries, GFP_KERNEL); } static noinline void register_dummy_stack(void) { dummy_handle = create_dummy_stack(); } static noinline void register_failure_stack(void) { failure_handle = create_dummy_stack(); } static noinline void register_early_stack(void) { early_handle = create_dummy_stack(); } static __init void init_page_owner(void) { if (!page_owner_enabled) return; register_dummy_stack(); register_failure_stack(); register_early_stack(); static_branch_enable(&page_owner_inited); init_early_allocated_pages(); } struct page_ext_operations page_owner_ops = { .size = sizeof(struct page_owner), .need = need_page_owner, .init = init_page_owner, .need_shared_flags = true, }; static inline struct page_owner *get_page_owner(struct page_ext *page_ext) { return page_ext_data(page_ext, &page_owner_ops); } static noinline depot_stack_handle_t save_stack(gfp_t flags) { unsigned long entries[PAGE_OWNER_STACK_DEPTH]; depot_stack_handle_t handle; unsigned int nr_entries; /* * Avoid recursion. * * Sometimes page metadata allocation tracking requires more * memory to be allocated: * - when new stack trace is saved to stack depot */ if (current->in_page_owner) return dummy_handle; current->in_page_owner = 1; nr_entries = stack_trace_save(entries, ARRAY_SIZE(entries), 2); handle = stack_depot_save(entries, nr_entries, flags); if (!handle) handle = failure_handle; current->in_page_owner = 0; return handle; } void __reset_page_owner(struct page *page, unsigned short order) { int i; struct page_ext *page_ext; depot_stack_handle_t handle; struct page_owner *page_owner; u64 free_ts_nsec = local_clock(); page_ext = page_ext_get(page); if (unlikely(!page_ext)) return; handle = save_stack(GFP_NOWAIT | __GFP_NOWARN); for (i = 0; i < (1 << order); i++) { __clear_bit(PAGE_EXT_OWNER_ALLOCATED, &page_ext->flags); page_owner = get_page_owner(page_ext); page_owner->free_handle = handle; page_owner->free_ts_nsec = free_ts_nsec; page_owner->free_pid = current->pid; page_owner->free_tgid = current->tgid; page_ext = page_ext_next(page_ext); } page_ext_put(page_ext); } static inline void __set_page_owner_handle(struct page_ext *page_ext, depot_stack_handle_t handle, unsigned short order, gfp_t gfp_mask) { struct page_owner *page_owner; int i; u64 ts_nsec = local_clock(); for (i = 0; i < (1 << order); i++) { page_owner = get_page_owner(page_ext); page_owner->handle = handle; page_owner->order = order; page_owner->gfp_mask = gfp_mask; page_owner->last_migrate_reason = -1; page_owner->pid = current->pid; page_owner->tgid = current->tgid; page_owner->ts_nsec = ts_nsec; strscpy(page_owner->comm, current->comm, sizeof(page_owner->comm)); __set_bit(PAGE_EXT_OWNER, &page_ext->flags); __set_bit(PAGE_EXT_OWNER_ALLOCATED, &page_ext->flags); page_ext = page_ext_next(page_ext); } } noinline void __set_page_owner(struct page *page, unsigned short order, gfp_t gfp_mask) { struct page_ext *page_ext; depot_stack_handle_t handle; handle = save_stack(gfp_mask); page_ext = page_ext_get(page); if (unlikely(!page_ext)) return; __set_page_owner_handle(page_ext, handle, order, gfp_mask); page_ext_put(page_ext); } void __set_page_owner_migrate_reason(struct page *page, int reason) { struct page_ext *page_ext = page_ext_get(page); struct page_owner *page_owner; if (unlikely(!page_ext)) return; page_owner = get_page_owner(page_ext); page_owner->last_migrate_reason = reason; page_ext_put(page_ext); } void __split_page_owner(struct page *page, unsigned int nr) { int i; struct page_ext *page_ext = page_ext_get(page); struct page_owner *page_owner; if (unlikely(!page_ext)) return; for (i = 0; i < nr; i++) { page_owner = get_page_owner(page_ext); page_owner->order = 0; page_ext = page_ext_next(page_ext); } page_ext_put(page_ext); } void __folio_copy_owner(struct folio *newfolio, struct folio *old) { struct page_ext *old_ext; struct page_ext *new_ext; struct page_owner *old_page_owner, *new_page_owner; old_ext = page_ext_get(&old->page); if (unlikely(!old_ext)) return; new_ext = page_ext_get(&newfolio->page); if (unlikely(!new_ext)) { page_ext_put(old_ext); return; } old_page_owner = get_page_owner(old_ext); new_page_owner = get_page_owner(new_ext); new_page_owner->order = old_page_owner->order; new_page_owner->gfp_mask = old_page_owner->gfp_mask; new_page_owner->last_migrate_reason = old_page_owner->last_migrate_reason; new_page_owner->handle = old_page_owner->handle; new_page_owner->pid = old_page_owner->pid; new_page_owner->tgid = old_page_owner->tgid; new_page_owner->free_pid = old_page_owner->free_pid; new_page_owner->free_tgid = old_page_owner->free_tgid; new_page_owner->ts_nsec = old_page_owner->ts_nsec; new_page_owner->free_ts_nsec = old_page_owner->ts_nsec; strcpy(new_page_owner->comm, old_page_owner->comm); /* * We don't clear the bit on the old folio as it's going to be freed * after migration. Until then, the info can be useful in case of * a bug, and the overall stats will be off a bit only temporarily. * Also, migrate_misplaced_transhuge_page() can still fail the * migration and then we want the old folio to retain the info. But * in that case we also don't need to explicitly clear the info from * the new page, which will be freed. */ __set_bit(PAGE_EXT_OWNER, &new_ext->flags); __set_bit(PAGE_EXT_OWNER_ALLOCATED, &new_ext->flags); page_ext_put(new_ext); page_ext_put(old_ext); } void pagetypeinfo_showmixedcount_print(struct seq_file *m, pg_data_t *pgdat, struct zone *zone) { struct page *page; struct page_ext *page_ext; struct page_owner *page_owner; unsigned long pfn, block_end_pfn; unsigned long end_pfn = zone_end_pfn(zone); unsigned long count[MIGRATE_TYPES] = { 0, }; int pageblock_mt, page_mt; int i; /* Scan block by block. First and last block may be incomplete */ pfn = zone->zone_start_pfn; /* * Walk the zone in pageblock_nr_pages steps. If a page block spans * a zone boundary, it will be double counted between zones. This does * not matter as the mixed block count will still be correct */ for (; pfn < end_pfn; ) { page = pfn_to_online_page(pfn); if (!page) { pfn = ALIGN(pfn + 1, MAX_ORDER_NR_PAGES); continue; } block_end_pfn = pageblock_end_pfn(pfn); block_end_pfn = min(block_end_pfn, end_pfn); pageblock_mt = get_pageblock_migratetype(page); for (; pfn < block_end_pfn; pfn++) { /* The pageblock is online, no need to recheck. */ page = pfn_to_page(pfn); if (page_zone(page) != zone) continue; if (PageBuddy(page)) { unsigned long freepage_order; freepage_order = buddy_order_unsafe(page); if (freepage_order <= MAX_PAGE_ORDER) pfn += (1UL << freepage_order) - 1; continue; } if (PageReserved(page)) continue; page_ext = page_ext_get(page); if (unlikely(!page_ext)) continue; if (!test_bit(PAGE_EXT_OWNER_ALLOCATED, &page_ext->flags)) goto ext_put_continue; page_owner = get_page_owner(page_ext); page_mt = gfp_migratetype(page_owner->gfp_mask); if (pageblock_mt != page_mt) { if (is_migrate_cma(pageblock_mt)) count[MIGRATE_MOVABLE]++; else count[pageblock_mt]++; pfn = block_end_pfn; page_ext_put(page_ext); break; } pfn += (1UL << page_owner->order) - 1; ext_put_continue: page_ext_put(page_ext); } } /* Print counts */ seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name); for (i = 0; i < MIGRATE_TYPES; i++) seq_printf(m, "%12lu ", count[i]); seq_putc(m, '\n'); } /* * Looking for memcg information and print it out */ static inline int print_page_owner_memcg(char *kbuf, size_t count, int ret, struct page *page) { #ifdef CONFIG_MEMCG unsigned long memcg_data; struct mem_cgroup *memcg; bool online; char name[80]; rcu_read_lock(); memcg_data = READ_ONCE(page->memcg_data); if (!memcg_data) goto out_unlock; if (memcg_data & MEMCG_DATA_OBJCGS) ret += scnprintf(kbuf + ret, count - ret, "Slab cache page\n"); memcg = page_memcg_check(page); if (!memcg) goto out_unlock; online = (memcg->css.flags & CSS_ONLINE); cgroup_name(memcg->css.cgroup, name, sizeof(name)); ret += scnprintf(kbuf + ret, count - ret, "Charged %sto %smemcg %s\n", PageMemcgKmem(page) ? "(via objcg) " : "", online ? "" : "offline ", name); out_unlock: rcu_read_unlock(); #endif /* CONFIG_MEMCG */ return ret; } static ssize_t print_page_owner(char __user *buf, size_t count, unsigned long pfn, struct page *page, struct page_owner *page_owner, depot_stack_handle_t handle) { int ret, pageblock_mt, page_mt; char *kbuf; count = min_t(size_t, count, PAGE_SIZE); kbuf = kmalloc(count, GFP_KERNEL); if (!kbuf) return -ENOMEM; ret = scnprintf(kbuf, count, "Page allocated via order %u, mask %#x(%pGg), pid %d, tgid %d (%s), ts %llu ns\n", page_owner->order, page_owner->gfp_mask, &page_owner->gfp_mask, page_owner->pid, page_owner->tgid, page_owner->comm, page_owner->ts_nsec); /* Print information relevant to grouping pages by mobility */ pageblock_mt = get_pageblock_migratetype(page); page_mt = gfp_migratetype(page_owner->gfp_mask); ret += scnprintf(kbuf + ret, count - ret, "PFN 0x%lx type %s Block %lu type %s Flags %pGp\n", pfn, migratetype_names[page_mt], pfn >> pageblock_order, migratetype_names[pageblock_mt], &page->flags); ret += stack_depot_snprint(handle, kbuf + ret, count - ret, 0); if (ret >= count) goto err; if (page_owner->last_migrate_reason != -1) { ret += scnprintf(kbuf + ret, count - ret, "Page has been migrated, last migrate reason: %s\n", migrate_reason_names[page_owner->last_migrate_reason]); } ret = print_page_owner_memcg(kbuf, count, ret, page); ret += snprintf(kbuf + ret, count - ret, "\n"); if (ret >= count) goto err; if (copy_to_user(buf, kbuf, ret)) ret = -EFAULT; kfree(kbuf); return ret; err: kfree(kbuf); return -ENOMEM; } void __dump_page_owner(const struct page *page) { struct page_ext *page_ext = page_ext_get((void *)page); struct page_owner *page_owner; depot_stack_handle_t handle; gfp_t gfp_mask; int mt; if (unlikely(!page_ext)) { pr_alert("There is not page extension available.\n"); return; } page_owner = get_page_owner(page_ext); gfp_mask = page_owner->gfp_mask; mt = gfp_migratetype(gfp_mask); if (!test_bit(PAGE_EXT_OWNER, &page_ext->flags)) { pr_alert("page_owner info is not present (never set?)\n"); page_ext_put(page_ext); return; } if (test_bit(PAGE_EXT_OWNER_ALLOCATED, &page_ext->flags)) pr_alert("page_owner tracks the page as allocated\n"); else pr_alert("page_owner tracks the page as freed\n"); pr_alert("page last allocated via order %u, migratetype %s, gfp_mask %#x(%pGg), pid %d, tgid %d (%s), ts %llu, free_ts %llu\n", page_owner->order, migratetype_names[mt], gfp_mask, &gfp_mask, page_owner->pid, page_owner->tgid, page_owner->comm, page_owner->ts_nsec, page_owner->free_ts_nsec); handle = READ_ONCE(page_owner->handle); if (!handle) pr_alert("page_owner allocation stack trace missing\n"); else stack_depot_print(handle); handle = READ_ONCE(page_owner->free_handle); if (!handle) { pr_alert("page_owner free stack trace missing\n"); } else { pr_alert("page last free pid %d tgid %d stack trace:\n", page_owner->free_pid, page_owner->free_tgid); stack_depot_print(handle); } if (page_owner->last_migrate_reason != -1) pr_alert("page has been migrated, last migrate reason: %s\n", migrate_reason_names[page_owner->last_migrate_reason]); page_ext_put(page_ext); } static ssize_t read_page_owner(struct file *file, char __user *buf, size_t count, loff_t *ppos) { unsigned long pfn; struct page *page; struct page_ext *page_ext; struct page_owner *page_owner; depot_stack_handle_t handle; if (!static_branch_unlikely(&page_owner_inited)) return -EINVAL; page = NULL; if (*ppos == 0) pfn = min_low_pfn; else pfn = *ppos; /* Find a valid PFN or the start of a MAX_ORDER_NR_PAGES area */ while (!pfn_valid(pfn) && (pfn & (MAX_ORDER_NR_PAGES - 1)) != 0) pfn++; /* Find an allocated page */ for (; pfn < max_pfn; pfn++) { /* * This temporary page_owner is required so * that we can avoid the context switches while holding * the rcu lock and copying the page owner information to * user through copy_to_user() or GFP_KERNEL allocations. */ struct page_owner page_owner_tmp; /* * If the new page is in a new MAX_ORDER_NR_PAGES area, * validate the area as existing, skip it if not */ if ((pfn & (MAX_ORDER_NR_PAGES - 1)) == 0 && !pfn_valid(pfn)) { pfn += MAX_ORDER_NR_PAGES - 1; continue; } page = pfn_to_page(pfn); if (PageBuddy(page)) { unsigned long freepage_order = buddy_order_unsafe(page); if (freepage_order <= MAX_PAGE_ORDER) pfn += (1UL << freepage_order) - 1; continue; } page_ext = page_ext_get(page); if (unlikely(!page_ext)) continue; /* * Some pages could be missed by concurrent allocation or free, * because we don't hold the zone lock. */ if (!test_bit(PAGE_EXT_OWNER, &page_ext->flags)) goto ext_put_continue; /* * Although we do have the info about past allocation of free * pages, it's not relevant for current memory usage. */ if (!test_bit(PAGE_EXT_OWNER_ALLOCATED, &page_ext->flags)) goto ext_put_continue; page_owner = get_page_owner(page_ext); /* * Don't print "tail" pages of high-order allocations as that * would inflate the stats. */ if (!IS_ALIGNED(pfn, 1 << page_owner->order)) goto ext_put_continue; /* * Access to page_ext->handle isn't synchronous so we should * be careful to access it. */ handle = READ_ONCE(page_owner->handle); if (!handle) goto ext_put_continue; /* Record the next PFN to read in the file offset */ *ppos = pfn + 1; page_owner_tmp = *page_owner; page_ext_put(page_ext); return print_page_owner(buf, count, pfn, page, &page_owner_tmp, handle); ext_put_continue: page_ext_put(page_ext); } return 0; } static loff_t lseek_page_owner(struct file *file, loff_t offset, int orig) { switch (orig) { case SEEK_SET: file->f_pos = offset; break; case SEEK_CUR: file->f_pos += offset; break; default: return -EINVAL; } return file->f_pos; } static void init_pages_in_zone(pg_data_t *pgdat, struct zone *zone) { unsigned long pfn = zone->zone_start_pfn; unsigned long end_pfn = zone_end_pfn(zone); unsigned long count = 0; /* * Walk the zone in pageblock_nr_pages steps. If a page block spans * a zone boundary, it will be double counted between zones. This does * not matter as the mixed block count will still be correct */ for (; pfn < end_pfn; ) { unsigned long block_end_pfn; if (!pfn_valid(pfn)) { pfn = ALIGN(pfn + 1, MAX_ORDER_NR_PAGES); continue; } block_end_pfn = pageblock_end_pfn(pfn); block_end_pfn = min(block_end_pfn, end_pfn); for (; pfn < block_end_pfn; pfn++) { struct page *page = pfn_to_page(pfn); struct page_ext *page_ext; if (page_zone(page) != zone) continue; /* * To avoid having to grab zone->lock, be a little * careful when reading buddy page order. The only * danger is that we skip too much and potentially miss * some early allocated pages, which is better than * heavy lock contention. */ if (PageBuddy(page)) { unsigned long order = buddy_order_unsafe(page); if (order > 0 && order <= MAX_PAGE_ORDER) pfn += (1UL << order) - 1; continue; } if (PageReserved(page)) continue; page_ext = page_ext_get(page); if (unlikely(!page_ext)) continue; /* Maybe overlapping zone */ if (test_bit(PAGE_EXT_OWNER, &page_ext->flags)) goto ext_put_continue; /* Found early allocated page */ __set_page_owner_handle(page_ext, early_handle, 0, 0); count++; ext_put_continue: page_ext_put(page_ext); } cond_resched(); } pr_info("Node %d, zone %8s: page owner found early allocated %lu pages\n", pgdat->node_id, zone->name, count); } static void init_zones_in_node(pg_data_t *pgdat) { struct zone *zone; struct zone *node_zones = pgdat->node_zones; for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) { if (!populated_zone(zone)) continue; init_pages_in_zone(pgdat, zone); } } static void init_early_allocated_pages(void) { pg_data_t *pgdat; for_each_online_pgdat(pgdat) init_zones_in_node(pgdat); } static const struct file_operations proc_page_owner_operations = { .read = read_page_owner, .llseek = lseek_page_owner, }; static int __init pageowner_init(void) { if (!static_branch_unlikely(&page_owner_inited)) { pr_info("page_owner is disabled\n"); return 0; } debugfs_create_file("page_owner", 0400, NULL, NULL, &proc_page_owner_operations); return 0; } late_initcall(pageowner_init) |
159 4 165 165 159 1883 2706 2712 74 2721 5 309 309 2166 166 2166 2166 165 2167 1822 1670 789 346 442 442 3 30 28 2 790 1 1 789 1 788 787 4 251 2463 1018 48 173 173 169 4 4 14 3 7 1 4 4 4 4 5 165 1044 1802 2470 2467 57 288 288 2470 242 2460 2469 165 165 1044 2134 1043 160 5 158 158 157 158 161 161 5 158 1821 42 3 733 1824 1824 42 3 28 721 103 3 53 56 3 774 762 761 29 107 139 1875 1874 7 7 148 325 57 1354 1306 454 133 45 73 111 143 1347 35 35 1349 1280 465 1348 1822 1669 42 733 25 53 52 1 1870 1747 447 3 436 241 334 380 268 112 335 1 1 1845 32 139 161 1846 1657 740 42 99 723 745 1237 1870 1870 3 1 8 4 1 5 1918 16 52 52 22 22 22 22 22 1 1 1 1686 1685 73 228 301 299 6 2 301 298 4 4 299 4 3 298 623 23 37 604 640 634 8 3 88 301 301 5 300 301 21 8 251 27 298 326 26 345 3 635 22 25 643 643 605 3 7 3 10 25 1 3 37 604 638 640 637 25 23 25 25 23 25 25 638 638 1 1 1 96 554 641 5 641 179 18 1 1 5 162 162 102 5 103 4 50 111 51 110 139 1 1 1 116 87 52 116 5 5 1 2 1 1 5 1 5 2 5 5 5 5 112 111 111 7 7 2 2 7 7 214 110 110 111 111 31 35 5 5 5 185 12 12 11 3 10 1 10 1965 1965 2456 2457 2130 1013 748 7 211 214 214 1965 2 10 1822 1822 1822 1 1822 1822 1751 96 1289 603 1265 636 96 1806 1039 837 18 18 593 1263 1262 12 1029 837 2457 2457 605 605 603 188 188 188 92 73 188 112 77 74 188 73 34 34 34 1 31 2 3 28 28 28 4 29 25 24 21 3 24 20 4 294 1 85 7 13 34 160 2 85 73 75 10 65 19 111 111 111 111 111 111 111 111 5 5 5 5 23 10 2 4 9 1 22 16 1 6 22 12 5 15 14 11 3 3 11 10 10 10 11 11 3 2 7 11 1 1 13 1 4 4 8 12 12 4 6 1 1 11 11 13 1 6 5 1 4 1 12 11 11 1 1 1 7 1 9 9 53 53 8 34 34 34 1 16 1 40 40 40 40 36 20 40 53 53 12 5 7 7 7 7 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 4716 4717 4718 4719 4720 4721 4722 4723 4724 4725 4726 4727 4728 4729 4730 4731 4732 4733 4734 4735 4736 4737 4738 4739 4740 4741 4742 4743 4744 4745 4746 4747 4748 4749 4750 4751 4752 4753 4754 4755 4756 4757 4758 4759 4760 4761 4762 4763 4764 4765 4766 4767 4768 4769 4770 4771 4772 4773 4774 4775 4776 4777 4778 4779 4780 4781 4782 4783 4784 4785 4786 4787 4788 4789 4790 4791 4792 4793 4794 4795 4796 4797 4798 4799 4800 4801 4802 4803 4804 4805 4806 4807 4808 4809 4810 4811 4812 4813 4814 4815 4816 4817 4818 4819 4820 4821 4822 4823 4824 4825 4826 4827 4828 4829 4830 4831 4832 4833 4834 4835 4836 4837 4838 4839 4840 4841 4842 4843 4844 4845 4846 4847 4848 4849 4850 4851 4852 4853 4854 4855 4856 4857 4858 4859 4860 4861 4862 4863 4864 4865 4866 4867 4868 4869 4870 4871 4872 4873 4874 4875 4876 4877 4878 4879 4880 4881 4882 4883 4884 4885 4886 4887 4888 4889 4890 4891 4892 4893 4894 4895 4896 4897 4898 4899 4900 4901 4902 4903 4904 4905 4906 4907 4908 4909 4910 4911 4912 4913 4914 4915 4916 4917 4918 4919 4920 4921 4922 4923 4924 4925 4926 4927 4928 4929 4930 4931 4932 4933 4934 4935 4936 4937 4938 4939 4940 4941 4942 4943 4944 4945 4946 4947 4948 4949 4950 4951 4952 4953 4954 4955 4956 4957 4958 4959 4960 4961 4962 4963 4964 4965 4966 4967 4968 4969 4970 4971 4972 4973 4974 4975 4976 4977 4978 4979 4980 4981 4982 4983 4984 4985 4986 4987 4988 4989 4990 4991 4992 4993 4994 4995 4996 4997 4998 4999 5000 5001 5002 5003 5004 5005 5006 5007 5008 5009 5010 5011 5012 5013 5014 5015 5016 5017 5018 5019 5020 5021 5022 5023 5024 5025 5026 5027 5028 5029 5030 5031 5032 5033 5034 5035 5036 5037 5038 5039 5040 5041 5042 5043 5044 5045 5046 5047 5048 5049 5050 5051 5052 5053 5054 5055 5056 5057 5058 5059 5060 5061 5062 5063 5064 5065 5066 5067 5068 5069 5070 5071 5072 5073 5074 5075 5076 5077 5078 5079 5080 5081 5082 5083 5084 5085 5086 5087 5088 5089 5090 5091 5092 5093 5094 5095 5096 5097 5098 5099 5100 5101 5102 5103 5104 5105 5106 5107 5108 5109 5110 5111 5112 5113 5114 5115 5116 5117 5118 5119 5120 5121 5122 5123 5124 5125 5126 5127 5128 5129 5130 5131 5132 5133 5134 5135 5136 5137 5138 5139 5140 5141 5142 5143 5144 5145 5146 5147 5148 5149 5150 5151 5152 5153 5154 5155 5156 5157 5158 5159 5160 5161 5162 5163 5164 5165 5166 5167 5168 5169 5170 5171 5172 5173 5174 5175 5176 5177 5178 5179 5180 5181 5182 5183 5184 5185 5186 5187 5188 5189 5190 5191 5192 5193 5194 5195 5196 5197 5198 5199 5200 5201 5202 5203 5204 5205 5206 5207 5208 5209 5210 5211 5212 5213 5214 5215 5216 5217 5218 5219 5220 5221 5222 5223 5224 5225 5226 5227 5228 5229 5230 5231 5232 5233 5234 5235 5236 5237 5238 5239 5240 5241 5242 5243 5244 5245 5246 5247 5248 5249 5250 5251 5252 5253 5254 5255 5256 5257 5258 5259 5260 5261 5262 5263 5264 5265 5266 5267 5268 5269 5270 5271 5272 5273 5274 5275 5276 5277 5278 5279 5280 5281 5282 5283 5284 5285 5286 5287 5288 5289 5290 5291 5292 5293 5294 5295 5296 5297 5298 5299 5300 5301 5302 5303 5304 5305 5306 5307 5308 5309 5310 5311 5312 5313 5314 5315 5316 5317 5318 5319 5320 5321 5322 5323 5324 5325 5326 5327 5328 5329 5330 5331 5332 5333 5334 5335 5336 5337 5338 5339 5340 5341 5342 5343 5344 5345 5346 5347 5348 5349 5350 5351 5352 5353 5354 5355 5356 5357 5358 5359 5360 5361 5362 5363 5364 5365 5366 5367 5368 5369 5370 5371 5372 5373 5374 5375 5376 5377 5378 5379 5380 5381 5382 5383 5384 5385 5386 5387 5388 5389 5390 5391 5392 5393 5394 5395 5396 5397 5398 5399 5400 5401 5402 5403 5404 5405 5406 5407 5408 5409 5410 5411 5412 5413 5414 5415 5416 5417 5418 5419 5420 5421 5422 5423 5424 5425 5426 5427 5428 5429 5430 5431 5432 5433 5434 5435 5436 5437 5438 5439 5440 5441 5442 5443 5444 5445 5446 5447 5448 5449 5450 5451 5452 5453 5454 5455 5456 5457 5458 5459 5460 5461 5462 5463 5464 5465 5466 5467 5468 5469 5470 5471 5472 5473 5474 5475 5476 5477 5478 5479 5480 5481 5482 5483 5484 5485 5486 5487 5488 5489 5490 5491 5492 5493 5494 5495 5496 5497 5498 5499 5500 5501 5502 5503 5504 5505 5506 5507 5508 5509 5510 5511 5512 5513 5514 5515 5516 5517 5518 5519 5520 5521 5522 5523 5524 5525 5526 5527 5528 5529 5530 5531 5532 5533 5534 5535 5536 5537 5538 5539 5540 5541 5542 5543 5544 5545 5546 5547 5548 5549 5550 5551 5552 5553 5554 5555 5556 5557 5558 5559 5560 5561 5562 5563 5564 5565 5566 5567 5568 5569 5570 5571 5572 5573 5574 5575 5576 5577 5578 5579 5580 5581 5582 5583 5584 5585 5586 5587 5588 5589 5590 5591 5592 5593 5594 5595 5596 5597 5598 5599 5600 5601 5602 5603 5604 5605 5606 5607 5608 5609 5610 5611 5612 5613 5614 5615 5616 5617 5618 5619 5620 5621 5622 5623 5624 5625 5626 5627 5628 5629 5630 5631 5632 5633 5634 5635 5636 5637 5638 5639 5640 5641 5642 5643 5644 5645 5646 5647 5648 5649 5650 5651 5652 5653 5654 5655 5656 5657 5658 5659 5660 5661 5662 5663 5664 5665 5666 5667 5668 5669 5670 5671 5672 5673 5674 5675 5676 5677 5678 5679 5680 5681 5682 5683 5684 5685 5686 5687 5688 5689 5690 5691 5692 5693 5694 5695 5696 5697 5698 5699 5700 5701 5702 5703 5704 5705 5706 5707 5708 5709 5710 5711 5712 5713 5714 5715 5716 5717 5718 5719 5720 5721 5722 5723 5724 5725 5726 5727 5728 5729 5730 5731 5732 5733 5734 5735 5736 5737 5738 5739 5740 5741 5742 5743 5744 5745 5746 5747 5748 5749 5750 5751 5752 5753 5754 5755 5756 5757 5758 5759 5760 5761 5762 5763 5764 5765 5766 5767 5768 5769 5770 5771 5772 5773 5774 5775 5776 5777 5778 5779 5780 5781 5782 5783 5784 5785 5786 5787 5788 5789 5790 5791 5792 5793 5794 5795 5796 5797 5798 5799 5800 5801 5802 5803 5804 5805 5806 5807 5808 5809 5810 5811 5812 5813 5814 5815 5816 5817 5818 5819 5820 5821 5822 5823 5824 5825 5826 5827 5828 5829 5830 5831 5832 5833 5834 5835 5836 5837 5838 5839 5840 5841 5842 5843 5844 5845 5846 5847 5848 5849 5850 5851 5852 5853 5854 5855 5856 5857 5858 5859 5860 5861 5862 5863 5864 5865 5866 5867 5868 5869 5870 5871 5872 5873 5874 5875 5876 5877 5878 5879 5880 5881 5882 5883 5884 5885 5886 5887 5888 5889 5890 5891 5892 5893 5894 5895 5896 5897 5898 5899 5900 5901 5902 5903 5904 5905 5906 5907 5908 5909 5910 5911 5912 5913 5914 5915 5916 5917 5918 5919 5920 5921 5922 5923 5924 5925 5926 5927 5928 5929 5930 5931 5932 5933 5934 5935 5936 5937 5938 5939 5940 5941 5942 5943 5944 5945 5946 5947 5948 5949 5950 5951 5952 5953 5954 5955 5956 5957 5958 5959 5960 5961 5962 5963 5964 5965 5966 5967 5968 5969 5970 5971 5972 5973 5974 5975 5976 5977 5978 5979 5980 5981 5982 5983 5984 5985 5986 5987 5988 5989 5990 5991 5992 5993 5994 5995 5996 5997 5998 5999 6000 6001 6002 6003 6004 6005 6006 6007 6008 6009 6010 6011 6012 6013 6014 6015 6016 6017 6018 6019 6020 6021 6022 6023 6024 6025 6026 6027 6028 6029 6030 6031 6032 6033 6034 6035 6036 6037 6038 6039 6040 6041 6042 6043 6044 6045 6046 6047 6048 6049 6050 6051 6052 6053 6054 6055 6056 6057 6058 6059 6060 6061 6062 6063 6064 6065 6066 6067 6068 6069 6070 6071 6072 6073 6074 6075 6076 6077 6078 6079 6080 6081 6082 6083 6084 6085 6086 6087 6088 6089 6090 6091 6092 6093 6094 6095 6096 6097 6098 6099 6100 6101 6102 6103 6104 6105 6106 6107 6108 6109 6110 6111 6112 6113 6114 6115 6116 6117 6118 6119 6120 6121 6122 6123 6124 6125 6126 6127 6128 6129 6130 6131 6132 | // SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2003-2006, Cluster File Systems, Inc, info@clusterfs.com * Written by Alex Tomas <alex@clusterfs.com> * * Architecture independence: * Copyright (c) 2005, Bull S.A. * Written by Pierre Peiffer <pierre.peiffer@bull.net> */ /* * Extents support for EXT4 * * TODO: * - ext4*_error() should be used in some situations * - analyze all BUG()/BUG_ON(), use -EIO where appropriate * - smart tree reduction */ #include <linux/fs.h> #include <linux/time.h> #include <linux/jbd2.h> #include <linux/highuid.h> #include <linux/pagemap.h> #include <linux/quotaops.h> #include <linux/string.h> #include <linux/slab.h> #include <linux/uaccess.h> #include <linux/fiemap.h> #include <linux/iomap.h> #include <linux/sched/mm.h> #include "ext4_jbd2.h" #include "ext4_extents.h" #include "xattr.h" #include <trace/events/ext4.h> /* * used by extent splitting. */ #define EXT4_EXT_MAY_ZEROOUT 0x1 /* safe to zeroout if split fails \ due to ENOSPC */ #define EXT4_EXT_MARK_UNWRIT1 0x2 /* mark first half unwritten */ #define EXT4_EXT_MARK_UNWRIT2 0x4 /* mark second half unwritten */ #define EXT4_EXT_DATA_VALID1 0x8 /* first half contains valid data */ #define EXT4_EXT_DATA_VALID2 0x10 /* second half contains valid data */ static __le32 ext4_extent_block_csum(struct inode *inode, struct ext4_extent_header *eh) { struct ext4_inode_info *ei = EXT4_I(inode); struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); __u32 csum; csum = ext4_chksum(sbi, ei->i_csum_seed, (__u8 *)eh, EXT4_EXTENT_TAIL_OFFSET(eh)); return cpu_to_le32(csum); } static int ext4_extent_block_csum_verify(struct inode *inode, struct ext4_extent_header *eh) { struct ext4_extent_tail *et; if (!ext4_has_metadata_csum(inode->i_sb)) return 1; et = find_ext4_extent_tail(eh); if (et->et_checksum != ext4_extent_block_csum(inode, eh)) return 0; return 1; } static void ext4_extent_block_csum_set(struct inode *inode, struct ext4_extent_header *eh) { struct ext4_extent_tail *et; if (!ext4_has_metadata_csum(inode->i_sb)) return; et = find_ext4_extent_tail(eh); et->et_checksum = ext4_extent_block_csum(inode, eh); } static int ext4_split_extent_at(handle_t *handle, struct inode *inode, struct ext4_ext_path **ppath, ext4_lblk_t split, int split_flag, int flags); static int ext4_ext_trunc_restart_fn(struct inode *inode, int *dropped) { /* * Drop i_data_sem to avoid deadlock with ext4_map_blocks. At this * moment, get_block can be called only for blocks inside i_size since * page cache has been already dropped and writes are blocked by * i_rwsem. So we can safely drop the i_data_sem here. */ BUG_ON(EXT4_JOURNAL(inode) == NULL); ext4_discard_preallocations(inode); up_write(&EXT4_I(inode)->i_data_sem); *dropped = 1; return 0; } static void ext4_ext_drop_refs(struct ext4_ext_path *path) { int depth, i; if (!path) return; depth = path->p_depth; for (i = 0; i <= depth; i++, path++) { brelse(path->p_bh); path->p_bh = NULL; } } void ext4_free_ext_path(struct ext4_ext_path *path) { ext4_ext_drop_refs(path); kfree(path); } /* * Make sure 'handle' has at least 'check_cred' credits. If not, restart * transaction with 'restart_cred' credits. The function drops i_data_sem * when restarting transaction and gets it after transaction is restarted. * * The function returns 0 on success, 1 if transaction had to be restarted, * and < 0 in case of fatal error. */ int ext4_datasem_ensure_credits(handle_t *handle, struct inode *inode, int check_cred, int restart_cred, int revoke_cred) { int ret; int dropped = 0; ret = ext4_journal_ensure_credits_fn(handle, check_cred, restart_cred, revoke_cred, ext4_ext_trunc_restart_fn(inode, &dropped)); if (dropped) down_write(&EXT4_I(inode)->i_data_sem); return ret; } /* * could return: * - EROFS * - ENOMEM */ static int ext4_ext_get_access(handle_t *handle, struct inode *inode, struct ext4_ext_path *path) { int err = 0; if (path->p_bh) { /* path points to block */ BUFFER_TRACE(path->p_bh, "get_write_access"); err = ext4_journal_get_write_access(handle, inode->i_sb, path->p_bh, EXT4_JTR_NONE); /* * The extent buffer's verified bit will be set again in * __ext4_ext_dirty(). We could leave an inconsistent * buffer if the extents updating procudure break off du * to some error happens, force to check it again. */ if (!err) clear_buffer_verified(path->p_bh); } /* path points to leaf/index in inode body */ /* we use in-core data, no need to protect them */ return err; } /* * could return: * - EROFS * - ENOMEM * - EIO */ static int __ext4_ext_dirty(const char *where, unsigned int line, handle_t *handle, struct inode *inode, struct ext4_ext_path *path) { int err; WARN_ON(!rwsem_is_locked(&EXT4_I(inode)->i_data_sem)); if (path->p_bh) { ext4_extent_block_csum_set(inode, ext_block_hdr(path->p_bh)); /* path points to block */ err = __ext4_handle_dirty_metadata(where, line, handle, inode, path->p_bh); /* Extents updating done, re-set verified flag */ if (!err) set_buffer_verified(path->p_bh); } else { /* path points to leaf/index in inode body */ err = ext4_mark_inode_dirty(handle, inode); } return err; } #define ext4_ext_dirty(handle, inode, path) \ __ext4_ext_dirty(__func__, __LINE__, (handle), (inode), (path)) static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode, struct ext4_ext_path *path, ext4_lblk_t block) { if (path) { int depth = path->p_depth; struct ext4_extent *ex; /* * Try to predict block placement assuming that we are * filling in a file which will eventually be * non-sparse --- i.e., in the case of libbfd writing * an ELF object sections out-of-order but in a way * the eventually results in a contiguous object or * executable file, or some database extending a table * space file. However, this is actually somewhat * non-ideal if we are writing a sparse file such as * qemu or KVM writing a raw image file that is going * to stay fairly sparse, since it will end up * fragmenting the file system's free space. Maybe we * should have some hueristics or some way to allow * userspace to pass a hint to file system, * especially if the latter case turns out to be * common. */ ex = path[depth].p_ext; if (ex) { ext4_fsblk_t ext_pblk = ext4_ext_pblock(ex); ext4_lblk_t ext_block = le32_to_cpu(ex->ee_block); if (block > ext_block) return ext_pblk + (block - ext_block); else return ext_pblk - (ext_block - block); } /* it looks like index is empty; * try to find starting block from index itself */ if (path[depth].p_bh) return path[depth].p_bh->b_blocknr; } /* OK. use inode's group */ return ext4_inode_to_goal_block(inode); } /* * Allocation for a meta data block */ static ext4_fsblk_t ext4_ext_new_meta_block(handle_t *handle, struct inode *inode, struct ext4_ext_path *path, struct ext4_extent *ex, int *err, unsigned int flags) { ext4_fsblk_t goal, newblock; goal = ext4_ext_find_goal(inode, path, le32_to_cpu(ex->ee_block)); newblock = ext4_new_meta_blocks(handle, inode, goal, flags, NULL, err); return newblock; } static inline int ext4_ext_space_block(struct inode *inode, int check) { int size; size = (inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header)) / sizeof(struct ext4_extent); #ifdef AGGRESSIVE_TEST if (!check && size > 6) size = 6; #endif return size; } static inline int ext4_ext_space_block_idx(struct inode *inode, int check) { int size; size = (inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header)) / sizeof(struct ext4_extent_idx); #ifdef AGGRESSIVE_TEST if (!check && size > 5) size = 5; #endif return size; } static inline int ext4_ext_space_root(struct inode *inode, int check) { int size; size = sizeof(EXT4_I(inode)->i_data); size -= sizeof(struct ext4_extent_header); size /= sizeof(struct ext4_extent); #ifdef AGGRESSIVE_TEST if (!check && size > 3) size = 3; #endif return size; } static inline int ext4_ext_space_root_idx(struct inode *inode, int check) { int size; size = sizeof(EXT4_I(inode)->i_data); size -= sizeof(struct ext4_extent_header); size /= sizeof(struct ext4_extent_idx); #ifdef AGGRESSIVE_TEST if (!check && size > 4) size = 4; #endif return size; } static inline int ext4_force_split_extent_at(handle_t *handle, struct inode *inode, struct ext4_ext_path **ppath, ext4_lblk_t lblk, int nofail) { struct ext4_ext_path *path = *ppath; int unwritten = ext4_ext_is_unwritten(path[path->p_depth].p_ext); int flags = EXT4_EX_NOCACHE | EXT4_GET_BLOCKS_PRE_IO; if (nofail) flags |= EXT4_GET_BLOCKS_METADATA_NOFAIL | EXT4_EX_NOFAIL; return ext4_split_extent_at(handle, inode, ppath, lblk, unwritten ? EXT4_EXT_MARK_UNWRIT1|EXT4_EXT_MARK_UNWRIT2 : 0, flags); } static int ext4_ext_max_entries(struct inode *inode, int depth) { int max; if (depth == ext_depth(inode)) { if (depth == 0) max = ext4_ext_space_root(inode, 1); else max = ext4_ext_space_root_idx(inode, 1); } else { if (depth == 0) max = ext4_ext_space_block(inode, 1); else max = ext4_ext_space_block_idx(inode, 1); } return max; } static int ext4_valid_extent(struct inode *inode, struct ext4_extent *ext) { ext4_fsblk_t block = ext4_ext_pblock(ext); int len = ext4_ext_get_actual_len(ext); ext4_lblk_t lblock = le32_to_cpu(ext->ee_block); /* * We allow neither: * - zero length * - overflow/wrap-around */ if (lblock + len <= lblock) return 0; return ext4_inode_block_valid(inode, block, len); } static int ext4_valid_extent_idx(struct inode *inode, struct ext4_extent_idx *ext_idx) { ext4_fsblk_t block = ext4_idx_pblock(ext_idx); return ext4_inode_block_valid(inode, block, 1); } static int ext4_valid_extent_entries(struct inode *inode, struct ext4_extent_header *eh, ext4_lblk_t lblk, ext4_fsblk_t *pblk, int depth) { unsigned short entries; ext4_lblk_t lblock = 0; ext4_lblk_t cur = 0; if (eh->eh_entries == 0) return 1; entries = le16_to_cpu(eh->eh_entries); if (depth == 0) { /* leaf entries */ struct ext4_extent *ext = EXT_FIRST_EXTENT(eh); /* * The logical block in the first entry should equal to * the number in the index block. */ if (depth != ext_depth(inode) && lblk != le32_to_cpu(ext->ee_block)) return 0; while (entries) { if (!ext4_valid_extent(inode, ext)) return 0; /* Check for overlapping extents */ lblock = le32_to_cpu(ext->ee_block); if (lblock < cur) { *pblk = ext4_ext_pblock(ext); return 0; } cur = lblock + ext4_ext_get_actual_len(ext); ext++; entries--; } } else { struct ext4_extent_idx *ext_idx = EXT_FIRST_INDEX(eh); /* * The logical block in the first entry should equal to * the number in the parent index block. */ if (depth != ext_depth(inode) && lblk != le32_to_cpu(ext_idx->ei_block)) return 0; while (entries) { if (!ext4_valid_extent_idx(inode, ext_idx)) return 0; /* Check for overlapping index extents */ lblock = le32_to_cpu(ext_idx->ei_block); if (lblock < cur) { *pblk = ext4_idx_pblock(ext_idx); return 0; } ext_idx++; entries--; cur = lblock + 1; } } return 1; } static int __ext4_ext_check(const char *function, unsigned int line, struct inode *inode, struct ext4_extent_header *eh, int depth, ext4_fsblk_t pblk, ext4_lblk_t lblk) { const char *error_msg; int max = 0, err = -EFSCORRUPTED; if (unlikely(eh->eh_magic != EXT4_EXT_MAGIC)) { error_msg = "invalid magic"; goto corrupted; } if (unlikely(le16_to_cpu(eh->eh_depth) != depth)) { error_msg = "unexpected eh_depth"; goto corrupted; } if (unlikely(eh->eh_max == 0)) { error_msg = "invalid eh_max"; goto corrupted; } max = ext4_ext_max_entries(inode, depth); if (unlikely(le16_to_cpu(eh->eh_max) > max)) { error_msg = "too large eh_max"; goto corrupted; } if (unlikely(le16_to_cpu(eh->eh_entries) > le16_to_cpu(eh->eh_max))) { error_msg = "invalid eh_entries"; goto corrupted; } if (unlikely((eh->eh_entries == 0) && (depth > 0))) { error_msg = "eh_entries is 0 but eh_depth is > 0"; goto corrupted; } if (!ext4_valid_extent_entries(inode, eh, lblk, &pblk, depth)) { error_msg = "invalid extent entries"; goto corrupted; } if (unlikely(depth > 32)) { error_msg = "too large eh_depth"; goto corrupted; } /* Verify checksum on non-root extent tree nodes */ if (ext_depth(inode) != depth && !ext4_extent_block_csum_verify(inode, eh)) { error_msg = "extent tree corrupted"; err = -EFSBADCRC; goto corrupted; } return 0; corrupted: ext4_error_inode_err(inode, function, line, 0, -err, "pblk %llu bad header/extent: %s - magic %x, " "entries %u, max %u(%u), depth %u(%u)", (unsigned long long) pblk, error_msg, le16_to_cpu(eh->eh_magic), le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max), max, le16_to_cpu(eh->eh_depth), depth); return err; } #define ext4_ext_check(inode, eh, depth, pblk) \ __ext4_ext_check(__func__, __LINE__, (inode), (eh), (depth), (pblk), 0) int ext4_ext_check_inode(struct inode *inode) { return ext4_ext_check(inode, ext_inode_hdr(inode), ext_depth(inode), 0); } static void ext4_cache_extents(struct inode *inode, struct ext4_extent_header *eh) { struct ext4_extent *ex = EXT_FIRST_EXTENT(eh); ext4_lblk_t prev = 0; int i; for (i = le16_to_cpu(eh->eh_entries); i > 0; i--, ex++) { unsigned int status = EXTENT_STATUS_WRITTEN; ext4_lblk_t lblk = le32_to_cpu(ex->ee_block); int len = ext4_ext_get_actual_len(ex); if (prev && (prev != lblk)) ext4_es_cache_extent(inode, prev, lblk - prev, ~0, EXTENT_STATUS_HOLE); if (ext4_ext_is_unwritten(ex)) status = EXTENT_STATUS_UNWRITTEN; ext4_es_cache_extent(inode, lblk, len, ext4_ext_pblock(ex), status); prev = lblk + len; } } static struct buffer_head * __read_extent_tree_block(const char *function, unsigned int line, struct inode *inode, struct ext4_extent_idx *idx, int depth, int flags) { struct buffer_head *bh; int err; gfp_t gfp_flags = __GFP_MOVABLE | GFP_NOFS; ext4_fsblk_t pblk; if (flags & EXT4_EX_NOFAIL) gfp_flags |= __GFP_NOFAIL; pblk = ext4_idx_pblock(idx); bh = sb_getblk_gfp(inode->i_sb, pblk, gfp_flags); if (unlikely(!bh)) return ERR_PTR(-ENOMEM); if (!bh_uptodate_or_lock(bh)) { trace_ext4_ext_load_extent(inode, pblk, _RET_IP_); err = ext4_read_bh(bh, 0, NULL); if (err < 0) goto errout; } if (buffer_verified(bh) && !(flags & EXT4_EX_FORCE_CACHE)) return bh; err = __ext4_ext_check(function, line, inode, ext_block_hdr(bh), depth, pblk, le32_to_cpu(idx->ei_block)); if (err) goto errout; set_buffer_verified(bh); /* * If this is a leaf block, cache all of its entries */ if (!(flags & EXT4_EX_NOCACHE) && depth == 0) { struct ext4_extent_header *eh = ext_block_hdr(bh); ext4_cache_extents(inode, eh); } return bh; errout: put_bh(bh); return ERR_PTR(err); } #define read_extent_tree_block(inode, idx, depth, flags) \ __read_extent_tree_block(__func__, __LINE__, (inode), (idx), \ (depth), (flags)) /* * This function is called to cache a file's extent information in the * extent status tree */ int ext4_ext_precache(struct inode *inode) { struct ext4_inode_info *ei = EXT4_I(inode); struct ext4_ext_path *path = NULL; struct buffer_head *bh; int i = 0, depth, ret = 0; if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) return 0; /* not an extent-mapped inode */ down_read(&ei->i_data_sem); depth = ext_depth(inode); /* Don't cache anything if there are no external extent blocks */ if (!depth) { up_read(&ei->i_data_sem); return ret; } path = kcalloc(depth + 1, sizeof(struct ext4_ext_path), GFP_NOFS); if (path == NULL) { up_read(&ei->i_data_sem); return -ENOMEM; } path[0].p_hdr = ext_inode_hdr(inode); ret = ext4_ext_check(inode, path[0].p_hdr, depth, 0); if (ret) goto out; path[0].p_idx = EXT_FIRST_INDEX(path[0].p_hdr); while (i >= 0) { /* * If this is a leaf block or we've reached the end of * the index block, go up */ if ((i == depth) || path[i].p_idx > EXT_LAST_INDEX(path[i].p_hdr)) { brelse(path[i].p_bh); path[i].p_bh = NULL; i--; continue; } bh = read_extent_tree_block(inode, path[i].p_idx++, depth - i - 1, EXT4_EX_FORCE_CACHE); if (IS_ERR(bh)) { ret = PTR_ERR(bh); break; } i++; path[i].p_bh = bh; path[i].p_hdr = ext_block_hdr(bh); path[i].p_idx = EXT_FIRST_INDEX(path[i].p_hdr); } ext4_set_inode_state(inode, EXT4_STATE_EXT_PRECACHED); out: up_read(&ei->i_data_sem); ext4_free_ext_path(path); return ret; } #ifdef EXT_DEBUG static void ext4_ext_show_path(struct inode *inode, struct ext4_ext_path *path) { int k, l = path->p_depth; ext_debug(inode, "path:"); for (k = 0; k <= l; k++, path++) { if (path->p_idx) { ext_debug(inode, " %d->%llu", le32_to_cpu(path->p_idx->ei_block), ext4_idx_pblock(path->p_idx)); } else if (path->p_ext) { ext_debug(inode, " %d:[%d]%d:%llu ", le32_to_cpu(path->p_ext->ee_block), ext4_ext_is_unwritten(path->p_ext), ext4_ext_get_actual_len(path->p_ext), ext4_ext_pblock(path->p_ext)); } else ext_debug(inode, " []"); } ext_debug(inode, "\n"); } static void ext4_ext_show_leaf(struct inode *inode, struct ext4_ext_path *path) { int depth = ext_depth(inode); struct ext4_extent_header *eh; struct ext4_extent *ex; int i; if (!path) return; eh = path[depth].p_hdr; ex = EXT_FIRST_EXTENT(eh); ext_debug(inode, "Displaying leaf extents\n"); for (i = 0; i < le16_to_cpu(eh->eh_entries); i++, ex++) { ext_debug(inode, "%d:[%d]%d:%llu ", le32_to_cpu(ex->ee_block), ext4_ext_is_unwritten(ex), ext4_ext_get_actual_len(ex), ext4_ext_pblock(ex)); } ext_debug(inode, "\n"); } static void ext4_ext_show_move(struct inode *inode, struct ext4_ext_path *path, ext4_fsblk_t newblock, int level) { int depth = ext_depth(inode); struct ext4_extent *ex; if (depth != level) { struct ext4_extent_idx *idx; idx = path[level].p_idx; while (idx <= EXT_MAX_INDEX(path[level].p_hdr)) { ext_debug(inode, "%d: move %d:%llu in new index %llu\n", level, le32_to_cpu(idx->ei_block), ext4_idx_pblock(idx), newblock); idx++; } return; } ex = path[depth].p_ext; while (ex <= EXT_MAX_EXTENT(path[depth].p_hdr)) { ext_debug(inode, "move %d:%llu:[%d]%d in new leaf %llu\n", le32_to_cpu(ex->ee_block), ext4_ext_pblock(ex), ext4_ext_is_unwritten(ex), ext4_ext_get_actual_len(ex), newblock); ex++; } } #else #define ext4_ext_show_path(inode, path) #define ext4_ext_show_leaf(inode, path) #define ext4_ext_show_move(inode, path, newblock, level) #endif /* * ext4_ext_binsearch_idx: * binary search for the closest index of the given block * the header must be checked before calling this */ static void ext4_ext_binsearch_idx(struct inode *inode, struct ext4_ext_path *path, ext4_lblk_t block) { struct ext4_extent_header *eh = path->p_hdr; struct ext4_extent_idx *r, *l, *m; ext_debug(inode, "binsearch for %u(idx): ", block); l = EXT_FIRST_INDEX(eh) + 1; r = EXT_LAST_INDEX(eh); while (l <= r) { m = l + (r - l) / 2; ext_debug(inode, "%p(%u):%p(%u):%p(%u) ", l, le32_to_cpu(l->ei_block), m, le32_to_cpu(m->ei_block), r, le32_to_cpu(r->ei_block)); if (block < le32_to_cpu(m->ei_block)) r = m - 1; else l = m + 1; } path->p_idx = l - 1; ext_debug(inode, " -> %u->%lld ", le32_to_cpu(path->p_idx->ei_block), ext4_idx_pblock(path->p_idx)); #ifdef CHECK_BINSEARCH { struct ext4_extent_idx *chix, *ix; int k; chix = ix = EXT_FIRST_INDEX(eh); for (k = 0; k < le16_to_cpu(eh->eh_entries); k++, ix++) { if (k != 0 && le32_to_cpu(ix->ei_block) <= le32_to_cpu(ix[-1].ei_block)) { printk(KERN_DEBUG "k=%d, ix=0x%p, " "first=0x%p\n", k, ix, EXT_FIRST_INDEX(eh)); printk(KERN_DEBUG "%u <= %u\n", le32_to_cpu(ix->ei_block), le32_to_cpu(ix[-1].ei_block)); } BUG_ON(k && le32_to_cpu(ix->ei_block) <= le32_to_cpu(ix[-1].ei_block)); if (block < le32_to_cpu(ix->ei_block)) break; chix = ix; } BUG_ON(chix != path->p_idx); } #endif } /* * ext4_ext_binsearch: * binary search for closest extent of the given block * the header must be checked before calling this */ static void ext4_ext_binsearch(struct inode *inode, struct ext4_ext_path *path, ext4_lblk_t block) { struct ext4_extent_header *eh = path->p_hdr; struct ext4_extent *r, *l, *m; if (eh->eh_entries == 0) { /* * this leaf is empty: * we get such a leaf in split/add case */ return; } ext_debug(inode, "binsearch for %u: ", block); l = EXT_FIRST_EXTENT(eh) + 1; r = EXT_LAST_EXTENT(eh); while (l <= r) { m = l + (r - l) / 2; ext_debug(inode, "%p(%u):%p(%u):%p(%u) ", l, le32_to_cpu(l->ee_block), m, le32_to_cpu(m->ee_block), r, le32_to_cpu(r->ee_block)); if (block < le32_to_cpu(m->ee_block)) r = m - 1; else l = m + 1; } path->p_ext = l - 1; ext_debug(inode, " -> %d:%llu:[%d]%d ", le32_to_cpu(path->p_ext->ee_block), ext4_ext_pblock(path->p_ext), ext4_ext_is_unwritten(path->p_ext), ext4_ext_get_actual_len(path->p_ext)); #ifdef CHECK_BINSEARCH { struct ext4_extent *chex, *ex; int k; chex = ex = EXT_FIRST_EXTENT(eh); for (k = 0; k < le16_to_cpu(eh->eh_entries); k++, ex++) { BUG_ON(k && le32_to_cpu(ex->ee_block) <= le32_to_cpu(ex[-1].ee_block)); if (block < le32_to_cpu(ex->ee_block)) break; chex = ex; } BUG_ON(chex != path->p_ext); } #endif } void ext4_ext_tree_init(handle_t *handle, struct inode *inode) { struct ext4_extent_header *eh; eh = ext_inode_hdr(inode); eh->eh_depth = 0; eh->eh_entries = 0; eh->eh_magic = EXT4_EXT_MAGIC; eh->eh_max = cpu_to_le16(ext4_ext_space_root(inode, 0)); eh->eh_generation = 0; ext4_mark_inode_dirty(handle, inode); } struct ext4_ext_path * ext4_find_extent(struct inode *inode, ext4_lblk_t block, struct ext4_ext_path **orig_path, int flags) { struct ext4_extent_header *eh; struct buffer_head *bh; struct ext4_ext_path *path = orig_path ? *orig_path : NULL; short int depth, i, ppos = 0; int ret; gfp_t gfp_flags = GFP_NOFS; if (flags & EXT4_EX_NOFAIL) gfp_flags |= __GFP_NOFAIL; eh = ext_inode_hdr(inode); depth = ext_depth(inode); if (depth < 0 || depth > EXT4_MAX_EXTENT_DEPTH) { EXT4_ERROR_INODE(inode, "inode has invalid extent depth: %d", depth); ret = -EFSCORRUPTED; goto err; } if (path) { ext4_ext_drop_refs(path); if (depth > path[0].p_maxdepth) { kfree(path); *orig_path = path = NULL; } } if (!path) { /* account possible depth increase */ path = kcalloc(depth + 2, sizeof(struct ext4_ext_path), gfp_flags); if (unlikely(!path)) return ERR_PTR(-ENOMEM); path[0].p_maxdepth = depth + 1; } path[0].p_hdr = eh; path[0].p_bh = NULL; i = depth; if (!(flags & EXT4_EX_NOCACHE) && depth == 0) ext4_cache_extents(inode, eh); /* walk through the tree */ while (i) { ext_debug(inode, "depth %d: num %d, max %d\n", ppos, le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max)); ext4_ext_binsearch_idx(inode, path + ppos, block); path[ppos].p_block = ext4_idx_pblock(path[ppos].p_idx); path[ppos].p_depth = i; path[ppos].p_ext = NULL; bh = read_extent_tree_block(inode, path[ppos].p_idx, --i, flags); if (IS_ERR(bh)) { ret = PTR_ERR(bh); goto err; } eh = ext_block_hdr(bh); ppos++; path[ppos].p_bh = bh; path[ppos].p_hdr = eh; } path[ppos].p_depth = i; path[ppos].p_ext = NULL; path[ppos].p_idx = NULL; /* find extent */ ext4_ext_binsearch(inode, path + ppos, block); /* if not an empty leaf */ if (path[ppos].p_ext) path[ppos].p_block = ext4_ext_pblock(path[ppos].p_ext); ext4_ext_show_path(inode, path); return path; err: ext4_free_ext_path(path); if (orig_path) *orig_path = NULL; return ERR_PTR(ret); } /* * ext4_ext_insert_index: * insert new index [@logical;@ptr] into the block at @curp; * check where to insert: before @curp or after @curp */ static int ext4_ext_insert_index(handle_t *handle, struct inode *inode, struct ext4_ext_path *curp, int logical, ext4_fsblk_t ptr) { struct ext4_extent_idx *ix; int len, err; err = ext4_ext_get_access(handle, inode, curp); if (err) return err; if (unlikely(logical == le32_to_cpu(curp->p_idx->ei_block))) { EXT4_ERROR_INODE(inode, "logical %d == ei_block %d!", logical, le32_to_cpu(curp->p_idx->ei_block)); return -EFSCORRUPTED; } if (unlikely(le16_to_cpu(curp->p_hdr->eh_entries) >= le16_to_cpu(curp->p_hdr->eh_max))) { EXT4_ERROR_INODE(inode, "eh_entries %d >= eh_max %d!", le16_to_cpu(curp->p_hdr->eh_entries), le16_to_cpu(curp->p_hdr->eh_max)); return -EFSCORRUPTED; } if (logical > le32_to_cpu(curp->p_idx->ei_block)) { /* insert after */ ext_debug(inode, "insert new index %d after: %llu\n", logical, ptr); ix = curp->p_idx + 1; } else { /* insert before */ ext_debug(inode, "insert new index %d before: %llu\n", logical, ptr); ix = curp->p_idx; } if (unlikely(ix > EXT_MAX_INDEX(curp->p_hdr))) { EXT4_ERROR_INODE(inode, "ix > EXT_MAX_INDEX!"); return -EFSCORRUPTED; } len = EXT_LAST_INDEX(curp->p_hdr) - ix + 1; BUG_ON(len < 0); if (len > 0) { ext_debug(inode, "insert new index %d: " "move %d indices from 0x%p to 0x%p\n", logical, len, ix, ix + 1); memmove(ix + 1, ix, len * sizeof(struct ext4_extent_idx)); } ix->ei_block = cpu_to_le32(logical); ext4_idx_store_pblock(ix, ptr); le16_add_cpu(&curp->p_hdr->eh_entries, 1); if (unlikely(ix > EXT_LAST_INDEX(curp->p_hdr))) { EXT4_ERROR_INODE(inode, "ix > EXT_LAST_INDEX!"); return -EFSCORRUPTED; } err = ext4_ext_dirty(handle, inode, curp); ext4_std_error(inode->i_sb, err); return err; } /* * ext4_ext_split: * inserts new subtree into the path, using free index entry * at depth @at: * - allocates all needed blocks (new leaf and all intermediate index blocks) * - makes decision where to split * - moves remaining extents and index entries (right to the split point) * into the newly allocated blocks * - initializes subtree */ static int ext4_ext_split(handle_t *handle, struct inode *inode, unsigned int flags, struct ext4_ext_path *path, struct ext4_extent *newext, int at) { struct buffer_head *bh = NULL; int depth = ext_depth(inode); struct ext4_extent_header *neh; struct ext4_extent_idx *fidx; int i = at, k, m, a; ext4_fsblk_t newblock, oldblock; __le32 border; ext4_fsblk_t *ablocks = NULL; /* array of allocated blocks */ gfp_t gfp_flags = GFP_NOFS; int err = 0; size_t ext_size = 0; if (flags & EXT4_EX_NOFAIL) gfp_flags |= __GFP_NOFAIL; /* make decision: where to split? */ /* FIXME: now decision is simplest: at current extent */ /* if current leaf will be split, then we should use * border from split point */ if (unlikely(path[depth].p_ext > EXT_MAX_EXTENT(path[depth].p_hdr))) { EXT4_ERROR_INODE(inode, "p_ext > EXT_MAX_EXTENT!"); return -EFSCORRUPTED; } if (path[depth].p_ext != EXT_MAX_EXTENT(path[depth].p_hdr)) { border = path[depth].p_ext[1].ee_block; ext_debug(inode, "leaf will be split." " next leaf starts at %d\n", le32_to_cpu(border)); } else { border = newext->ee_block; ext_debug(inode, "leaf will be added." " next leaf starts at %d\n", le32_to_cpu(border)); } /* * If error occurs, then we break processing * and mark filesystem read-only. index won't * be inserted and tree will be in consistent * state. Next mount will repair buffers too. */ /* * Get array to track all allocated blocks. * We need this to handle errors and free blocks * upon them. */ ablocks = kcalloc(depth, sizeof(ext4_fsblk_t), gfp_flags); if (!ablocks) return -ENOMEM; /* allocate all needed blocks */ ext_debug(inode, "allocate %d blocks for indexes/leaf\n", depth - at); for (a = 0; a < depth - at; a++) { newblock = ext4_ext_new_meta_block(handle, inode, path, newext, &err, flags); if (newblock == 0) goto cleanup; ablocks[a] = newblock; } /* initialize new leaf */ newblock = ablocks[--a]; if (unlikely(newblock == 0)) { EXT4_ERROR_INODE(inode, "newblock == 0!"); err = -EFSCORRUPTED; goto cleanup; } bh = sb_getblk_gfp(inode->i_sb, newblock, __GFP_MOVABLE | GFP_NOFS); if (unlikely(!bh)) { err = -ENOMEM; goto cleanup; } lock_buffer(bh); err = ext4_journal_get_create_access(handle, inode->i_sb, bh, EXT4_JTR_NONE); if (err) goto cleanup; neh = ext_block_hdr(bh); neh->eh_entries = 0; neh->eh_max = cpu_to_le16(ext4_ext_space_block(inode, 0)); neh->eh_magic = EXT4_EXT_MAGIC; neh->eh_depth = 0; neh->eh_generation = 0; /* move remainder of path[depth] to the new leaf */ if (unlikely(path[depth].p_hdr->eh_entries != path[depth].p_hdr->eh_max)) { EXT4_ERROR_INODE(inode, "eh_entries %d != eh_max %d!", path[depth].p_hdr->eh_entries, path[depth].p_hdr->eh_max); err = -EFSCORRUPTED; goto cleanup; } /* start copy from next extent */ m = EXT_MAX_EXTENT(path[depth].p_hdr) - path[depth].p_ext++; ext4_ext_show_move(inode, path, newblock, depth); if (m) { struct ext4_extent *ex; ex = EXT_FIRST_EXTENT(neh); memmove(ex, path[depth].p_ext, sizeof(struct ext4_extent) * m); le16_add_cpu(&neh->eh_entries, m); } /* zero out unused area in the extent block */ ext_size = sizeof(struct ext4_extent_header) + sizeof(struct ext4_extent) * le16_to_cpu(neh->eh_entries); memset(bh->b_data + ext_size, 0, inode->i_sb->s_blocksize - ext_size); ext4_extent_block_csum_set(inode, neh); set_buffer_uptodate(bh); unlock_buffer(bh); err = ext4_handle_dirty_metadata(handle, inode, bh); if (err) goto cleanup; brelse(bh); bh = NULL; /* correct old leaf */ if (m) { err = ext4_ext_get_access(handle, inode, path + depth); if (err) goto cleanup; le16_add_cpu(&path[depth].p_hdr->eh_entries, -m); err = ext4_ext_dirty(handle, inode, path + depth); if (err) goto cleanup; } /* create intermediate indexes */ k = depth - at - 1; if (unlikely(k < 0)) { EXT4_ERROR_INODE(inode, "k %d < 0!", k); err = -EFSCORRUPTED; goto cleanup; } if (k) ext_debug(inode, "create %d intermediate indices\n", k); /* insert new index into current index block */ /* current depth stored in i var */ i = depth - 1; while (k--) { oldblock = newblock; newblock = ablocks[--a]; bh = sb_getblk(inode->i_sb, newblock); if (unlikely(!bh)) { err = -ENOMEM; goto cleanup; } lock_buffer(bh); err = ext4_journal_get_create_access(handle, inode->i_sb, bh, EXT4_JTR_NONE); if (err) goto cleanup; neh = ext_block_hdr(bh); neh->eh_entries = cpu_to_le16(1); neh->eh_magic = EXT4_EXT_MAGIC; neh->eh_max = cpu_to_le16(ext4_ext_space_block_idx(inode, 0)); neh->eh_depth = cpu_to_le16(depth - i); neh->eh_generation = 0; fidx = EXT_FIRST_INDEX(neh); fidx->ei_block = border; ext4_idx_store_pblock(fidx, oldblock); ext_debug(inode, "int.index at %d (block %llu): %u -> %llu\n", i, newblock, le32_to_cpu(border), oldblock); /* move remainder of path[i] to the new index block */ if (unlikely(EXT_MAX_INDEX(path[i].p_hdr) != EXT_LAST_INDEX(path[i].p_hdr))) { EXT4_ERROR_INODE(inode, "EXT_MAX_INDEX != EXT_LAST_INDEX ee_block %d!", le32_to_cpu(path[i].p_ext->ee_block)); err = -EFSCORRUPTED; goto cleanup; } /* start copy indexes */ m = EXT_MAX_INDEX(path[i].p_hdr) - path[i].p_idx++; ext_debug(inode, "cur 0x%p, last 0x%p\n", path[i].p_idx, EXT_MAX_INDEX(path[i].p_hdr)); ext4_ext_show_move(inode, path, newblock, i); if (m) { memmove(++fidx, path[i].p_idx, sizeof(struct ext4_extent_idx) * m); le16_add_cpu(&neh->eh_entries, m); } /* zero out unused area in the extent block */ ext_size = sizeof(struct ext4_extent_header) + (sizeof(struct ext4_extent) * le16_to_cpu(neh->eh_entries)); memset(bh->b_data + ext_size, 0, inode->i_sb->s_blocksize - ext_size); ext4_extent_block_csum_set(inode, neh); set_buffer_uptodate(bh); unlock_buffer(bh); err = ext4_handle_dirty_metadata(handle, inode, bh); if (err) goto cleanup; brelse(bh); bh = NULL; /* correct old index */ if (m) { err = ext4_ext_get_access(handle, inode, path + i); if (err) goto cleanup; le16_add_cpu(&path[i].p_hdr->eh_entries, -m); err = ext4_ext_dirty(handle, inode, path + i); if (err) goto cleanup; } i--; } /* insert new index */ err = ext4_ext_insert_index(handle, inode, path + at, le32_to_cpu(border), newblock); cleanup: if (bh) { if (buffer_locked(bh)) unlock_buffer(bh); brelse(bh); } if (err) { /* free all allocated blocks in error case */ for (i = 0; i < depth; i++) { if (!ablocks[i]) continue; ext4_free_blocks(handle, inode, NULL, ablocks[i], 1, EXT4_FREE_BLOCKS_METADATA); } } kfree(ablocks); return err; } /* * ext4_ext_grow_indepth: * implements tree growing procedure: * - allocates new block * - moves top-level data (index block or leaf) into the new block * - initializes new top-level, creating index that points to the * just created block */ static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode, unsigned int flags) { struct ext4_extent_header *neh; struct buffer_head *bh; ext4_fsblk_t newblock, goal = 0; struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es; int err = 0; size_t ext_size = 0; /* Try to prepend new index to old one */ if (ext_depth(inode)) goal = ext4_idx_pblock(EXT_FIRST_INDEX(ext_inode_hdr(inode))); if (goal > le32_to_cpu(es->s_first_data_block)) { flags |= EXT4_MB_HINT_TRY_GOAL; goal--; } else goal = ext4_inode_to_goal_block(inode); newblock = ext4_new_meta_blocks(handle, inode, goal, flags, NULL, &err); if (newblock == 0) return err; bh = sb_getblk_gfp(inode->i_sb, newblock, __GFP_MOVABLE | GFP_NOFS); if (unlikely(!bh)) return -ENOMEM; lock_buffer(bh); err = ext4_journal_get_create_access(handle, inode->i_sb, bh, EXT4_JTR_NONE); if (err) { unlock_buffer(bh); goto out; } ext_size = sizeof(EXT4_I(inode)->i_data); /* move top-level index/leaf into new block */ memmove(bh->b_data, EXT4_I(inode)->i_data, ext_size); /* zero out unused area in the extent block */ memset(bh->b_data + ext_size, 0, inode->i_sb->s_blocksize - ext_size); /* set size of new block */ neh = ext_block_hdr(bh); /* old root could have indexes or leaves * so calculate e_max right way */ if (ext_depth(inode)) neh->eh_max = cpu_to_le16(ext4_ext_space_block_idx(inode, 0)); else neh->eh_max = cpu_to_le16(ext4_ext_space_block(inode, 0)); neh->eh_magic = EXT4_EXT_MAGIC; ext4_extent_block_csum_set(inode, neh); set_buffer_uptodate(bh); set_buffer_verified(bh); unlock_buffer(bh); err = ext4_handle_dirty_metadata(handle, inode, bh); if (err) goto out; /* Update top-level index: num,max,pointer */ neh = ext_inode_hdr(inode); neh->eh_entries = cpu_to_le16(1); ext4_idx_store_pblock(EXT_FIRST_INDEX(neh), newblock); if (neh->eh_depth == 0) { /* Root extent block becomes index block */ neh->eh_max = cpu_to_le16(ext4_ext_space_root_idx(inode, 0)); EXT_FIRST_INDEX(neh)->ei_block = EXT_FIRST_EXTENT(neh)->ee_block; } ext_debug(inode, "new root: num %d(%d), lblock %d, ptr %llu\n", le16_to_cpu(neh->eh_entries), le16_to_cpu(neh->eh_max), le32_to_cpu(EXT_FIRST_INDEX(neh)->ei_block), ext4_idx_pblock(EXT_FIRST_INDEX(neh))); le16_add_cpu(&neh->eh_depth, 1); err = ext4_mark_inode_dirty(handle, inode); out: brelse(bh); return err; } /* * ext4_ext_create_new_leaf: * finds empty index and adds new leaf. * if no free index is found, then it requests in-depth growing. */ static int ext4_ext_create_new_leaf(handle_t *handle, struct inode *inode, unsigned int mb_flags, unsigned int gb_flags, struct ext4_ext_path **ppath, struct ext4_extent *newext) { struct ext4_ext_path *path = *ppath; struct ext4_ext_path *curp; int depth, i, err = 0; repeat: i = depth = ext_depth(inode); /* walk up to the tree and look for free index entry */ curp = path + depth; while (i > 0 && !EXT_HAS_FREE_INDEX(curp)) { i--; curp--; } /* we use already allocated block for index block, * so subsequent data blocks should be contiguous */ if (EXT_HAS_FREE_INDEX(curp)) { /* if we found index with free entry, then use that * entry: create all needed subtree and add new leaf */ err = ext4_ext_split(handle, inode, mb_flags, path, newext, i); if (err) goto out; /* refill path */ path = ext4_find_extent(inode, (ext4_lblk_t)le32_to_cpu(newext->ee_block), ppath, gb_flags); if (IS_ERR(path)) err = PTR_ERR(path); } else { /* tree is full, time to grow in depth */ err = ext4_ext_grow_indepth(handle, inode, mb_flags); if (err) goto out; /* refill path */ path = ext4_find_extent(inode, (ext4_lblk_t)le32_to_cpu(newext->ee_block), ppath, gb_flags); if (IS_ERR(path)) { err = PTR_ERR(path); goto out; } /* * only first (depth 0 -> 1) produces free space; * in all other cases we have to split the grown tree */ depth = ext_depth(inode); if (path[depth].p_hdr->eh_entries == path[depth].p_hdr->eh_max) { /* now we need to split */ goto repeat; } } out: return err; } /* * search the closest allocated block to the left for *logical * and returns it at @logical + it's physical address at @phys * if *logical is the smallest allocated block, the function * returns 0 at @phys * return value contains 0 (success) or error code */ static int ext4_ext_search_left(struct inode *inode, struct ext4_ext_path *path, ext4_lblk_t *logical, ext4_fsblk_t *phys) { struct ext4_extent_idx *ix; struct ext4_extent *ex; int depth, ee_len; if (unlikely(path == NULL)) { EXT4_ERROR_INODE(inode, "path == NULL *logical %d!", *logical); return -EFSCORRUPTED; } depth = path->p_depth; *phys = 0; if (depth == 0 && path->p_ext == NULL) return 0; /* usually extent in the path covers blocks smaller * then *logical, but it can be that extent is the * first one in the file */ ex = path[depth].p_ext; ee_len = ext4_ext_get_actual_len(ex); if (*logical < le32_to_cpu(ex->ee_block)) { if (unlikely(EXT_FIRST_EXTENT(path[depth].p_hdr) != ex)) { EXT4_ERROR_INODE(inode, "EXT_FIRST_EXTENT != ex *logical %d ee_block %d!", *logical, le32_to_cpu(ex->ee_block)); return -EFSCORRUPTED; } while (--depth >= 0) { ix = path[depth].p_idx; if (unlikely(ix != EXT_FIRST_INDEX(path[depth].p_hdr))) { EXT4_ERROR_INODE(inode, "ix (%d) != EXT_FIRST_INDEX (%d) (depth %d)!", ix != NULL ? le32_to_cpu(ix->ei_block) : 0, le32_to_cpu(EXT_FIRST_INDEX(path[depth].p_hdr)->ei_block), depth); return -EFSCORRUPTED; } } return 0; } if (unlikely(*logical < (le32_to_cpu(ex->ee_block) + ee_len))) { EXT4_ERROR_INODE(inode, "logical %d < ee_block %d + ee_len %d!", *logical, le32_to_cpu(ex->ee_block), ee_len); return -EFSCORRUPTED; } *logical = le32_to_cpu(ex->ee_block) + ee_len - 1; *phys = ext4_ext_pblock(ex) + ee_len - 1; return 0; } /* * Search the closest allocated block to the right for *logical * and returns it at @logical + it's physical address at @phys. * If not exists, return 0 and @phys is set to 0. We will return * 1 which means we found an allocated block and ret_ex is valid. * Or return a (< 0) error code. */ static int ext4_ext_search_right(struct inode *inode, struct ext4_ext_path *path, ext4_lblk_t *logical, ext4_fsblk_t *phys, struct ext4_extent *ret_ex) { struct buffer_head *bh = NULL; struct ext4_extent_header *eh; struct ext4_extent_idx *ix; struct ext4_extent *ex; int depth; /* Note, NOT eh_depth; depth from top of tree */ int ee_len; if (unlikely(path == NULL)) { EXT4_ERROR_INODE(inode, "path == NULL *logical %d!", *logical); return -EFSCORRUPTED; } depth = path->p_depth; *phys = 0; if (depth == 0 && path->p_ext == NULL) return 0; /* usually extent in the path covers blocks smaller * then *logical, but it can be that extent is the * first one in the file */ ex = path[depth].p_ext; ee_len = ext4_ext_get_actual_len(ex); if (*logical < le32_to_cpu(ex->ee_block)) { if (unlikely(EXT_FIRST_EXTENT(path[depth].p_hdr) != ex)) { EXT4_ERROR_INODE(inode, "first_extent(path[%d].p_hdr) != ex", depth); return -EFSCORRUPTED; } while (--depth >= 0) { ix = path[depth].p_idx; if (unlikely(ix != EXT_FIRST_INDEX(path[depth].p_hdr))) { EXT4_ERROR_INODE(inode, "ix != EXT_FIRST_INDEX *logical %d!", *logical); return -EFSCORRUPTED; } } goto found_extent; } if (unlikely(*logical < (le32_to_cpu(ex->ee_block) + ee_len))) { EXT4_ERROR_INODE(inode, "logical %d < ee_block %d + ee_len %d!", *logical, le32_to_cpu(ex->ee_block), ee_len); return -EFSCORRUPTED; } if (ex != EXT_LAST_EXTENT(path[depth].p_hdr)) { /* next allocated block in this leaf */ ex++; goto found_extent; } /* go up and search for index to the right */ while (--depth >= 0) { ix = path[depth].p_idx; if (ix != EXT_LAST_INDEX(path[depth].p_hdr)) goto got_index; } /* we've gone up to the root and found no index to the right */ return 0; got_index: /* we've found index to the right, let's * follow it and find the closest allocated * block to the right */ ix++; while (++depth < path->p_depth) { /* subtract from p_depth to get proper eh_depth */ bh = read_extent_tree_block(inode, ix, path->p_depth - depth, 0); if (IS_ERR(bh)) return PTR_ERR(bh); eh = ext_block_hdr(bh); ix = EXT_FIRST_INDEX(eh); put_bh(bh); } bh = read_extent_tree_block(inode, ix, path->p_depth - depth, 0); if (IS_ERR(bh)) return PTR_ERR(bh); eh = ext_block_hdr(bh); ex = EXT_FIRST_EXTENT(eh); found_extent: *logical = le32_to_cpu(ex->ee_block); *phys = ext4_ext_pblock(ex); if (ret_ex) *ret_ex = *ex; if (bh) put_bh(bh); return 1; } /* * ext4_ext_next_allocated_block: * returns allocated block in subsequent extent or EXT_MAX_BLOCKS. * NOTE: it considers block number from index entry as * allocated block. Thus, index entries have to be consistent * with leaves. */ ext4_lblk_t ext4_ext_next_allocated_block(struct ext4_ext_path *path) { int depth; BUG_ON(path == NULL); depth = path->p_depth; if (depth == 0 && path->p_ext == NULL) return EXT_MAX_BLOCKS; while (depth >= 0) { struct ext4_ext_path *p = &path[depth]; if (depth == path->p_depth) { /* leaf */ if (p->p_ext && p->p_ext != EXT_LAST_EXTENT(p->p_hdr)) return le32_to_cpu(p->p_ext[1].ee_block); } else { /* index */ if (p->p_idx != EXT_LAST_INDEX(p->p_hdr)) return le32_to_cpu(p->p_idx[1].ei_block); } depth--; } return EXT_MAX_BLOCKS; } /* * ext4_ext_next_leaf_block: * returns first allocated block from next leaf or EXT_MAX_BLOCKS */ static ext4_lblk_t ext4_ext_next_leaf_block(struct ext4_ext_path *path) { int depth; BUG_ON(path == NULL); depth = path->p_depth; /* zero-tree has no leaf blocks at all */ if (depth == 0) return EXT_MAX_BLOCKS; /* go to index block */ depth--; while (depth >= 0) { if (path[depth].p_idx != EXT_LAST_INDEX(path[depth].p_hdr)) return (ext4_lblk_t) le32_to_cpu(path[depth].p_idx[1].ei_block); depth--; } return EXT_MAX_BLOCKS; } /* * ext4_ext_correct_indexes: * if leaf gets modified and modified extent is first in the leaf, * then we have to correct all indexes above. * TODO: do we need to correct tree in all cases? */ static int ext4_ext_correct_indexes(handle_t *handle, struct inode *inode, struct ext4_ext_path *path) { struct ext4_extent_header *eh; int depth = ext_depth(inode); struct ext4_extent *ex; __le32 border; int k, err = 0; eh = path[depth].p_hdr; ex = path[depth].p_ext; if (unlikely(ex == NULL || eh == NULL)) { EXT4_ERROR_INODE(inode, "ex %p == NULL or eh %p == NULL", ex, eh); return -EFSCORRUPTED; } if (depth == 0) { /* there is no tree at all */ return 0; } if (ex != EXT_FIRST_EXTENT(eh)) { /* we correct tree if first leaf got modified only */ return 0; } /* * TODO: we need correction if border is smaller than current one */ k = depth - 1; border = path[depth].p_ext->ee_block; err = ext4_ext_get_access(handle, inode, path + k); if (err) return err; path[k].p_idx->ei_block = border; err = ext4_ext_dirty(handle, inode, path + k); if (err) return err; while (k--) { /* change all left-side indexes */ if (path[k+1].p_idx != EXT_FIRST_INDEX(path[k+1].p_hdr)) break; err = ext4_ext_get_access(handle, inode, path + k); if (err) break; path[k].p_idx->ei_block = border; err = ext4_ext_dirty(handle, inode, path + k); if (err) break; } return err; } static int ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1, struct ext4_extent *ex2) { unsigned short ext1_ee_len, ext2_ee_len; if (ext4_ext_is_unwritten(ex1) != ext4_ext_is_unwritten(ex2)) return 0; ext1_ee_len = ext4_ext_get_actual_len(ex1); ext2_ee_len = ext4_ext_get_actual_len(ex2); if (le32_to_cpu(ex1->ee_block) + ext1_ee_len != le32_to_cpu(ex2->ee_block)) return 0; if (ext1_ee_len + ext2_ee_len > EXT_INIT_MAX_LEN) return 0; if (ext4_ext_is_unwritten(ex1) && ext1_ee_len + ext2_ee_len > EXT_UNWRITTEN_MAX_LEN) return 0; #ifdef AGGRESSIVE_TEST if (ext1_ee_len >= 4) return 0; #endif if (ext4_ext_pblock(ex1) + ext1_ee_len == ext4_ext_pblock(ex2)) return 1; return 0; } /* * This function tries to merge the "ex" extent to the next extent in the tree. * It always tries to merge towards right. If you want to merge towards * left, pass "ex - 1" as argument instead of "ex". * Returns 0 if the extents (ex and ex+1) were _not_ merged and returns * 1 if they got merged. */ static int ext4_ext_try_to_merge_right(struct inode *inode, struct ext4_ext_path *path, struct ext4_extent *ex) { struct ext4_extent_header *eh; unsigned int depth, len; int merge_done = 0, unwritten; depth = ext_depth(inode); BUG_ON(path[depth].p_hdr == NULL); eh = path[depth].p_hdr; while (ex < EXT_LAST_EXTENT(eh)) { if (!ext4_can_extents_be_merged(inode, ex, ex + 1)) break; /* merge with next extent! */ unwritten = ext4_ext_is_unwritten(ex); ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex) + ext4_ext_get_actual_len(ex + 1)); if (unwritten) ext4_ext_mark_unwritten(ex); if (ex + 1 < EXT_LAST_EXTENT(eh)) { len = (EXT_LAST_EXTENT(eh) - ex - 1) * sizeof(struct ext4_extent); memmove(ex + 1, ex + 2, len); } le16_add_cpu(&eh->eh_entries, -1); merge_done = 1; WARN_ON(eh->eh_entries == 0); if (!eh->eh_entries) EXT4_ERROR_INODE(inode, "eh->eh_entries = 0!"); } return merge_done; } /* * This function does a very simple check to see if we can collapse * an extent tree with a single extent tree leaf block into the inode. */ static void ext4_ext_try_to_merge_up(handle_t *handle, struct inode *inode, struct ext4_ext_path *path) { size_t s; unsigned max_root = ext4_ext_space_root(inode, 0); ext4_fsblk_t blk; if ((path[0].p_depth != 1) || (le16_to_cpu(path[0].p_hdr->eh_entries) != 1) || (le16_to_cpu(path[1].p_hdr->eh_entries) > max_root)) return; /* * We need to modify the block allocation bitmap and the block * group descriptor to release the extent tree block. If we * can't get the journal credits, give up. */ if (ext4_journal_extend(handle, 2, ext4_free_metadata_revoke_credits(inode->i_sb, 1))) return; /* * Copy the extent data up to the inode */ blk = ext4_idx_pblock(path[0].p_idx); s = le16_to_cpu(path[1].p_hdr->eh_entries) * sizeof(struct ext4_extent_idx); s += sizeof(struct ext4_extent_header); path[1].p_maxdepth = path[0].p_maxdepth; memcpy(path[0].p_hdr, path[1].p_hdr, s); path[0].p_depth = 0; path[0].p_ext = EXT_FIRST_EXTENT(path[0].p_hdr) + (path[1].p_ext - EXT_FIRST_EXTENT(path[1].p_hdr)); path[0].p_hdr->eh_max = cpu_to_le16(max_root); brelse(path[1].p_bh); ext4_free_blocks(handle, inode, NULL, blk, 1, EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET); } /* * This function tries to merge the @ex extent to neighbours in the tree, then * tries to collapse the extent tree into the inode. */ static void ext4_ext_try_to_merge(handle_t *handle, struct inode *inode, struct ext4_ext_path *path, struct ext4_extent *ex) { struct ext4_extent_header *eh; unsigned int depth; int merge_done = 0; depth = ext_depth(inode); BUG_ON(path[depth].p_hdr == NULL); eh = path[depth].p_hdr; if (ex > EXT_FIRST_EXTENT(eh)) merge_done = ext4_ext_try_to_merge_right(inode, path, ex - 1); if (!merge_done) (void) ext4_ext_try_to_merge_right(inode, path, ex); ext4_ext_try_to_merge_up(handle, inode, path); } /* * check if a portion of the "newext" extent overlaps with an * existing extent. * * If there is an overlap discovered, it updates the length of the newext * such that there will be no overlap, and then returns 1. * If there is no overlap found, it returns 0. */ static unsigned int ext4_ext_check_overlap(struct ext4_sb_info *sbi, struct inode *inode, struct ext4_extent *newext, struct ext4_ext_path *path) { ext4_lblk_t b1, b2; unsigned int depth, len1; unsigned int ret = 0; b1 = le32_to_cpu(newext->ee_block); len1 = ext4_ext_get_actual_len(newext); depth = ext_depth(inode); if (!path[depth].p_ext) goto out; b2 = EXT4_LBLK_CMASK(sbi, le32_to_cpu(path[depth].p_ext->ee_block)); /* * get the next allocated block if the extent in the path * is before the requested block(s) */ if (b2 < b1) { b2 = ext4_ext_next_allocated_block(path); if (b2 == EXT_MAX_BLOCKS) goto out; b2 = EXT4_LBLK_CMASK(sbi, b2); } /* check for wrap through zero on extent logical start block*/ if (b1 + len1 < b1) { len1 = EXT_MAX_BLOCKS - b1; newext->ee_len = cpu_to_le16(len1); ret = 1; } /* check for overlap */ if (b1 + len1 > b2) { newext->ee_len = cpu_to_le16(b2 - b1); ret = 1; } out: return ret; } /* * ext4_ext_insert_extent: * tries to merge requested extent into the existing extent or * inserts requested extent as new one into the tree, * creating new leaf in the no-space case. */ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode, struct ext4_ext_path **ppath, struct ext4_extent *newext, int gb_flags) { struct ext4_ext_path *path = *ppath; struct ext4_extent_header *eh; struct ext4_extent *ex, *fex; struct ext4_extent *nearex; /* nearest extent */ struct ext4_ext_path *npath = NULL; int depth, len, err; ext4_lblk_t next; int mb_flags = 0, unwritten; if (gb_flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) mb_flags |= EXT4_MB_DELALLOC_RESERVED; if (unlikely(ext4_ext_get_actual_len(newext) == 0)) { EXT4_ERROR_INODE(inode, "ext4_ext_get_actual_len(newext) == 0"); return -EFSCORRUPTED; } depth = ext_depth(inode); ex = path[depth].p_ext; eh = path[depth].p_hdr; if (unlikely(path[depth].p_hdr == NULL)) { EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth); return -EFSCORRUPTED; } /* try to insert block into found extent and return */ if (ex && !(gb_flags & EXT4_GET_BLOCKS_PRE_IO)) { /* * Try to see whether we should rather test the extent on * right from ex, or from the left of ex. This is because * ext4_find_extent() can return either extent on the * left, or on the right from the searched position. This * will make merging more effective. */ if (ex < EXT_LAST_EXTENT(eh) && (le32_to_cpu(ex->ee_block) + ext4_ext_get_actual_len(ex) < le32_to_cpu(newext->ee_block))) { ex += 1; goto prepend; } else if ((ex > EXT_FIRST_EXTENT(eh)) && (le32_to_cpu(newext->ee_block) + ext4_ext_get_actual_len(newext) < le32_to_cpu(ex->ee_block))) ex -= 1; /* Try to append newex to the ex */ if (ext4_can_extents_be_merged(inode, ex, newext)) { ext_debug(inode, "append [%d]%d block to %u:[%d]%d" "(from %llu)\n", ext4_ext_is_unwritten(newext), ext4_ext_get_actual_len(newext), le32_to_cpu(ex->ee_block), ext4_ext_is_unwritten(ex), ext4_ext_get_actual_len(ex), ext4_ext_pblock(ex)); err = ext4_ext_get_access(handle, inode, path + depth); if (err) return err; unwritten = ext4_ext_is_unwritten(ex); ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex) + ext4_ext_get_actual_len(newext)); if (unwritten) ext4_ext_mark_unwritten(ex); nearex = ex; goto merge; } prepend: /* Try to prepend newex to the ex */ if (ext4_can_extents_be_merged(inode, newext, ex)) { ext_debug(inode, "prepend %u[%d]%d block to %u:[%d]%d" "(from %llu)\n", le32_to_cpu(newext->ee_block), ext4_ext_is_unwritten(newext), ext4_ext_get_actual_len(newext), le32_to_cpu(ex->ee_block), ext4_ext_is_unwritten(ex), ext4_ext_get_actual_len(ex), ext4_ext_pblock(ex)); err = ext4_ext_get_access(handle, inode, path + depth); if (err) return err; unwritten = ext4_ext_is_unwritten(ex); ex->ee_block = newext->ee_block; ext4_ext_store_pblock(ex, ext4_ext_pblock(newext)); ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex) + ext4_ext_get_actual_len(newext)); if (unwritten) ext4_ext_mark_unwritten(ex); nearex = ex; goto merge; } } depth = ext_depth(inode); eh = path[depth].p_hdr; if (le16_to_cpu(eh->eh_entries) < le16_to_cpu(eh->eh_max)) goto has_space; /* probably next leaf has space for us? */ fex = EXT_LAST_EXTENT(eh); next = EXT_MAX_BLOCKS; if (le32_to_cpu(newext->ee_block) > le32_to_cpu(fex->ee_block)) next = ext4_ext_next_leaf_block(path); if (next != EXT_MAX_BLOCKS) { ext_debug(inode, "next leaf block - %u\n", next); BUG_ON(npath != NULL); npath = ext4_find_extent(inode, next, NULL, gb_flags); if (IS_ERR(npath)) return PTR_ERR(npath); BUG_ON(npath->p_depth != path->p_depth); eh = npath[depth].p_hdr; if (le16_to_cpu(eh->eh_entries) < le16_to_cpu(eh->eh_max)) { ext_debug(inode, "next leaf isn't full(%d)\n", le16_to_cpu(eh->eh_entries)); path = npath; goto has_space; } ext_debug(inode, "next leaf has no free space(%d,%d)\n", le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max)); } /* * There is no free space in the found leaf. * We're gonna add a new leaf in the tree. */ if (gb_flags & EXT4_GET_BLOCKS_METADATA_NOFAIL) mb_flags |= EXT4_MB_USE_RESERVED; err = ext4_ext_create_new_leaf(handle, inode, mb_flags, gb_flags, ppath, newext); if (err) goto cleanup; depth = ext_depth(inode); eh = path[depth].p_hdr; has_space: nearex = path[depth].p_ext; err = ext4_ext_get_access(handle, inode, path + depth); if (err) goto cleanup; if (!nearex) { /* there is no extent in this leaf, create first one */ ext_debug(inode, "first extent in the leaf: %u:%llu:[%d]%d\n", le32_to_cpu(newext->ee_block), ext4_ext_pblock(newext), ext4_ext_is_unwritten(newext), ext4_ext_get_actual_len(newext)); nearex = EXT_FIRST_EXTENT(eh); } else { if (le32_to_cpu(newext->ee_block) > le32_to_cpu(nearex->ee_block)) { /* Insert after */ ext_debug(inode, "insert %u:%llu:[%d]%d before: " "nearest %p\n", le32_to_cpu(newext->ee_block), ext4_ext_pblock(newext), ext4_ext_is_unwritten(newext), ext4_ext_get_actual_len(newext), nearex); nearex++; } else { /* Insert before */ BUG_ON(newext->ee_block == nearex->ee_block); ext_debug(inode, "insert %u:%llu:[%d]%d after: " "nearest %p\n", le32_to_cpu(newext->ee_block), ext4_ext_pblock(newext), ext4_ext_is_unwritten(newext), ext4_ext_get_actual_len(newext), nearex); } len = EXT_LAST_EXTENT(eh) - nearex + 1; if (len > 0) { ext_debug(inode, "insert %u:%llu:[%d]%d: " "move %d extents from 0x%p to 0x%p\n", le32_to_cpu(newext->ee_block), ext4_ext_pblock(newext), ext4_ext_is_unwritten(newext), ext4_ext_get_actual_len(newext), len, nearex, nearex + 1); memmove(nearex + 1, nearex, len * sizeof(struct ext4_extent)); } } le16_add_cpu(&eh->eh_entries, 1); path[depth].p_ext = nearex; nearex->ee_block = newext->ee_block; ext4_ext_store_pblock(nearex, ext4_ext_pblock(newext)); nearex->ee_len = newext->ee_len; merge: /* try to merge extents */ if (!(gb_flags & EXT4_GET_BLOCKS_PRE_IO)) ext4_ext_try_to_merge(handle, inode, path, nearex); /* time to correct all indexes above */ err = ext4_ext_correct_indexes(handle, inode, path); if (err) goto cleanup; err = ext4_ext_dirty(handle, inode, path + path->p_depth); cleanup: ext4_free_ext_path(npath); return err; } static int ext4_fill_es_cache_info(struct inode *inode, ext4_lblk_t block, ext4_lblk_t num, struct fiemap_extent_info *fieinfo) { ext4_lblk_t next, end = block + num - 1; struct extent_status es; unsigned char blksize_bits = inode->i_sb->s_blocksize_bits; unsigned int flags; int err; while (block <= end) { next = 0; flags = 0; if (!ext4_es_lookup_extent(inode, block, &next, &es)) break; if (ext4_es_is_unwritten(&es)) flags |= FIEMAP_EXTENT_UNWRITTEN; if (ext4_es_is_delayed(&es)) flags |= (FIEMAP_EXTENT_DELALLOC | FIEMAP_EXTENT_UNKNOWN); if (ext4_es_is_hole(&es)) flags |= EXT4_FIEMAP_EXTENT_HOLE; if (next == 0) flags |= FIEMAP_EXTENT_LAST; if (flags & (FIEMAP_EXTENT_DELALLOC| EXT4_FIEMAP_EXTENT_HOLE)) es.es_pblk = 0; else es.es_pblk = ext4_es_pblock(&es); err = fiemap_fill_next_extent(fieinfo, (__u64)es.es_lblk << blksize_bits, (__u64)es.es_pblk << blksize_bits, (__u64)es.es_len << blksize_bits, flags); if (next == 0) break; block = next; if (err < 0) return err; if (err == 1) return 0; } return 0; } /* * ext4_ext_find_hole - find hole around given block according to the given path * @inode: inode we lookup in * @path: path in extent tree to @lblk * @lblk: pointer to logical block around which we want to determine hole * * Determine hole length (and start if easily possible) around given logical * block. We don't try too hard to find the beginning of the hole but @path * actually points to extent before @lblk, we provide it. * * The function returns the length of a hole starting at @lblk. We update @lblk * to the beginning of the hole if we managed to find it. */ static ext4_lblk_t ext4_ext_find_hole(struct inode *inode, struct ext4_ext_path *path, ext4_lblk_t *lblk) { int depth = ext_depth(inode); struct ext4_extent *ex; ext4_lblk_t len; ex = path[depth].p_ext; if (ex == NULL) { /* there is no extent yet, so gap is [0;-] */ *lblk = 0; len = EXT_MAX_BLOCKS; } else if (*lblk < le32_to_cpu(ex->ee_block)) { len = le32_to_cpu(ex->ee_block) - *lblk; } else if (*lblk >= le32_to_cpu(ex->ee_block) + ext4_ext_get_actual_len(ex)) { ext4_lblk_t next; *lblk = le32_to_cpu(ex->ee_block) + ext4_ext_get_actual_len(ex); next = ext4_ext_next_allocated_block(path); BUG_ON(next == *lblk); len = next - *lblk; } else { BUG(); } return len; } /* * ext4_ext_rm_idx: * removes index from the index block. */ static int ext4_ext_rm_idx(handle_t *handle, struct inode *inode, struct ext4_ext_path *path, int depth) { int err; ext4_fsblk_t leaf; /* free index block */ depth--; path = path + depth; leaf = ext4_idx_pblock(path->p_idx); if (unlikely(path->p_hdr->eh_entries == 0)) { EXT4_ERROR_INODE(inode, "path->p_hdr->eh_entries == 0"); return -EFSCORRUPTED; } err = ext4_ext_get_access(handle, inode, path); if (err) return err; if (path->p_idx != EXT_LAST_INDEX(path->p_hdr)) { int len = EXT_LAST_INDEX(path->p_hdr) - path->p_idx; len *= sizeof(struct ext4_extent_idx); memmove(path->p_idx, path->p_idx + 1, len); } le16_add_cpu(&path->p_hdr->eh_entries, -1); err = ext4_ext_dirty(handle, inode, path); if (err) return err; ext_debug(inode, "index is empty, remove it, free block %llu\n", leaf); trace_ext4_ext_rm_idx(inode, leaf); ext4_free_blocks(handle, inode, NULL, leaf, 1, EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET); while (--depth >= 0) { if (path->p_idx != EXT_FIRST_INDEX(path->p_hdr)) break; path--; err = ext4_ext_get_access(handle, inode, path); if (err) break; path->p_idx->ei_block = (path+1)->p_idx->ei_block; err = ext4_ext_dirty(handle, inode, path); if (err) break; } return err; } /* * ext4_ext_calc_credits_for_single_extent: * This routine returns max. credits that needed to insert an extent * to the extent tree. * When pass the actual path, the caller should calculate credits * under i_data_sem. */ int ext4_ext_calc_credits_for_single_extent(struct inode *inode, int nrblocks, struct ext4_ext_path *path) { if (path) { int depth = ext_depth(inode); int ret = 0; /* probably there is space in leaf? */ if (le16_to_cpu(path[depth].p_hdr->eh_entries) < le16_to_cpu(path[depth].p_hdr->eh_max)) { /* * There are some space in the leaf tree, no * need to account for leaf block credit * * bitmaps and block group descriptor blocks * and other metadata blocks still need to be * accounted. */ /* 1 bitmap, 1 block group descriptor */ ret = 2 + EXT4_META_TRANS_BLOCKS(inode->i_sb); return ret; } } return ext4_chunk_trans_blocks(inode, nrblocks); } /* * How many index/leaf blocks need to change/allocate to add @extents extents? * * If we add a single extent, then in the worse case, each tree level * index/leaf need to be changed in case of the tree split. * * If more extents are inserted, they could cause the whole tree split more * than once, but this is really rare. */ int ext4_ext_index_trans_blocks(struct inode *inode, int extents) { int index; int depth; /* If we are converting the inline data, only one is needed here. */ if (ext4_has_inline_data(inode)) return 1; depth = ext_depth(inode); if (extents <= 1) index = depth * 2; else index = depth * 3; return index; } static inline int get_default_free_blocks_flags(struct inode *inode) { if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode) || ext4_test_inode_flag(inode, EXT4_INODE_EA_INODE)) return EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET; else if (ext4_should_journal_data(inode)) return EXT4_FREE_BLOCKS_FORGET; return 0; } /* * ext4_rereserve_cluster - increment the reserved cluster count when * freeing a cluster with a pending reservation * * @inode - file containing the cluster * @lblk - logical block in cluster to be reserved * * Increments the reserved cluster count and adjusts quota in a bigalloc * file system when freeing a partial cluster containing at least one * delayed and unwritten block. A partial cluster meeting that * requirement will have a pending reservation. If so, the * RERESERVE_CLUSTER flag is used when calling ext4_free_blocks() to * defer reserved and allocated space accounting to a subsequent call * to this function. */ static void ext4_rereserve_cluster(struct inode *inode, ext4_lblk_t lblk) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); struct ext4_inode_info *ei = EXT4_I(inode); dquot_reclaim_block(inode, EXT4_C2B(sbi, 1)); spin_lock(&ei->i_block_reservation_lock); ei->i_reserved_data_blocks++; percpu_counter_add(&sbi->s_dirtyclusters_counter, 1); spin_unlock(&ei->i_block_reservation_lock); percpu_counter_add(&sbi->s_freeclusters_counter, 1); ext4_remove_pending(inode, lblk); } static int ext4_remove_blocks(handle_t *handle, struct inode *inode, struct ext4_extent *ex, struct partial_cluster *partial, ext4_lblk_t from, ext4_lblk_t to) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); unsigned short ee_len = ext4_ext_get_actual_len(ex); ext4_fsblk_t last_pblk, pblk; ext4_lblk_t num; int flags; /* only extent tail removal is allowed */ if (from < le32_to_cpu(ex->ee_block) || to != le32_to_cpu(ex->ee_block) + ee_len - 1) { ext4_error(sbi->s_sb, "strange request: removal(2) %u-%u from %u:%u", from, to, le32_to_cpu(ex->ee_block), ee_len); return 0; } #ifdef EXTENTS_STATS spin_lock(&sbi->s_ext_stats_lock); sbi->s_ext_blocks += ee_len; sbi->s_ext_extents++; if (ee_len < sbi->s_ext_min) sbi->s_ext_min = ee_len; if (ee_len > sbi->s_ext_max) sbi->s_ext_max = ee_len; if (ext_depth(inode) > sbi->s_depth_max) sbi->s_depth_max = ext_depth(inode); spin_unlock(&sbi->s_ext_stats_lock); #endif trace_ext4_remove_blocks(inode, ex, from, to, partial); /* * if we have a partial cluster, and it's different from the * cluster of the last block in the extent, we free it */ last_pblk = ext4_ext_pblock(ex) + ee_len - 1; if (partial->state != initial && partial->pclu != EXT4_B2C(sbi, last_pblk)) { if (partial->state == tofree) { flags = get_default_free_blocks_flags(inode); if (ext4_is_pending(inode, partial->lblk)) flags |= EXT4_FREE_BLOCKS_RERESERVE_CLUSTER; ext4_free_blocks(handle, inode, NULL, EXT4_C2B(sbi, partial->pclu), sbi->s_cluster_ratio, flags); if (flags & EXT4_FREE_BLOCKS_RERESERVE_CLUSTER) ext4_rereserve_cluster(inode, partial->lblk); } partial->state = initial; } num = le32_to_cpu(ex->ee_block) + ee_len - from; pblk = ext4_ext_pblock(ex) + ee_len - num; /* * We free the partial cluster at the end of the extent (if any), * unless the cluster is used by another extent (partial_cluster * state is nofree). If a partial cluster exists here, it must be * shared with the last block in the extent. */ flags = get_default_free_blocks_flags(inode); /* partial, left end cluster aligned, right end unaligned */ if ((EXT4_LBLK_COFF(sbi, to) != sbi->s_cluster_ratio - 1) && (EXT4_LBLK_CMASK(sbi, to) >= from) && (partial->state != nofree)) { if (ext4_is_pending(inode, to)) flags |= EXT4_FREE_BLOCKS_RERESERVE_CLUSTER; ext4_free_blocks(handle, inode, NULL, EXT4_PBLK_CMASK(sbi, last_pblk), sbi->s_cluster_ratio, flags); if (flags & EXT4_FREE_BLOCKS_RERESERVE_CLUSTER) ext4_rereserve_cluster(inode, to); partial->state = initial; flags = get_default_free_blocks_flags(inode); } flags |= EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER; /* * For bigalloc file systems, we never free a partial cluster * at the beginning of the extent. Instead, we check to see if we * need to free it on a subsequent call to ext4_remove_blocks, * or at the end of ext4_ext_rm_leaf or ext4_ext_remove_space. */ flags |= EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER; ext4_free_blocks(handle, inode, NULL, pblk, num, flags); /* reset the partial cluster if we've freed past it */ if (partial->state != initial && partial->pclu != EXT4_B2C(sbi, pblk)) partial->state = initial; /* * If we've freed the entire extent but the beginning is not left * cluster aligned and is not marked as ineligible for freeing we * record the partial cluster at the beginning of the extent. It * wasn't freed by the preceding ext4_free_blocks() call, and we * need to look farther to the left to determine if it's to be freed * (not shared with another extent). Else, reset the partial * cluster - we're either done freeing or the beginning of the * extent is left cluster aligned. */ if (EXT4_LBLK_COFF(sbi, from) && num == ee_len) { if (partial->state == initial) { partial->pclu = EXT4_B2C(sbi, pblk); partial->lblk = from; partial->state = tofree; } } else { partial->state = initial; } return 0; } /* * ext4_ext_rm_leaf() Removes the extents associated with the * blocks appearing between "start" and "end". Both "start" * and "end" must appear in the same extent or EIO is returned. * * @handle: The journal handle * @inode: The files inode * @path: The path to the leaf * @partial_cluster: The cluster which we'll have to free if all extents * has been released from it. However, if this value is * negative, it's a cluster just to the right of the * punched region and it must not be freed. * @start: The first block to remove * @end: The last block to remove */ static int ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, struct ext4_ext_path *path, struct partial_cluster *partial, ext4_lblk_t start, ext4_lblk_t end) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); int err = 0, correct_index = 0; int depth = ext_depth(inode), credits, revoke_credits; struct ext4_extent_header *eh; ext4_lblk_t a, b; unsigned num; ext4_lblk_t ex_ee_block; unsigned short ex_ee_len; unsigned unwritten = 0; struct ext4_extent *ex; ext4_fsblk_t pblk; /* the header must be checked already in ext4_ext_remove_space() */ ext_debug(inode, "truncate since %u in leaf to %u\n", start, end); if (!path[depth].p_hdr) path[depth].p_hdr = ext_block_hdr(path[depth].p_bh); eh = path[depth].p_hdr; if (unlikely(path[depth].p_hdr == NULL)) { EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth); return -EFSCORRUPTED; } /* find where to start removing */ ex = path[depth].p_ext; if (!ex) ex = EXT_LAST_EXTENT(eh); ex_ee_block = le32_to_cpu(ex->ee_block); ex_ee_len = ext4_ext_get_actual_len(ex); trace_ext4_ext_rm_leaf(inode, start, ex, partial); while (ex >= EXT_FIRST_EXTENT(eh) && ex_ee_block + ex_ee_len > start) { if (ext4_ext_is_unwritten(ex)) unwritten = 1; else unwritten = 0; ext_debug(inode, "remove ext %u:[%d]%d\n", ex_ee_block, unwritten, ex_ee_len); path[depth].p_ext = ex; a = max(ex_ee_block, start); b = min(ex_ee_block + ex_ee_len - 1, end); ext_debug(inode, " border %u:%u\n", a, b); /* If this extent is beyond the end of the hole, skip it */ if (end < ex_ee_block) { /* * We're going to skip this extent and move to another, * so note that its first cluster is in use to avoid * freeing it when removing blocks. Eventually, the * right edge of the truncated/punched region will * be just to the left. */ if (sbi->s_cluster_ratio > 1) { pblk = ext4_ext_pblock(ex); partial->pclu = EXT4_B2C(sbi, pblk); partial->state = nofree; } ex--; ex_ee_block = le32_to_cpu(ex->ee_block); ex_ee_len = ext4_ext_get_actual_len(ex); continue; } else if (b != ex_ee_block + ex_ee_len - 1) { EXT4_ERROR_INODE(inode, "can not handle truncate %u:%u " "on extent %u:%u", start, end, ex_ee_block, ex_ee_block + ex_ee_len - 1); err = -EFSCORRUPTED; goto out; } else if (a != ex_ee_block) { /* remove tail of the extent */ num = a - ex_ee_block; } else { /* remove whole extent: excellent! */ num = 0; } /* * 3 for leaf, sb, and inode plus 2 (bmap and group * descriptor) for each block group; assume two block * groups plus ex_ee_len/blocks_per_block_group for * the worst case */ credits = 7 + 2*(ex_ee_len/EXT4_BLOCKS_PER_GROUP(inode->i_sb)); if (ex == EXT_FIRST_EXTENT(eh)) { correct_index = 1; credits += (ext_depth(inode)) + 1; } credits += EXT4_MAXQUOTAS_TRANS_BLOCKS(inode->i_sb); /* * We may end up freeing some index blocks and data from the * punched range. Note that partial clusters are accounted for * by ext4_free_data_revoke_credits(). */ revoke_credits = ext4_free_metadata_revoke_credits(inode->i_sb, ext_depth(inode)) + ext4_free_data_revoke_credits(inode, b - a + 1); err = ext4_datasem_ensure_credits(handle, inode, credits, credits, revoke_credits); if (err) { if (err > 0) err = -EAGAIN; goto out; } err = ext4_ext_get_access(handle, inode, path + depth); if (err) goto out; err = ext4_remove_blocks(handle, inode, ex, partial, a, b); if (err) goto out; if (num == 0) /* this extent is removed; mark slot entirely unused */ ext4_ext_store_pblock(ex, 0); ex->ee_len = cpu_to_le16(num); /* * Do not mark unwritten if all the blocks in the * extent have been removed. */ if (unwritten && num) ext4_ext_mark_unwritten(ex); /* * If the extent was completely released, * we need to remove it from the leaf */ if (num == 0) { if (end != EXT_MAX_BLOCKS - 1) { /* * For hole punching, we need to scoot all the * extents up when an extent is removed so that * we dont have blank extents in the middle */ memmove(ex, ex+1, (EXT_LAST_EXTENT(eh) - ex) * sizeof(struct ext4_extent)); /* Now get rid of the one at the end */ memset(EXT_LAST_EXTENT(eh), 0, sizeof(struct ext4_extent)); } le16_add_cpu(&eh->eh_entries, -1); } err = ext4_ext_dirty(handle, inode, path + depth); if (err) goto out; ext_debug(inode, "new extent: %u:%u:%llu\n", ex_ee_block, num, ext4_ext_pblock(ex)); ex--; ex_ee_block = le32_to_cpu(ex->ee_block); ex_ee_len = ext4_ext_get_actual_len(ex); } if (correct_index && eh->eh_entries) err = ext4_ext_correct_indexes(handle, inode, path); /* * If there's a partial cluster and at least one extent remains in * the leaf, free the partial cluster if it isn't shared with the * current extent. If it is shared with the current extent * we reset the partial cluster because we've reached the start of the * truncated/punched region and we're done removing blocks. */ if (partial->state == tofree && ex >= EXT_FIRST_EXTENT(eh)) { pblk = ext4_ext_pblock(ex) + ex_ee_len - 1; if (partial->pclu != EXT4_B2C(sbi, pblk)) { int flags = get_default_free_blocks_flags(inode); if (ext4_is_pending(inode, partial->lblk)) flags |= EXT4_FREE_BLOCKS_RERESERVE_CLUSTER; ext4_free_blocks(handle, inode, NULL, EXT4_C2B(sbi, partial->pclu), sbi->s_cluster_ratio, flags); if (flags & EXT4_FREE_BLOCKS_RERESERVE_CLUSTER) ext4_rereserve_cluster(inode, partial->lblk); } partial->state = initial; } /* if this leaf is free, then we should * remove it from index block above */ if (err == 0 && eh->eh_entries == 0 && path[depth].p_bh != NULL) err = ext4_ext_rm_idx(handle, inode, path, depth); out: return err; } /* * ext4_ext_more_to_rm: * returns 1 if current index has to be freed (even partial) */ static int ext4_ext_more_to_rm(struct ext4_ext_path *path) { BUG_ON(path->p_idx == NULL); if (path->p_idx < EXT_FIRST_INDEX(path->p_hdr)) return 0; /* * if truncate on deeper level happened, it wasn't partial, * so we have to consider current index for truncation */ if (le16_to_cpu(path->p_hdr->eh_entries) == path->p_block) return 0; return 1; } int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start, ext4_lblk_t end) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); int depth = ext_depth(inode); struct ext4_ext_path *path = NULL; struct partial_cluster partial; handle_t *handle; int i = 0, err = 0; partial.pclu = 0; partial.lblk = 0; partial.state = initial; ext_debug(inode, "truncate since %u to %u\n", start, end); /* probably first extent we're gonna free will be last in block */ handle = ext4_journal_start_with_revoke(inode, EXT4_HT_TRUNCATE, depth + 1, ext4_free_metadata_revoke_credits(inode->i_sb, depth)); if (IS_ERR(handle)) return PTR_ERR(handle); again: trace_ext4_ext_remove_space(inode, start, end, depth); /* * Check if we are removing extents inside the extent tree. If that * is the case, we are going to punch a hole inside the extent tree * so we have to check whether we need to split the extent covering * the last block to remove so we can easily remove the part of it * in ext4_ext_rm_leaf(). */ if (end < EXT_MAX_BLOCKS - 1) { struct ext4_extent *ex; ext4_lblk_t ee_block, ex_end, lblk; ext4_fsblk_t pblk; /* find extent for or closest extent to this block */ path = ext4_find_extent(inode, end, NULL, EXT4_EX_NOCACHE | EXT4_EX_NOFAIL); if (IS_ERR(path)) { ext4_journal_stop(handle); return PTR_ERR(path); } depth = ext_depth(inode); /* Leaf not may not exist only if inode has no blocks at all */ ex = path[depth].p_ext; if (!ex) { if (depth) { EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth); err = -EFSCORRUPTED; } goto out; } ee_block = le32_to_cpu(ex->ee_block); ex_end = ee_block + ext4_ext_get_actual_len(ex) - 1; /* * See if the last block is inside the extent, if so split * the extent at 'end' block so we can easily remove the * tail of the first part of the split extent in * ext4_ext_rm_leaf(). */ if (end >= ee_block && end < ex_end) { /* * If we're going to split the extent, note that * the cluster containing the block after 'end' is * in use to avoid freeing it when removing blocks. */ if (sbi->s_cluster_ratio > 1) { pblk = ext4_ext_pblock(ex) + end - ee_block + 1; partial.pclu = EXT4_B2C(sbi, pblk); partial.state = nofree; } /* * Split the extent in two so that 'end' is the last * block in the first new extent. Also we should not * fail removing space due to ENOSPC so try to use * reserved block if that happens. */ err = ext4_force_split_extent_at(handle, inode, &path, end + 1, 1); if (err < 0) goto out; } else if (sbi->s_cluster_ratio > 1 && end >= ex_end && partial.state == initial) { /* * If we're punching, there's an extent to the right. * If the partial cluster hasn't been set, set it to * that extent's first cluster and its state to nofree * so it won't be freed should it contain blocks to be * removed. If it's already set (tofree/nofree), we're * retrying and keep the original partial cluster info * so a cluster marked tofree as a result of earlier * extent removal is not lost. */ lblk = ex_end + 1; err = ext4_ext_search_right(inode, path, &lblk, &pblk, NULL); if (err < 0) goto out; if (pblk) { partial.pclu = EXT4_B2C(sbi, pblk); partial.state = nofree; } } } /* * We start scanning from right side, freeing all the blocks * after i_size and walking into the tree depth-wise. */ depth = ext_depth(inode); if (path) { int k = i = depth; while (--k > 0) path[k].p_block = le16_to_cpu(path[k].p_hdr->eh_entries)+1; } else { path = kcalloc(depth + 1, sizeof(struct ext4_ext_path), GFP_NOFS | __GFP_NOFAIL); if (path == NULL) { ext4_journal_stop(handle); return -ENOMEM; } path[0].p_maxdepth = path[0].p_depth = depth; path[0].p_hdr = ext_inode_hdr(inode); i = 0; if (ext4_ext_check(inode, path[0].p_hdr, depth, 0)) { err = -EFSCORRUPTED; goto out; } } err = 0; while (i >= 0 && err == 0) { if (i == depth) { /* this is leaf block */ err = ext4_ext_rm_leaf(handle, inode, path, &partial, start, end); /* root level has p_bh == NULL, brelse() eats this */ brelse(path[i].p_bh); path[i].p_bh = NULL; i--; continue; } /* this is index block */ if (!path[i].p_hdr) { ext_debug(inode, "initialize header\n"); path[i].p_hdr = ext_block_hdr(path[i].p_bh); } if (!path[i].p_idx) { /* this level hasn't been touched yet */ path[i].p_idx = EXT_LAST_INDEX(path[i].p_hdr); path[i].p_block = le16_to_cpu(path[i].p_hdr->eh_entries)+1; ext_debug(inode, "init index ptr: hdr 0x%p, num %d\n", path[i].p_hdr, le16_to_cpu(path[i].p_hdr->eh_entries)); } else { /* we were already here, see at next index */ path[i].p_idx--; } ext_debug(inode, "level %d - index, first 0x%p, cur 0x%p\n", i, EXT_FIRST_INDEX(path[i].p_hdr), path[i].p_idx); if (ext4_ext_more_to_rm(path + i)) { struct buffer_head *bh; /* go to the next level */ ext_debug(inode, "move to level %d (block %llu)\n", i + 1, ext4_idx_pblock(path[i].p_idx)); memset(path + i + 1, 0, sizeof(*path)); bh = read_extent_tree_block(inode, path[i].p_idx, depth - i - 1, EXT4_EX_NOCACHE); if (IS_ERR(bh)) { /* should we reset i_size? */ err = PTR_ERR(bh); break; } /* Yield here to deal with large extent trees. * Should be a no-op if we did IO above. */ cond_resched(); if (WARN_ON(i + 1 > depth)) { err = -EFSCORRUPTED; break; } path[i + 1].p_bh = bh; /* save actual number of indexes since this * number is changed at the next iteration */ path[i].p_block = le16_to_cpu(path[i].p_hdr->eh_entries); i++; } else { /* we finished processing this index, go up */ if (path[i].p_hdr->eh_entries == 0 && i > 0) { /* index is empty, remove it; * handle must be already prepared by the * truncatei_leaf() */ err = ext4_ext_rm_idx(handle, inode, path, i); } /* root level has p_bh == NULL, brelse() eats this */ brelse(path[i].p_bh); path[i].p_bh = NULL; i--; ext_debug(inode, "return to level %d\n", i); } } trace_ext4_ext_remove_space_done(inode, start, end, depth, &partial, path->p_hdr->eh_entries); /* * if there's a partial cluster and we have removed the first extent * in the file, then we also free the partial cluster, if any */ if (partial.state == tofree && err == 0) { int flags = get_default_free_blocks_flags(inode); if (ext4_is_pending(inode, partial.lblk)) flags |= EXT4_FREE_BLOCKS_RERESERVE_CLUSTER; ext4_free_blocks(handle, inode, NULL, EXT4_C2B(sbi, partial.pclu), sbi->s_cluster_ratio, flags); if (flags & EXT4_FREE_BLOCKS_RERESERVE_CLUSTER) ext4_rereserve_cluster(inode, partial.lblk); partial.state = initial; } /* TODO: flexible tree reduction should be here */ if (path->p_hdr->eh_entries == 0) { /* * truncate to zero freed all the tree, * so we need to correct eh_depth */ err = ext4_ext_get_access(handle, inode, path); if (err == 0) { ext_inode_hdr(inode)->eh_depth = 0; ext_inode_hdr(inode)->eh_max = cpu_to_le16(ext4_ext_space_root(inode, 0)); err = ext4_ext_dirty(handle, inode, path); } } out: ext4_free_ext_path(path); path = NULL; if (err == -EAGAIN) goto again; ext4_journal_stop(handle); return err; } /* * called at mount time */ void ext4_ext_init(struct super_block *sb) { /* * possible initialization would be here */ if (ext4_has_feature_extents(sb)) { #if defined(AGGRESSIVE_TEST) || defined(CHECK_BINSEARCH) || defined(EXTENTS_STATS) printk(KERN_INFO "EXT4-fs: file extents enabled" #ifdef AGGRESSIVE_TEST ", aggressive tests" #endif #ifdef CHECK_BINSEARCH ", check binsearch" #endif #ifdef EXTENTS_STATS ", stats" #endif "\n"); #endif #ifdef EXTENTS_STATS spin_lock_init(&EXT4_SB(sb)->s_ext_stats_lock); EXT4_SB(sb)->s_ext_min = 1 << 30; EXT4_SB(sb)->s_ext_max = 0; #endif } } /* * called at umount time */ void ext4_ext_release(struct super_block *sb) { if (!ext4_has_feature_extents(sb)) return; #ifdef EXTENTS_STATS if (EXT4_SB(sb)->s_ext_blocks && EXT4_SB(sb)->s_ext_extents) { struct ext4_sb_info *sbi = EXT4_SB(sb); printk(KERN_ERR "EXT4-fs: %lu blocks in %lu extents (%lu ave)\n", sbi->s_ext_blocks, sbi->s_ext_extents, sbi->s_ext_blocks / sbi->s_ext_extents); printk(KERN_ERR "EXT4-fs: extents: %lu min, %lu max, max depth %lu\n", sbi->s_ext_min, sbi->s_ext_max, sbi->s_depth_max); } #endif } static void ext4_zeroout_es(struct inode *inode, struct ext4_extent *ex) { ext4_lblk_t ee_block; ext4_fsblk_t ee_pblock; unsigned int ee_len; ee_block = le32_to_cpu(ex->ee_block); ee_len = ext4_ext_get_actual_len(ex); ee_pblock = ext4_ext_pblock(ex); if (ee_len == 0) return; ext4_es_insert_extent(inode, ee_block, ee_len, ee_pblock, EXTENT_STATUS_WRITTEN); } /* FIXME!! we need to try to merge to left or right after zero-out */ static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex) { ext4_fsblk_t ee_pblock; unsigned int ee_len; ee_len = ext4_ext_get_actual_len(ex); ee_pblock = ext4_ext_pblock(ex); return ext4_issue_zeroout(inode, le32_to_cpu(ex->ee_block), ee_pblock, ee_len); } /* * ext4_split_extent_at() splits an extent at given block. * * @handle: the journal handle * @inode: the file inode * @path: the path to the extent * @split: the logical block where the extent is splitted. * @split_flags: indicates if the extent could be zeroout if split fails, and * the states(init or unwritten) of new extents. * @flags: flags used to insert new extent to extent tree. * * * Splits extent [a, b] into two extents [a, @split) and [@split, b], states * of which are determined by split_flag. * * There are two cases: * a> the extent are splitted into two extent. * b> split is not needed, and just mark the extent. * * return 0 on success. */ static int ext4_split_extent_at(handle_t *handle, struct inode *inode, struct ext4_ext_path **ppath, ext4_lblk_t split, int split_flag, int flags) { struct ext4_ext_path *path = *ppath; ext4_fsblk_t newblock; ext4_lblk_t ee_block; struct ext4_extent *ex, newex, orig_ex, zero_ex; struct ext4_extent *ex2 = NULL; unsigned int ee_len, depth; int err = 0; BUG_ON((split_flag & (EXT4_EXT_DATA_VALID1 | EXT4_EXT_DATA_VALID2)) == (EXT4_EXT_DATA_VALID1 | EXT4_EXT_DATA_VALID2)); ext_debug(inode, "logical block %llu\n", (unsigned long long)split); ext4_ext_show_leaf(inode, path); depth = ext_depth(inode); ex = path[depth].p_ext; ee_block = le32_to_cpu(ex->ee_block); ee_len = ext4_ext_get_actual_len(ex); newblock = split - ee_block + ext4_ext_pblock(ex); BUG_ON(split < ee_block || split >= (ee_block + ee_len)); BUG_ON(!ext4_ext_is_unwritten(ex) && split_flag & (EXT4_EXT_MAY_ZEROOUT | EXT4_EXT_MARK_UNWRIT1 | EXT4_EXT_MARK_UNWRIT2)); err = ext4_ext_get_access(handle, inode, path + depth); if (err) goto out; if (split == ee_block) { /* * case b: block @split is the block that the extent begins with * then we just change the state of the extent, and splitting * is not needed. */ if (split_flag & EXT4_EXT_MARK_UNWRIT2) ext4_ext_mark_unwritten(ex); else ext4_ext_mark_initialized(ex); if (!(flags & EXT4_GET_BLOCKS_PRE_IO)) ext4_ext_try_to_merge(handle, inode, path, ex); err = ext4_ext_dirty(handle, inode, path + path->p_depth); goto out; } /* case a */ memcpy(&orig_ex, ex, sizeof(orig_ex)); ex->ee_len = cpu_to_le16(split - ee_block); if (split_flag & EXT4_EXT_MARK_UNWRIT1) ext4_ext_mark_unwritten(ex); /* * path may lead to new leaf, not to original leaf any more * after ext4_ext_insert_extent() returns, */ err = ext4_ext_dirty(handle, inode, path + depth); if (err) goto fix_extent_len; ex2 = &newex; ex2->ee_block = cpu_to_le32(split); ex2->ee_len = cpu_to_le16(ee_len - (split - ee_block)); ext4_ext_store_pblock(ex2, newblock); if (split_flag & EXT4_EXT_MARK_UNWRIT2) ext4_ext_mark_unwritten(ex2); err = ext4_ext_insert_extent(handle, inode, ppath, &newex, flags); if (err != -ENOSPC && err != -EDQUOT && err != -ENOMEM) goto out; if (EXT4_EXT_MAY_ZEROOUT & split_flag) { if (split_flag & (EXT4_EXT_DATA_VALID1|EXT4_EXT_DATA_VALID2)) { if (split_flag & EXT4_EXT_DATA_VALID1) { err = ext4_ext_zeroout(inode, ex2); zero_ex.ee_block = ex2->ee_block; zero_ex.ee_len = cpu_to_le16( ext4_ext_get_actual_len(ex2)); ext4_ext_store_pblock(&zero_ex, ext4_ext_pblock(ex2)); } else { err = ext4_ext_zeroout(inode, ex); zero_ex.ee_block = ex->ee_block; zero_ex.ee_len = cpu_to_le16( ext4_ext_get_actual_len(ex)); ext4_ext_store_pblock(&zero_ex, ext4_ext_pblock(ex)); } } else { err = ext4_ext_zeroout(inode, &orig_ex); zero_ex.ee_block = orig_ex.ee_block; zero_ex.ee_len = cpu_to_le16( ext4_ext_get_actual_len(&orig_ex)); ext4_ext_store_pblock(&zero_ex, ext4_ext_pblock(&orig_ex)); } if (!err) { /* update the extent length and mark as initialized */ ex->ee_len = cpu_to_le16(ee_len); ext4_ext_try_to_merge(handle, inode, path, ex); err = ext4_ext_dirty(handle, inode, path + path->p_depth); if (!err) /* update extent status tree */ ext4_zeroout_es(inode, &zero_ex); /* If we failed at this point, we don't know in which * state the extent tree exactly is so don't try to fix * length of the original extent as it may do even more * damage. */ goto out; } } fix_extent_len: ex->ee_len = orig_ex.ee_len; /* * Ignore ext4_ext_dirty return value since we are already in error path * and err is a non-zero error code. */ ext4_ext_dirty(handle, inode, path + path->p_depth); return err; out: ext4_ext_show_leaf(inode, path); return err; } /* * ext4_split_extents() splits an extent and mark extent which is covered * by @map as split_flags indicates * * It may result in splitting the extent into multiple extents (up to three) * There are three possibilities: * a> There is no split required * b> Splits in two extents: Split is happening at either end of the extent * c> Splits in three extents: Somone is splitting in middle of the extent * */ static int ext4_split_extent(handle_t *handle, struct inode *inode, struct ext4_ext_path **ppath, struct ext4_map_blocks *map, int split_flag, int flags) { struct ext4_ext_path *path = *ppath; ext4_lblk_t ee_block; struct ext4_extent *ex; unsigned int ee_len, depth; int err = 0; int unwritten; int split_flag1, flags1; int allocated = map->m_len; depth = ext_depth(inode); ex = path[depth].p_ext; ee_block = le32_to_cpu(ex->ee_block); ee_len = ext4_ext_get_actual_len(ex); unwritten = ext4_ext_is_unwritten(ex); if (map->m_lblk + map->m_len < ee_block + ee_len) { split_flag1 = split_flag & EXT4_EXT_MAY_ZEROOUT; flags1 = flags | EXT4_GET_BLOCKS_PRE_IO; if (unwritten) split_flag1 |= EXT4_EXT_MARK_UNWRIT1 | EXT4_EXT_MARK_UNWRIT2; if (split_flag & EXT4_EXT_DATA_VALID2) split_flag1 |= EXT4_EXT_DATA_VALID1; err = ext4_split_extent_at(handle, inode, ppath, map->m_lblk + map->m_len, split_flag1, flags1); if (err) goto out; } else { allocated = ee_len - (map->m_lblk - ee_block); } /* * Update path is required because previous ext4_split_extent_at() may * result in split of original leaf or extent zeroout. */ path = ext4_find_extent(inode, map->m_lblk, ppath, flags); if (IS_ERR(path)) return PTR_ERR(path); depth = ext_depth(inode); ex = path[depth].p_ext; if (!ex) { EXT4_ERROR_INODE(inode, "unexpected hole at %lu", (unsigned long) map->m_lblk); return -EFSCORRUPTED; } unwritten = ext4_ext_is_unwritten(ex); if (map->m_lblk >= ee_block) { split_flag1 = split_flag & EXT4_EXT_DATA_VALID2; if (unwritten) { split_flag1 |= EXT4_EXT_MARK_UNWRIT1; split_flag1 |= split_flag & (EXT4_EXT_MAY_ZEROOUT | EXT4_EXT_MARK_UNWRIT2); } err = ext4_split_extent_at(handle, inode, ppath, map->m_lblk, split_flag1, flags); if (err) goto out; } ext4_ext_show_leaf(inode, path); out: return err ? err : allocated; } /* * This function is called by ext4_ext_map_blocks() if someone tries to write * to an unwritten extent. It may result in splitting the unwritten * extent into multiple extents (up to three - one initialized and two * unwritten). * There are three possibilities: * a> There is no split required: Entire extent should be initialized * b> Splits in two extents: Write is happening at either end of the extent * c> Splits in three extents: Somone is writing in middle of the extent * * Pre-conditions: * - The extent pointed to by 'path' is unwritten. * - The extent pointed to by 'path' contains a superset * of the logical span [map->m_lblk, map->m_lblk + map->m_len). * * Post-conditions on success: * - the returned value is the number of blocks beyond map->l_lblk * that are allocated and initialized. * It is guaranteed to be >= map->m_len. */ static int ext4_ext_convert_to_initialized(handle_t *handle, struct inode *inode, struct ext4_map_blocks *map, struct ext4_ext_path **ppath, int flags) { struct ext4_ext_path *path = *ppath; struct ext4_sb_info *sbi; struct ext4_extent_header *eh; struct ext4_map_blocks split_map; struct ext4_extent zero_ex1, zero_ex2; struct ext4_extent *ex, *abut_ex; ext4_lblk_t ee_block, eof_block; unsigned int ee_len, depth, map_len = map->m_len; int allocated = 0, max_zeroout = 0; int err = 0; int split_flag = EXT4_EXT_DATA_VALID2; ext_debug(inode, "logical block %llu, max_blocks %u\n", (unsigned long long)map->m_lblk, map_len); sbi = EXT4_SB(inode->i_sb); eof_block = (EXT4_I(inode)->i_disksize + inode->i_sb->s_blocksize - 1) >> inode->i_sb->s_blocksize_bits; if (eof_block < map->m_lblk + map_len) eof_block = map->m_lblk + map_len; depth = ext_depth(inode); eh = path[depth].p_hdr; ex = path[depth].p_ext; ee_block = le32_to_cpu(ex->ee_block); ee_len = ext4_ext_get_actual_len(ex); zero_ex1.ee_len = 0; zero_ex2.ee_len = 0; trace_ext4_ext_convert_to_initialized_enter(inode, map, ex); /* Pre-conditions */ BUG_ON(!ext4_ext_is_unwritten(ex)); BUG_ON(!in_range(map->m_lblk, ee_block, ee_len)); /* * Attempt to transfer newly initialized blocks from the currently * unwritten extent to its neighbor. This is much cheaper * than an insertion followed by a merge as those involve costly * memmove() calls. Transferring to the left is the common case in * steady state for workloads doing fallocate(FALLOC_FL_KEEP_SIZE) * followed by append writes. * * Limitations of the current logic: * - L1: we do not deal with writes covering the whole extent. * This would require removing the extent if the transfer * is possible. * - L2: we only attempt to merge with an extent stored in the * same extent tree node. */ if ((map->m_lblk == ee_block) && /* See if we can merge left */ (map_len < ee_len) && /*L1*/ (ex > EXT_FIRST_EXTENT(eh))) { /*L2*/ ext4_lblk_t prev_lblk; ext4_fsblk_t prev_pblk, ee_pblk; unsigned int prev_len; abut_ex = ex - 1; prev_lblk = le32_to_cpu(abut_ex->ee_block); prev_len = ext4_ext_get_actual_len(abut_ex); prev_pblk = ext4_ext_pblock(abut_ex); ee_pblk = ext4_ext_pblock(ex); /* * A transfer of blocks from 'ex' to 'abut_ex' is allowed * upon those conditions: * - C1: abut_ex is initialized, * - C2: abut_ex is logically abutting ex, * - C3: abut_ex is physically abutting ex, * - C4: abut_ex can receive the additional blocks without * overflowing the (initialized) length limit. */ if ((!ext4_ext_is_unwritten(abut_ex)) && /*C1*/ ((prev_lblk + prev_len) == ee_block) && /*C2*/ ((prev_pblk + prev_len) == ee_pblk) && /*C3*/ (prev_len < (EXT_INIT_MAX_LEN - map_len))) { /*C4*/ err = ext4_ext_get_access(handle, inode, path + depth); if (err) goto out; trace_ext4_ext_convert_to_initialized_fastpath(inode, map, ex, abut_ex); /* Shift the start of ex by 'map_len' blocks */ ex->ee_block = cpu_to_le32(ee_block + map_len); ext4_ext_store_pblock(ex, ee_pblk + map_len); ex->ee_len = cpu_to_le16(ee_len - map_len); ext4_ext_mark_unwritten(ex); /* Restore the flag */ /* Extend abut_ex by 'map_len' blocks */ abut_ex->ee_len = cpu_to_le16(prev_len + map_len); /* Result: number of initialized blocks past m_lblk */ allocated = map_len; } } else if (((map->m_lblk + map_len) == (ee_block + ee_len)) && (map_len < ee_len) && /*L1*/ ex < EXT_LAST_EXTENT(eh)) { /*L2*/ /* See if we can merge right */ ext4_lblk_t next_lblk; ext4_fsblk_t next_pblk, ee_pblk; unsigned int next_len; abut_ex = ex + 1; next_lblk = le32_to_cpu(abut_ex->ee_block); next_len = ext4_ext_get_actual_len(abut_ex); next_pblk = ext4_ext_pblock(abut_ex); ee_pblk = ext4_ext_pblock(ex); /* * A transfer of blocks from 'ex' to 'abut_ex' is allowed * upon those conditions: * - C1: abut_ex is initialized, * - C2: abut_ex is logically abutting ex, * - C3: abut_ex is physically abutting ex, * - C4: abut_ex can receive the additional blocks without * overflowing the (initialized) length limit. */ if ((!ext4_ext_is_unwritten(abut_ex)) && /*C1*/ ((map->m_lblk + map_len) == next_lblk) && /*C2*/ ((ee_pblk + ee_len) == next_pblk) && /*C3*/ (next_len < (EXT_INIT_MAX_LEN - map_len))) { /*C4*/ err = ext4_ext_get_access(handle, inode, path + depth); if (err) goto out; trace_ext4_ext_convert_to_initialized_fastpath(inode, map, ex, abut_ex); /* Shift the start of abut_ex by 'map_len' blocks */ abut_ex->ee_block = cpu_to_le32(next_lblk - map_len); ext4_ext_store_pblock(abut_ex, next_pblk - map_len); ex->ee_len = cpu_to_le16(ee_len - map_len); ext4_ext_mark_unwritten(ex); /* Restore the flag */ /* Extend abut_ex by 'map_len' blocks */ abut_ex->ee_len = cpu_to_le16(next_len + map_len); /* Result: number of initialized blocks past m_lblk */ allocated = map_len; } } if (allocated) { /* Mark the block containing both extents as dirty */ err = ext4_ext_dirty(handle, inode, path + depth); /* Update path to point to the right extent */ path[depth].p_ext = abut_ex; goto out; } else allocated = ee_len - (map->m_lblk - ee_block); WARN_ON(map->m_lblk < ee_block); /* * It is safe to convert extent to initialized via explicit * zeroout only if extent is fully inside i_size or new_size. */ split_flag |= ee_block + ee_len <= eof_block ? EXT4_EXT_MAY_ZEROOUT : 0; if (EXT4_EXT_MAY_ZEROOUT & split_flag) max_zeroout = sbi->s_extent_max_zeroout_kb >> (inode->i_sb->s_blocksize_bits - 10); /* * five cases: * 1. split the extent into three extents. * 2. split the extent into two extents, zeroout the head of the first * extent. * 3. split the extent into two extents, zeroout the tail of the second * extent. * 4. split the extent into two extents with out zeroout. * 5. no splitting needed, just possibly zeroout the head and / or the * tail of the extent. */ split_map.m_lblk = map->m_lblk; split_map.m_len = map->m_len; if (max_zeroout && (allocated > split_map.m_len)) { if (allocated <= max_zeroout) { /* case 3 or 5 */ zero_ex1.ee_block = cpu_to_le32(split_map.m_lblk + split_map.m_len); zero_ex1.ee_len = cpu_to_le16(allocated - split_map.m_len); ext4_ext_store_pblock(&zero_ex1, ext4_ext_pblock(ex) + split_map.m_lblk + split_map.m_len - ee_block); err = ext4_ext_zeroout(inode, &zero_ex1); if (err) goto fallback; split_map.m_len = allocated; } if (split_map.m_lblk - ee_block + split_map.m_len < max_zeroout) { /* case 2 or 5 */ if (split_map.m_lblk != ee_block) { zero_ex2.ee_block = ex->ee_block; zero_ex2.ee_len = cpu_to_le16(split_map.m_lblk - ee_block); ext4_ext_store_pblock(&zero_ex2, ext4_ext_pblock(ex)); err = ext4_ext_zeroout(inode, &zero_ex2); if (err) goto fallback; } split_map.m_len += split_map.m_lblk - ee_block; split_map.m_lblk = ee_block; allocated = map->m_len; } } fallback: err = ext4_split_extent(handle, inode, ppath, &split_map, split_flag, flags); if (err > 0) err = 0; out: /* If we have gotten a failure, don't zero out status tree */ if (!err) { ext4_zeroout_es(inode, &zero_ex1); ext4_zeroout_es(inode, &zero_ex2); } return err ? err : allocated; } /* * This function is called by ext4_ext_map_blocks() from * ext4_get_blocks_dio_write() when DIO to write * to an unwritten extent. * * Writing to an unwritten extent may result in splitting the unwritten * extent into multiple initialized/unwritten extents (up to three) * There are three possibilities: * a> There is no split required: Entire extent should be unwritten * b> Splits in two extents: Write is happening at either end of the extent * c> Splits in three extents: Somone is writing in middle of the extent * * This works the same way in the case of initialized -> unwritten conversion. * * One of more index blocks maybe needed if the extent tree grow after * the unwritten extent split. To prevent ENOSPC occur at the IO * complete, we need to split the unwritten extent before DIO submit * the IO. The unwritten extent called at this time will be split * into three unwritten extent(at most). After IO complete, the part * being filled will be convert to initialized by the end_io callback function * via ext4_convert_unwritten_extents(). * * Returns the size of unwritten extent to be written on success. */ static int ext4_split_convert_extents(handle_t *handle, struct inode *inode, struct ext4_map_blocks *map, struct ext4_ext_path **ppath, int flags) { struct ext4_ext_path *path = *ppath; ext4_lblk_t eof_block; ext4_lblk_t ee_block; struct ext4_extent *ex; unsigned int ee_len; int split_flag = 0, depth; ext_debug(inode, "logical block %llu, max_blocks %u\n", (unsigned long long)map->m_lblk, map->m_len); eof_block = (EXT4_I(inode)->i_disksize + inode->i_sb->s_blocksize - 1) >> inode->i_sb->s_blocksize_bits; if (eof_block < map->m_lblk + map->m_len) eof_block = map->m_lblk + map->m_len; /* * It is safe to convert extent to initialized via explicit * zeroout only if extent is fully inside i_size or new_size. */ depth = ext_depth(inode); ex = path[depth].p_ext; ee_block = le32_to_cpu(ex->ee_block); ee_len = ext4_ext_get_actual_len(ex); /* Convert to unwritten */ if (flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN) { split_flag |= EXT4_EXT_DATA_VALID1; /* Convert to initialized */ } else if (flags & EXT4_GET_BLOCKS_CONVERT) { split_flag |= ee_block + ee_len <= eof_block ? EXT4_EXT_MAY_ZEROOUT : 0; split_flag |= (EXT4_EXT_MARK_UNWRIT2 | EXT4_EXT_DATA_VALID2); } flags |= EXT4_GET_BLOCKS_PRE_IO; return ext4_split_extent(handle, inode, ppath, map, split_flag, flags); } static int ext4_convert_unwritten_extents_endio(handle_t *handle, struct inode *inode, struct ext4_map_blocks *map, struct ext4_ext_path **ppath) { struct ext4_ext_path *path = *ppath; struct ext4_extent *ex; ext4_lblk_t ee_block; unsigned int ee_len; int depth; int err = 0; depth = ext_depth(inode); ex = path[depth].p_ext; ee_block = le32_to_cpu(ex->ee_block); ee_len = ext4_ext_get_actual_len(ex); ext_debug(inode, "logical block %llu, max_blocks %u\n", (unsigned long long)ee_block, ee_len); /* If extent is larger than requested it is a clear sign that we still * have some extent state machine issues left. So extent_split is still * required. * TODO: Once all related issues will be fixed this situation should be * illegal. */ if (ee_block != map->m_lblk || ee_len > map->m_len) { #ifdef CONFIG_EXT4_DEBUG ext4_warning(inode->i_sb, "Inode (%ld) finished: extent logical block %llu," " len %u; IO logical block %llu, len %u", inode->i_ino, (unsigned long long)ee_block, ee_len, (unsigned long long)map->m_lblk, map->m_len); #endif err = ext4_split_convert_extents(handle, inode, map, ppath, EXT4_GET_BLOCKS_CONVERT); if (err < 0) return err; path = ext4_find_extent(inode, map->m_lblk, ppath, 0); if (IS_ERR(path)) return PTR_ERR(path); depth = ext_depth(inode); ex = path[depth].p_ext; } err = ext4_ext_get_access(handle, inode, path + depth); if (err) goto out; /* first mark the extent as initialized */ ext4_ext_mark_initialized(ex); /* note: ext4_ext_correct_indexes() isn't needed here because * borders are not changed */ ext4_ext_try_to_merge(handle, inode, path, ex); /* Mark modified extent as dirty */ err = ext4_ext_dirty(handle, inode, path + path->p_depth); out: ext4_ext_show_leaf(inode, path); return err; } static int convert_initialized_extent(handle_t *handle, struct inode *inode, struct ext4_map_blocks *map, struct ext4_ext_path **ppath, unsigned int *allocated) { struct ext4_ext_path *path = *ppath; struct ext4_extent *ex; ext4_lblk_t ee_block; unsigned int ee_len; int depth; int err = 0; /* * Make sure that the extent is no bigger than we support with * unwritten extent */ if (map->m_len > EXT_UNWRITTEN_MAX_LEN) map->m_len = EXT_UNWRITTEN_MAX_LEN / 2; depth = ext_depth(inode); ex = path[depth].p_ext; ee_block = le32_to_cpu(ex->ee_block); ee_len = ext4_ext_get_actual_len(ex); ext_debug(inode, "logical block %llu, max_blocks %u\n", (unsigned long long)ee_block, ee_len); if (ee_block != map->m_lblk || ee_len > map->m_len) { err = ext4_split_convert_extents(handle, inode, map, ppath, EXT4_GET_BLOCKS_CONVERT_UNWRITTEN); if (err < 0) return err; path = ext4_find_extent(inode, map->m_lblk, ppath, 0); if (IS_ERR(path)) return PTR_ERR(path); depth = ext_depth(inode); ex = path[depth].p_ext; if (!ex) { EXT4_ERROR_INODE(inode, "unexpected hole at %lu", (unsigned long) map->m_lblk); return -EFSCORRUPTED; } } err = ext4_ext_get_access(handle, inode, path + depth); if (err) return err; /* first mark the extent as unwritten */ ext4_ext_mark_unwritten(ex); /* note: ext4_ext_correct_indexes() isn't needed here because * borders are not changed */ ext4_ext_try_to_merge(handle, inode, path, ex); /* Mark modified extent as dirty */ err = ext4_ext_dirty(handle, inode, path + path->p_depth); if (err) return err; ext4_ext_show_leaf(inode, path); ext4_update_inode_fsync_trans(handle, inode, 1); map->m_flags |= EXT4_MAP_UNWRITTEN; if (*allocated > map->m_len) *allocated = map->m_len; map->m_len = *allocated; return 0; } static int ext4_ext_handle_unwritten_extents(handle_t *handle, struct inode *inode, struct ext4_map_blocks *map, struct ext4_ext_path **ppath, int flags, unsigned int allocated, ext4_fsblk_t newblock) { struct ext4_ext_path __maybe_unused *path = *ppath; int ret = 0; int err = 0; ext_debug(inode, "logical block %llu, max_blocks %u, flags 0x%x, allocated %u\n", (unsigned long long)map->m_lblk, map->m_len, flags, allocated); ext4_ext_show_leaf(inode, path); /* * When writing into unwritten space, we should not fail to * allocate metadata blocks for the new extent block if needed. */ flags |= EXT4_GET_BLOCKS_METADATA_NOFAIL; trace_ext4_ext_handle_unwritten_extents(inode, map, flags, allocated, newblock); /* get_block() before submitting IO, split the extent */ if (flags & EXT4_GET_BLOCKS_PRE_IO) { ret = ext4_split_convert_extents(handle, inode, map, ppath, flags | EXT4_GET_BLOCKS_CONVERT); if (ret < 0) { err = ret; goto out2; } /* * shouldn't get a 0 return when splitting an extent unless * m_len is 0 (bug) or extent has been corrupted */ if (unlikely(ret == 0)) { EXT4_ERROR_INODE(inode, "unexpected ret == 0, m_len = %u", map->m_len); err = -EFSCORRUPTED; goto out2; } map->m_flags |= EXT4_MAP_UNWRITTEN; goto out; } /* IO end_io complete, convert the filled extent to written */ if (flags & EXT4_GET_BLOCKS_CONVERT) { err = ext4_convert_unwritten_extents_endio(handle, inode, map, ppath); if (err < 0) goto out2; ext4_update_inode_fsync_trans(handle, inode, 1); goto map_out; } /* buffered IO cases */ /* * repeat fallocate creation request * we already have an unwritten extent */ if (flags & EXT4_GET_BLOCKS_UNWRIT_EXT) { map->m_flags |= EXT4_MAP_UNWRITTEN; goto map_out; } /* buffered READ or buffered write_begin() lookup */ if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) { /* * We have blocks reserved already. We * return allocated blocks so that delalloc * won't do block reservation for us. But * the buffer head will be unmapped so that * a read from the block returns 0s. */ map->m_flags |= EXT4_MAP_UNWRITTEN; goto out1; } /* * Default case when (flags & EXT4_GET_BLOCKS_CREATE) == 1. * For buffered writes, at writepage time, etc. Convert a * discovered unwritten extent to written. */ ret = ext4_ext_convert_to_initialized(handle, inode, map, ppath, flags); if (ret < 0) { err = ret; goto out2; } ext4_update_inode_fsync_trans(handle, inode, 1); /* * shouldn't get a 0 return when converting an unwritten extent * unless m_len is 0 (bug) or extent has been corrupted */ if (unlikely(ret == 0)) { EXT4_ERROR_INODE(inode, "unexpected ret == 0, m_len = %u", map->m_len); err = -EFSCORRUPTED; goto out2; } out: allocated = ret; map->m_flags |= EXT4_MAP_NEW; map_out: map->m_flags |= EXT4_MAP_MAPPED; out1: map->m_pblk = newblock; if (allocated > map->m_len) allocated = map->m_len; map->m_len = allocated; ext4_ext_show_leaf(inode, path); out2: return err ? err : allocated; } /* * get_implied_cluster_alloc - check to see if the requested * allocation (in the map structure) overlaps with a cluster already * allocated in an extent. * @sb The filesystem superblock structure * @map The requested lblk->pblk mapping * @ex The extent structure which might contain an implied * cluster allocation * * This function is called by ext4_ext_map_blocks() after we failed to * find blocks that were already in the inode's extent tree. Hence, * we know that the beginning of the requested region cannot overlap * the extent from the inode's extent tree. There are three cases we * want to catch. The first is this case: * * |--- cluster # N--| * |--- extent ---| |---- requested region ---| * |==========| * * The second case that we need to test for is this one: * * |--------- cluster # N ----------------| * |--- requested region --| |------- extent ----| * |=======================| * * The third case is when the requested region lies between two extents * within the same cluster: * |------------- cluster # N-------------| * |----- ex -----| |---- ex_right ----| * |------ requested region ------| * |================| * * In each of the above cases, we need to set the map->m_pblk and * map->m_len so it corresponds to the return the extent labelled as * "|====|" from cluster #N, since it is already in use for data in * cluster EXT4_B2C(sbi, map->m_lblk). We will then return 1 to * signal to ext4_ext_map_blocks() that map->m_pblk should be treated * as a new "allocated" block region. Otherwise, we will return 0 and * ext4_ext_map_blocks() will then allocate one or more new clusters * by calling ext4_mb_new_blocks(). */ static int get_implied_cluster_alloc(struct super_block *sb, struct ext4_map_blocks *map, struct ext4_extent *ex, struct ext4_ext_path *path) { struct ext4_sb_info *sbi = EXT4_SB(sb); ext4_lblk_t c_offset = EXT4_LBLK_COFF(sbi, map->m_lblk); ext4_lblk_t ex_cluster_start, ex_cluster_end; ext4_lblk_t rr_cluster_start; ext4_lblk_t ee_block = le32_to_cpu(ex->ee_block); ext4_fsblk_t ee_start = ext4_ext_pblock(ex); unsigned short ee_len = ext4_ext_get_actual_len(ex); /* The extent passed in that we are trying to match */ ex_cluster_start = EXT4_B2C(sbi, ee_block); ex_cluster_end = EXT4_B2C(sbi, ee_block + ee_len - 1); /* The requested region passed into ext4_map_blocks() */ rr_cluster_start = EXT4_B2C(sbi, map->m_lblk); if ((rr_cluster_start == ex_cluster_end) || (rr_cluster_start == ex_cluster_start)) { if (rr_cluster_start == ex_cluster_end) ee_start += ee_len - 1; map->m_pblk = EXT4_PBLK_CMASK(sbi, ee_start) + c_offset; map->m_len = min(map->m_len, (unsigned) sbi->s_cluster_ratio - c_offset); /* * Check for and handle this case: * * |--------- cluster # N-------------| * |------- extent ----| * |--- requested region ---| * |===========| */ if (map->m_lblk < ee_block) map->m_len = min(map->m_len, ee_block - map->m_lblk); /* * Check for the case where there is already another allocated * block to the right of 'ex' but before the end of the cluster. * * |------------- cluster # N-------------| * |----- ex -----| |---- ex_right ----| * |------ requested region ------| * |================| */ if (map->m_lblk > ee_block) { ext4_lblk_t next = ext4_ext_next_allocated_block(path); map->m_len = min(map->m_len, next - map->m_lblk); } trace_ext4_get_implied_cluster_alloc_exit(sb, map, 1); return 1; } trace_ext4_get_implied_cluster_alloc_exit(sb, map, 0); return 0; } /* * Determine hole length around the given logical block, first try to * locate and expand the hole from the given @path, and then adjust it * if it's partially or completely converted to delayed extents, insert * it into the extent cache tree if it's indeed a hole, finally return * the length of the determined extent. */ static ext4_lblk_t ext4_ext_determine_insert_hole(struct inode *inode, struct ext4_ext_path *path, ext4_lblk_t lblk) { ext4_lblk_t hole_start, len; struct extent_status es; hole_start = lblk; len = ext4_ext_find_hole(inode, path, &hole_start); again: ext4_es_find_extent_range(inode, &ext4_es_is_delayed, hole_start, hole_start + len - 1, &es); if (!es.es_len) goto insert_hole; /* * There's a delalloc extent in the hole, handle it if the delalloc * extent is in front of, behind and straddle the queried range. */ if (lblk >= es.es_lblk + es.es_len) { /* * The delalloc extent is in front of the queried range, * find again from the queried start block. */ len -= lblk - hole_start; hole_start = lblk; goto again; } else if (in_range(lblk, es.es_lblk, es.es_len)) { /* * The delalloc extent containing lblk, it must have been * added after ext4_map_blocks() checked the extent status * tree so we are not holding i_rwsem and delalloc info is * only stabilized by i_data_sem we are going to release * soon. Don't modify the extent status tree and report * extent as a hole, just adjust the length to the delalloc * extent's after lblk. */ len = es.es_lblk + es.es_len - lblk; return len; } else { /* * The delalloc extent is partially or completely behind * the queried range, update hole length until the * beginning of the delalloc extent. */ len = min(es.es_lblk - hole_start, len); } insert_hole: /* Put just found gap into cache to speed up subsequent requests */ ext_debug(inode, " -> %u:%u\n", hole_start, len); ext4_es_insert_extent(inode, hole_start, len, ~0, EXTENT_STATUS_HOLE); /* Update hole_len to reflect hole size after lblk */ if (hole_start != lblk) len -= lblk - hole_start; return len; } /* * Block allocation/map/preallocation routine for extents based files * * * Need to be called with * down_read(&EXT4_I(inode)->i_data_sem) if not allocating file system block * (ie, create is zero). Otherwise down_write(&EXT4_I(inode)->i_data_sem) * * return > 0, number of blocks already mapped/allocated * if create == 0 and these are pre-allocated blocks * buffer head is unmapped * otherwise blocks are mapped * * return = 0, if plain look up failed (blocks have not been allocated) * buffer head is unmapped * * return < 0, error case. */ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, struct ext4_map_blocks *map, int flags) { struct ext4_ext_path *path = NULL; struct ext4_extent newex, *ex, ex2; struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); ext4_fsblk_t newblock = 0, pblk; int err = 0, depth, ret; unsigned int allocated = 0, offset = 0; unsigned int allocated_clusters = 0; struct ext4_allocation_request ar; ext4_lblk_t cluster_offset; ext_debug(inode, "blocks %u/%u requested\n", map->m_lblk, map->m_len); trace_ext4_ext_map_blocks_enter(inode, map->m_lblk, map->m_len, flags); /* find extent for this block */ path = ext4_find_extent(inode, map->m_lblk, NULL, 0); if (IS_ERR(path)) { err = PTR_ERR(path); path = NULL; goto out; } depth = ext_depth(inode); /* * consistent leaf must not be empty; * this situation is possible, though, _during_ tree modification; * this is why assert can't be put in ext4_find_extent() */ if (unlikely(path[depth].p_ext == NULL && depth != 0)) { EXT4_ERROR_INODE(inode, "bad extent address " "lblock: %lu, depth: %d pblock %lld", (unsigned long) map->m_lblk, depth, path[depth].p_block); err = -EFSCORRUPTED; goto out; } ex = path[depth].p_ext; if (ex) { ext4_lblk_t ee_block = le32_to_cpu(ex->ee_block); ext4_fsblk_t ee_start = ext4_ext_pblock(ex); unsigned short ee_len; /* * unwritten extents are treated as holes, except that * we split out initialized portions during a write. */ ee_len = ext4_ext_get_actual_len(ex); trace_ext4_ext_show_extent(inode, ee_block, ee_start, ee_len); /* if found extent covers block, simply return it */ if (in_range(map->m_lblk, ee_block, ee_len)) { newblock = map->m_lblk - ee_block + ee_start; /* number of remaining blocks in the extent */ allocated = ee_len - (map->m_lblk - ee_block); ext_debug(inode, "%u fit into %u:%d -> %llu\n", map->m_lblk, ee_block, ee_len, newblock); /* * If the extent is initialized check whether the * caller wants to convert it to unwritten. */ if ((!ext4_ext_is_unwritten(ex)) && (flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN)) { err = convert_initialized_extent(handle, inode, map, &path, &allocated); goto out; } else if (!ext4_ext_is_unwritten(ex)) { map->m_flags |= EXT4_MAP_MAPPED; map->m_pblk = newblock; if (allocated > map->m_len) allocated = map->m_len; map->m_len = allocated; ext4_ext_show_leaf(inode, path); goto out; } ret = ext4_ext_handle_unwritten_extents( handle, inode, map, &path, flags, allocated, newblock); if (ret < 0) err = ret; else allocated = ret; goto out; } } /* * requested block isn't allocated yet; * we couldn't try to create block if create flag is zero */ if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) { ext4_lblk_t len; len = ext4_ext_determine_insert_hole(inode, path, map->m_lblk); map->m_pblk = 0; map->m_len = min_t(unsigned int, map->m_len, len); goto out; } /* * Okay, we need to do block allocation. */ newex.ee_block = cpu_to_le32(map->m_lblk); cluster_offset = EXT4_LBLK_COFF(sbi, map->m_lblk); /* * If we are doing bigalloc, check to see if the extent returned * by ext4_find_extent() implies a cluster we can use. */ if (cluster_offset && ex && get_implied_cluster_alloc(inode->i_sb, map, ex, path)) { ar.len = allocated = map->m_len; newblock = map->m_pblk; goto got_allocated_blocks; } /* find neighbour allocated blocks */ ar.lleft = map->m_lblk; err = ext4_ext_search_left(inode, path, &ar.lleft, &ar.pleft); if (err) goto out; ar.lright = map->m_lblk; err = ext4_ext_search_right(inode, path, &ar.lright, &ar.pright, &ex2); if (err < 0) goto out; /* Check if the extent after searching to the right implies a * cluster we can use. */ if ((sbi->s_cluster_ratio > 1) && err && get_implied_cluster_alloc(inode->i_sb, map, &ex2, path)) { ar.len = allocated = map->m_len; newblock = map->m_pblk; goto got_allocated_blocks; } /* * See if request is beyond maximum number of blocks we can have in * a single extent. For an initialized extent this limit is * EXT_INIT_MAX_LEN and for an unwritten extent this limit is * EXT_UNWRITTEN_MAX_LEN. */ if (map->m_len > EXT_INIT_MAX_LEN && !(flags & EXT4_GET_BLOCKS_UNWRIT_EXT)) map->m_len = EXT_INIT_MAX_LEN; else if (map->m_len > EXT_UNWRITTEN_MAX_LEN && (flags & EXT4_GET_BLOCKS_UNWRIT_EXT)) map->m_len = EXT_UNWRITTEN_MAX_LEN; /* Check if we can really insert (m_lblk)::(m_lblk + m_len) extent */ newex.ee_len = cpu_to_le16(map->m_len); err = ext4_ext_check_overlap(sbi, inode, &newex, path); if (err) allocated = ext4_ext_get_actual_len(&newex); else allocated = map->m_len; /* allocate new block */ ar.inode = inode; ar.goal = ext4_ext_find_goal(inode, path, map->m_lblk); ar.logical = map->m_lblk; /* * We calculate the offset from the beginning of the cluster * for the logical block number, since when we allocate a * physical cluster, the physical block should start at the * same offset from the beginning of the cluster. This is * needed so that future calls to get_implied_cluster_alloc() * work correctly. */ offset = EXT4_LBLK_COFF(sbi, map->m_lblk); ar.len = EXT4_NUM_B2C(sbi, offset+allocated); ar.goal -= offset; ar.logical -= offset; if (S_ISREG(inode->i_mode)) ar.flags = EXT4_MB_HINT_DATA; else /* disable in-core preallocation for non-regular files */ ar.flags = 0; if (flags & EXT4_GET_BLOCKS_NO_NORMALIZE) ar.flags |= EXT4_MB_HINT_NOPREALLOC; if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) ar.flags |= EXT4_MB_DELALLOC_RESERVED; if (flags & EXT4_GET_BLOCKS_METADATA_NOFAIL) ar.flags |= EXT4_MB_USE_RESERVED; newblock = ext4_mb_new_blocks(handle, &ar, &err); if (!newblock) goto out; allocated_clusters = ar.len; ar.len = EXT4_C2B(sbi, ar.len) - offset; ext_debug(inode, "allocate new block: goal %llu, found %llu/%u, requested %u\n", ar.goal, newblock, ar.len, allocated); if (ar.len > allocated) ar.len = allocated; got_allocated_blocks: /* try to insert new extent into found leaf and return */ pblk = newblock + offset; ext4_ext_store_pblock(&newex, pblk); newex.ee_len = cpu_to_le16(ar.len); /* Mark unwritten */ if (flags & EXT4_GET_BLOCKS_UNWRIT_EXT) { ext4_ext_mark_unwritten(&newex); map->m_flags |= EXT4_MAP_UNWRITTEN; } err = ext4_ext_insert_extent(handle, inode, &path, &newex, flags); if (err) { if (allocated_clusters) { int fb_flags = 0; /* * free data blocks we just allocated. * not a good idea to call discard here directly, * but otherwise we'd need to call it every free(). */ ext4_discard_preallocations(inode); if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) fb_flags = EXT4_FREE_BLOCKS_NO_QUOT_UPDATE; ext4_free_blocks(handle, inode, NULL, newblock, EXT4_C2B(sbi, allocated_clusters), fb_flags); } goto out; } /* * Reduce the reserved cluster count to reflect successful deferred * allocation of delayed allocated clusters or direct allocation of * clusters discovered to be delayed allocated. Once allocated, a * cluster is not included in the reserved count. */ if (test_opt(inode->i_sb, DELALLOC) && allocated_clusters) { if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) { /* * When allocating delayed allocated clusters, simply * reduce the reserved cluster count and claim quota */ ext4_da_update_reserve_space(inode, allocated_clusters, 1); } else { ext4_lblk_t lblk, len; unsigned int n; /* * When allocating non-delayed allocated clusters * (from fallocate, filemap, DIO, or clusters * allocated when delalloc has been disabled by * ext4_nonda_switch), reduce the reserved cluster * count by the number of allocated clusters that * have previously been delayed allocated. Quota * has been claimed by ext4_mb_new_blocks() above, * so release the quota reservations made for any * previously delayed allocated clusters. */ lblk = EXT4_LBLK_CMASK(sbi, map->m_lblk); len = allocated_clusters << sbi->s_cluster_bits; n = ext4_es_delayed_clu(inode, lblk, len); if (n > 0) ext4_da_update_reserve_space(inode, (int) n, 0); } } /* * Cache the extent and update transaction to commit on fdatasync only * when it is _not_ an unwritten extent. */ if ((flags & EXT4_GET_BLOCKS_UNWRIT_EXT) == 0) ext4_update_inode_fsync_trans(handle, inode, 1); else ext4_update_inode_fsync_trans(handle, inode, 0); map->m_flags |= (EXT4_MAP_NEW | EXT4_MAP_MAPPED); map->m_pblk = pblk; map->m_len = ar.len; allocated = map->m_len; ext4_ext_show_leaf(inode, path); out: ext4_free_ext_path(path); trace_ext4_ext_map_blocks_exit(inode, flags, map, err ? err : allocated); return err ? err : allocated; } int ext4_ext_truncate(handle_t *handle, struct inode *inode) { struct super_block *sb = inode->i_sb; ext4_lblk_t last_block; int err = 0; /* * TODO: optimization is possible here. * Probably we need not scan at all, * because page truncation is enough. */ /* we have to know where to truncate from in crash case */ EXT4_I(inode)->i_disksize = inode->i_size; err = ext4_mark_inode_dirty(handle, inode); if (err) return err; last_block = (inode->i_size + sb->s_blocksize - 1) >> EXT4_BLOCK_SIZE_BITS(sb); ext4_es_remove_extent(inode, last_block, EXT_MAX_BLOCKS - last_block); retry_remove_space: err = ext4_ext_remove_space(inode, last_block, EXT_MAX_BLOCKS - 1); if (err == -ENOMEM) { memalloc_retry_wait(GFP_ATOMIC); goto retry_remove_space; } return err; } static int ext4_alloc_file_blocks(struct file *file, ext4_lblk_t offset, ext4_lblk_t len, loff_t new_size, int flags) { struct inode *inode = file_inode(file); handle_t *handle; int ret = 0, ret2 = 0, ret3 = 0; int retries = 0; int depth = 0; struct ext4_map_blocks map; unsigned int credits; loff_t epos; BUG_ON(!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)); map.m_lblk = offset; map.m_len = len; /* * Don't normalize the request if it can fit in one extent so * that it doesn't get unnecessarily split into multiple * extents. */ if (len <= EXT_UNWRITTEN_MAX_LEN) flags |= EXT4_GET_BLOCKS_NO_NORMALIZE; /* * credits to insert 1 extent into extent tree */ credits = ext4_chunk_trans_blocks(inode, len); depth = ext_depth(inode); retry: while (len) { /* * Recalculate credits when extent tree depth changes. */ if (depth != ext_depth(inode)) { credits = ext4_chunk_trans_blocks(inode, len); depth = ext_depth(inode); } handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS, credits); if (IS_ERR(handle)) { ret = PTR_ERR(handle); break; } ret = ext4_map_blocks(handle, inode, &map, flags); if (ret <= 0) { ext4_debug("inode #%lu: block %u: len %u: " "ext4_ext_map_blocks returned %d", inode->i_ino, map.m_lblk, map.m_len, ret); ext4_mark_inode_dirty(handle, inode); ext4_journal_stop(handle); break; } /* * allow a full retry cycle for any remaining allocations */ retries = 0; map.m_lblk += ret; map.m_len = len = len - ret; epos = (loff_t)map.m_lblk << inode->i_blkbits; inode_set_ctime_current(inode); if (new_size) { if (epos > new_size) epos = new_size; if (ext4_update_inode_size(inode, epos) & 0x1) inode_set_mtime_to_ts(inode, inode_get_ctime(inode)); } ret2 = ext4_mark_inode_dirty(handle, inode); ext4_update_inode_fsync_trans(handle, inode, 1); ret3 = ext4_journal_stop(handle); ret2 = ret3 ? ret3 : ret2; if (unlikely(ret2)) break; } if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) goto retry; return ret > 0 ? ret2 : ret; } static int ext4_collapse_range(struct file *file, loff_t offset, loff_t len); static int ext4_insert_range(struct file *file, loff_t offset, loff_t len); static long ext4_zero_range(struct file *file, loff_t offset, loff_t len, int mode) { struct inode *inode = file_inode(file); struct address_space *mapping = file->f_mapping; handle_t *handle = NULL; unsigned int max_blocks; loff_t new_size = 0; int ret = 0; int flags; int credits; int partial_begin, partial_end; loff_t start, end; ext4_lblk_t lblk; unsigned int blkbits = inode->i_blkbits; trace_ext4_zero_range(inode, offset, len, mode); /* * Round up offset. This is not fallocate, we need to zero out * blocks, so convert interior block aligned part of the range to * unwritten and possibly manually zero out unaligned parts of the * range. Here, start and partial_begin are inclusive, end and * partial_end are exclusive. */ start = round_up(offset, 1 << blkbits); end = round_down((offset + len), 1 << blkbits); if (start < offset || end > offset + len) return -EINVAL; partial_begin = offset & ((1 << blkbits) - 1); partial_end = (offset + len) & ((1 << blkbits) - 1); lblk = start >> blkbits; max_blocks = (end >> blkbits); if (max_blocks < lblk) max_blocks = 0; else max_blocks -= lblk; inode_lock(inode); /* * Indirect files do not support unwritten extents */ if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) { ret = -EOPNOTSUPP; goto out_mutex; } if (!(mode & FALLOC_FL_KEEP_SIZE) && (offset + len > inode->i_size || offset + len > EXT4_I(inode)->i_disksize)) { new_size = offset + len; ret = inode_newsize_ok(inode, new_size); if (ret) goto out_mutex; } flags = EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT; /* Wait all existing dio workers, newcomers will block on i_rwsem */ inode_dio_wait(inode); ret = file_modified(file); if (ret) goto out_mutex; /* Preallocate the range including the unaligned edges */ if (partial_begin || partial_end) { ret = ext4_alloc_file_blocks(file, round_down(offset, 1 << blkbits) >> blkbits, (round_up((offset + len), 1 << blkbits) - round_down(offset, 1 << blkbits)) >> blkbits, new_size, flags); if (ret) goto out_mutex; } /* Zero range excluding the unaligned edges */ if (max_blocks > 0) { flags |= (EXT4_GET_BLOCKS_CONVERT_UNWRITTEN | EXT4_EX_NOCACHE); /* * Prevent page faults from reinstantiating pages we have * released from page cache. */ filemap_invalidate_lock(mapping); ret = ext4_break_layouts(inode); if (ret) { filemap_invalidate_unlock(mapping); goto out_mutex; } ret = ext4_update_disksize_before_punch(inode, offset, len); if (ret) { filemap_invalidate_unlock(mapping); goto out_mutex; } /* * For journalled data we need to write (and checkpoint) pages * before discarding page cache to avoid inconsitent data on * disk in case of crash before zeroing trans is committed. */ if (ext4_should_journal_data(inode)) { ret = filemap_write_and_wait_range(mapping, start, end - 1); if (ret) { filemap_invalidate_unlock(mapping); goto out_mutex; } } /* Now release the pages and zero block aligned part of pages */ truncate_pagecache_range(inode, start, end - 1); inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); ret = ext4_alloc_file_blocks(file, lblk, max_blocks, new_size, flags); filemap_invalidate_unlock(mapping); if (ret) goto out_mutex; } if (!partial_begin && !partial_end) goto out_mutex; /* * In worst case we have to writeout two nonadjacent unwritten * blocks and update the inode */ credits = (2 * ext4_ext_index_trans_blocks(inode, 2)) + 1; if (ext4_should_journal_data(inode)) credits += 2; handle = ext4_journal_start(inode, EXT4_HT_MISC, credits); if (IS_ERR(handle)) { ret = PTR_ERR(handle); ext4_std_error(inode->i_sb, ret); goto out_mutex; } inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); if (new_size) ext4_update_inode_size(inode, new_size); ret = ext4_mark_inode_dirty(handle, inode); if (unlikely(ret)) goto out_handle; /* Zero out partial block at the edges of the range */ ret = ext4_zero_partial_blocks(handle, inode, offset, len); if (ret >= 0) ext4_update_inode_fsync_trans(handle, inode, 1); if (file->f_flags & O_SYNC) ext4_handle_sync(handle); out_handle: ext4_journal_stop(handle); out_mutex: inode_unlock(inode); return ret; } /* * preallocate space for a file. This implements ext4's fallocate file * operation, which gets called from sys_fallocate system call. * For block-mapped files, posix_fallocate should fall back to the method * of writing zeroes to the required new blocks (the same behavior which is * expected for file systems which do not support fallocate() system call). */ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len) { struct inode *inode = file_inode(file); loff_t new_size = 0; unsigned int max_blocks; int ret = 0; int flags; ext4_lblk_t lblk; unsigned int blkbits = inode->i_blkbits; /* * Encrypted inodes can't handle collapse range or insert * range since we would need to re-encrypt blocks with a * different IV or XTS tweak (which are based on the logical * block number). */ if (IS_ENCRYPTED(inode) && (mode & (FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_INSERT_RANGE))) return -EOPNOTSUPP; /* Return error if mode is not supported */ if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE | FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE | FALLOC_FL_INSERT_RANGE)) return -EOPNOTSUPP; inode_lock(inode); ret = ext4_convert_inline_data(inode); inode_unlock(inode); if (ret) goto exit; if (mode & FALLOC_FL_PUNCH_HOLE) { ret = ext4_punch_hole(file, offset, len); goto exit; } if (mode & FALLOC_FL_COLLAPSE_RANGE) { ret = ext4_collapse_range(file, offset, len); goto exit; } if (mode & FALLOC_FL_INSERT_RANGE) { ret = ext4_insert_range(file, offset, len); goto exit; } if (mode & FALLOC_FL_ZERO_RANGE) { ret = ext4_zero_range(file, offset, len, mode); goto exit; } trace_ext4_fallocate_enter(inode, offset, len, mode); lblk = offset >> blkbits; max_blocks = EXT4_MAX_BLOCKS(len, offset, blkbits); flags = EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT; inode_lock(inode); /* * We only support preallocation for extent-based files only */ if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) { ret = -EOPNOTSUPP; goto out; } if (!(mode & FALLOC_FL_KEEP_SIZE) && (offset + len > inode->i_size || offset + len > EXT4_I(inode)->i_disksize)) { new_size = offset + len; ret = inode_newsize_ok(inode, new_size); if (ret) goto out; } /* Wait all existing dio workers, newcomers will block on i_rwsem */ inode_dio_wait(inode); ret = file_modified(file); if (ret) goto out; ret = ext4_alloc_file_blocks(file, lblk, max_blocks, new_size, flags); if (ret) goto out; if (file->f_flags & O_SYNC && EXT4_SB(inode->i_sb)->s_journal) { ret = ext4_fc_commit(EXT4_SB(inode->i_sb)->s_journal, EXT4_I(inode)->i_sync_tid); } out: inode_unlock(inode); trace_ext4_fallocate_exit(inode, offset, max_blocks, ret); exit: return ret; } /* * This function convert a range of blocks to written extents * The caller of this function will pass the start offset and the size. * all unwritten extents within this range will be converted to * written extents. * * This function is called from the direct IO end io call back * function, to convert the fallocated extents after IO is completed. * Returns 0 on success. */ int ext4_convert_unwritten_extents(handle_t *handle, struct inode *inode, loff_t offset, ssize_t len) { unsigned int max_blocks; int ret = 0, ret2 = 0, ret3 = 0; struct ext4_map_blocks map; unsigned int blkbits = inode->i_blkbits; unsigned int credits = 0; map.m_lblk = offset >> blkbits; max_blocks = EXT4_MAX_BLOCKS(len, offset, blkbits); if (!handle) { /* * credits to insert 1 extent into extent tree */ credits = ext4_chunk_trans_blocks(inode, max_blocks); } while (ret >= 0 && ret < max_blocks) { map.m_lblk += ret; map.m_len = (max_blocks -= ret); if (credits) { handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS, credits); if (IS_ERR(handle)) { ret = PTR_ERR(handle); break; } } ret = ext4_map_blocks(handle, inode, &map, EXT4_GET_BLOCKS_IO_CONVERT_EXT); if (ret <= 0) ext4_warning(inode->i_sb, "inode #%lu: block %u: len %u: " "ext4_ext_map_blocks returned %d", inode->i_ino, map.m_lblk, map.m_len, ret); ret2 = ext4_mark_inode_dirty(handle, inode); if (credits) { ret3 = ext4_journal_stop(handle); if (unlikely(ret3)) ret2 = ret3; } if (ret <= 0 || ret2) break; } return ret > 0 ? ret2 : ret; } int ext4_convert_unwritten_io_end_vec(handle_t *handle, ext4_io_end_t *io_end) { int ret = 0, err = 0; struct ext4_io_end_vec *io_end_vec; /* * This is somewhat ugly but the idea is clear: When transaction is * reserved, everything goes into it. Otherwise we rather start several * smaller transactions for conversion of each extent separately. */ if (handle) { handle = ext4_journal_start_reserved(handle, EXT4_HT_EXT_CONVERT); if (IS_ERR(handle)) return PTR_ERR(handle); } list_for_each_entry(io_end_vec, &io_end->list_vec, list) { ret = ext4_convert_unwritten_extents(handle, io_end->inode, io_end_vec->offset, io_end_vec->size); if (ret) break; } if (handle) err = ext4_journal_stop(handle); return ret < 0 ? ret : err; } static int ext4_iomap_xattr_fiemap(struct inode *inode, struct iomap *iomap) { __u64 physical = 0; __u64 length = 0; int blockbits = inode->i_sb->s_blocksize_bits; int error = 0; u16 iomap_type; /* in-inode? */ if (ext4_test_inode_state(inode, EXT4_STATE_XATTR)) { struct ext4_iloc iloc; int offset; /* offset of xattr in inode */ error = ext4_get_inode_loc(inode, &iloc); if (error) return error; physical = (__u64)iloc.bh->b_blocknr << blockbits; offset = EXT4_GOOD_OLD_INODE_SIZE + EXT4_I(inode)->i_extra_isize; physical += offset; length = EXT4_SB(inode->i_sb)->s_inode_size - offset; brelse(iloc.bh); iomap_type = IOMAP_INLINE; } else if (EXT4_I(inode)->i_file_acl) { /* external block */ physical = (__u64)EXT4_I(inode)->i_file_acl << blockbits; length = inode->i_sb->s_blocksize; iomap_type = IOMAP_MAPPED; } else { /* no in-inode or external block for xattr, so return -ENOENT */ error = -ENOENT; goto out; } iomap->addr = physical; iomap->offset = 0; iomap->length = length; iomap->type = iomap_type; iomap->flags = 0; out: return error; } static int ext4_iomap_xattr_begin(struct inode *inode, loff_t offset, loff_t length, unsigned flags, struct iomap *iomap, struct iomap *srcmap) { int error; error = ext4_iomap_xattr_fiemap(inode, iomap); if (error == 0 && (offset >= iomap->length)) error = -ENOENT; return error; } static const struct iomap_ops ext4_iomap_xattr_ops = { .iomap_begin = ext4_iomap_xattr_begin, }; static int ext4_fiemap_check_ranges(struct inode *inode, u64 start, u64 *len) { u64 maxbytes; if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) maxbytes = inode->i_sb->s_maxbytes; else maxbytes = EXT4_SB(inode->i_sb)->s_bitmap_maxbytes; if (*len == 0) return -EINVAL; if (start > maxbytes) return -EFBIG; /* * Shrink request scope to what the fs can actually handle. */ if (*len > maxbytes || (maxbytes - *len) < start) *len = maxbytes - start; return 0; } int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, u64 start, u64 len) { int error = 0; if (fieinfo->fi_flags & FIEMAP_FLAG_CACHE) { error = ext4_ext_precache(inode); if (error) return error; fieinfo->fi_flags &= ~FIEMAP_FLAG_CACHE; } /* * For bitmap files the maximum size limit could be smaller than * s_maxbytes, so check len here manually instead of just relying on the * generic check. */ error = ext4_fiemap_check_ranges(inode, start, &len); if (error) return error; if (fieinfo->fi_flags & FIEMAP_FLAG_XATTR) { fieinfo->fi_flags &= ~FIEMAP_FLAG_XATTR; return iomap_fiemap(inode, fieinfo, start, len, &ext4_iomap_xattr_ops); } return iomap_fiemap(inode, fieinfo, start, len, &ext4_iomap_report_ops); } int ext4_get_es_cache(struct inode *inode, struct fiemap_extent_info *fieinfo, __u64 start, __u64 len) { ext4_lblk_t start_blk, len_blks; __u64 last_blk; int error = 0; if (ext4_has_inline_data(inode)) { int has_inline; down_read(&EXT4_I(inode)->xattr_sem); has_inline = ext4_has_inline_data(inode); up_read(&EXT4_I(inode)->xattr_sem); if (has_inline) return 0; } if (fieinfo->fi_flags & FIEMAP_FLAG_CACHE) { error = ext4_ext_precache(inode); if (error) return error; fieinfo->fi_flags &= ~FIEMAP_FLAG_CACHE; } error = fiemap_prep(inode, fieinfo, start, &len, 0); if (error) return error; error = ext4_fiemap_check_ranges(inode, start, &len); if (error) return error; start_blk = start >> inode->i_sb->s_blocksize_bits; last_blk = (start + len - 1) >> inode->i_sb->s_blocksize_bits; if (last_blk >= EXT_MAX_BLOCKS) last_blk = EXT_MAX_BLOCKS-1; len_blks = ((ext4_lblk_t) last_blk) - start_blk + 1; /* * Walk the extent tree gathering extent information * and pushing extents back to the user. */ return ext4_fill_es_cache_info(inode, start_blk, len_blks, fieinfo); } /* * ext4_ext_shift_path_extents: * Shift the extents of a path structure lying between path[depth].p_ext * and EXT_LAST_EXTENT(path[depth].p_hdr), by @shift blocks. @SHIFT tells * if it is right shift or left shift operation. */ static int ext4_ext_shift_path_extents(struct ext4_ext_path *path, ext4_lblk_t shift, struct inode *inode, handle_t *handle, enum SHIFT_DIRECTION SHIFT) { int depth, err = 0; struct ext4_extent *ex_start, *ex_last; bool update = false; int credits, restart_credits; depth = path->p_depth; while (depth >= 0) { if (depth == path->p_depth) { ex_start = path[depth].p_ext; if (!ex_start) return -EFSCORRUPTED; ex_last = EXT_LAST_EXTENT(path[depth].p_hdr); /* leaf + sb + inode */ credits = 3; if (ex_start == EXT_FIRST_EXTENT(path[depth].p_hdr)) { update = true; /* extent tree + sb + inode */ credits = depth + 2; } restart_credits = ext4_writepage_trans_blocks(inode); err = ext4_datasem_ensure_credits(handle, inode, credits, restart_credits, 0); if (err) { if (err > 0) err = -EAGAIN; goto out; } err = ext4_ext_get_access(handle, inode, path + depth); if (err) goto out; while (ex_start <= ex_last) { if (SHIFT == SHIFT_LEFT) { le32_add_cpu(&ex_start->ee_block, -shift); /* Try to merge to the left. */ if ((ex_start > EXT_FIRST_EXTENT(path[depth].p_hdr)) && ext4_ext_try_to_merge_right(inode, path, ex_start - 1)) ex_last--; else ex_start++; } else { le32_add_cpu(&ex_last->ee_block, shift); ext4_ext_try_to_merge_right(inode, path, ex_last); ex_last--; } } err = ext4_ext_dirty(handle, inode, path + depth); if (err) goto out; if (--depth < 0 || !update) break; } /* Update index too */ err = ext4_ext_get_access(handle, inode, path + depth); if (err) goto out; if (SHIFT == SHIFT_LEFT) le32_add_cpu(&path[depth].p_idx->ei_block, -shift); else le32_add_cpu(&path[depth].p_idx->ei_block, shift); err = ext4_ext_dirty(handle, inode, path + depth); if (err) goto out; /* we are done if current index is not a starting index */ if (path[depth].p_idx != EXT_FIRST_INDEX(path[depth].p_hdr)) break; depth--; } out: return err; } /* * ext4_ext_shift_extents: * All the extents which lies in the range from @start to the last allocated * block for the @inode are shifted either towards left or right (depending * upon @SHIFT) by @shift blocks. * On success, 0 is returned, error otherwise. */ static int ext4_ext_shift_extents(struct inode *inode, handle_t *handle, ext4_lblk_t start, ext4_lblk_t shift, enum SHIFT_DIRECTION SHIFT) { struct ext4_ext_path *path; int ret = 0, depth; struct ext4_extent *extent; ext4_lblk_t stop, *iterator, ex_start, ex_end; ext4_lblk_t tmp = EXT_MAX_BLOCKS; /* Let path point to the last extent */ path = ext4_find_extent(inode, EXT_MAX_BLOCKS - 1, NULL, EXT4_EX_NOCACHE); if (IS_ERR(path)) return PTR_ERR(path); depth = path->p_depth; extent = path[depth].p_ext; if (!extent) goto out; stop = le32_to_cpu(extent->ee_block); /* * For left shifts, make sure the hole on the left is big enough to * accommodate the shift. For right shifts, make sure the last extent * won't be shifted beyond EXT_MAX_BLOCKS. */ if (SHIFT == SHIFT_LEFT) { path = ext4_find_extent(inode, start - 1, &path, EXT4_EX_NOCACHE); if (IS_ERR(path)) return PTR_ERR(path); depth = path->p_depth; extent = path[depth].p_ext; if (extent) { ex_start = le32_to_cpu(extent->ee_block); ex_end = le32_to_cpu(extent->ee_block) + ext4_ext_get_actual_len(extent); } else { ex_start = 0; ex_end = 0; } if ((start == ex_start && shift > ex_start) || (shift > start - ex_end)) { ret = -EINVAL; goto out; } } else { if (shift > EXT_MAX_BLOCKS - (stop + ext4_ext_get_actual_len(extent))) { ret = -EINVAL; goto out; } } /* * In case of left shift, iterator points to start and it is increased * till we reach stop. In case of right shift, iterator points to stop * and it is decreased till we reach start. */ again: ret = 0; if (SHIFT == SHIFT_LEFT) iterator = &start; else iterator = &stop; if (tmp != EXT_MAX_BLOCKS) *iterator = tmp; /* * Its safe to start updating extents. Start and stop are unsigned, so * in case of right shift if extent with 0 block is reached, iterator * becomes NULL to indicate the end of the loop. */ while (iterator && start <= stop) { path = ext4_find_extent(inode, *iterator, &path, EXT4_EX_NOCACHE); if (IS_ERR(path)) return PTR_ERR(path); depth = path->p_depth; extent = path[depth].p_ext; if (!extent) { EXT4_ERROR_INODE(inode, "unexpected hole at %lu", (unsigned long) *iterator); return -EFSCORRUPTED; } if (SHIFT == SHIFT_LEFT && *iterator > le32_to_cpu(extent->ee_block)) { /* Hole, move to the next extent */ if (extent < EXT_LAST_EXTENT(path[depth].p_hdr)) { path[depth].p_ext++; } else { *iterator = ext4_ext_next_allocated_block(path); continue; } } tmp = *iterator; if (SHIFT == SHIFT_LEFT) { extent = EXT_LAST_EXTENT(path[depth].p_hdr); *iterator = le32_to_cpu(extent->ee_block) + ext4_ext_get_actual_len(extent); } else { extent = EXT_FIRST_EXTENT(path[depth].p_hdr); if (le32_to_cpu(extent->ee_block) > start) *iterator = le32_to_cpu(extent->ee_block) - 1; else if (le32_to_cpu(extent->ee_block) == start) iterator = NULL; else { extent = EXT_LAST_EXTENT(path[depth].p_hdr); while (le32_to_cpu(extent->ee_block) >= start) extent--; if (extent == EXT_LAST_EXTENT(path[depth].p_hdr)) break; extent++; iterator = NULL; } path[depth].p_ext = extent; } ret = ext4_ext_shift_path_extents(path, shift, inode, handle, SHIFT); /* iterator can be NULL which means we should break */ if (ret == -EAGAIN) goto again; if (ret) break; } out: ext4_free_ext_path(path); return ret; } /* * ext4_collapse_range: * This implements the fallocate's collapse range functionality for ext4 * Returns: 0 and non-zero on error. */ static int ext4_collapse_range(struct file *file, loff_t offset, loff_t len) { struct inode *inode = file_inode(file); struct super_block *sb = inode->i_sb; struct address_space *mapping = inode->i_mapping; ext4_lblk_t punch_start, punch_stop; handle_t *handle; unsigned int credits; loff_t new_size, ioffset; int ret; /* * We need to test this early because xfstests assumes that a * collapse range of (0, 1) will return EOPNOTSUPP if the file * system does not support collapse range. */ if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) return -EOPNOTSUPP; /* Collapse range works only on fs cluster size aligned regions. */ if (!IS_ALIGNED(offset | len, EXT4_CLUSTER_SIZE(sb))) return -EINVAL; trace_ext4_collapse_range(inode, offset, len); punch_start = offset >> EXT4_BLOCK_SIZE_BITS(sb); punch_stop = (offset + len) >> EXT4_BLOCK_SIZE_BITS(sb); inode_lock(inode); /* * There is no need to overlap collapse range with EOF, in which case * it is effectively a truncate operation */ if (offset + len >= inode->i_size) { ret = -EINVAL; goto out_mutex; } /* Currently just for extent based files */ if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { ret = -EOPNOTSUPP; goto out_mutex; } /* Wait for existing dio to complete */ inode_dio_wait(inode); ret = file_modified(file); if (ret) goto out_mutex; /* * Prevent page faults from reinstantiating pages we have released from * page cache. */ filemap_invalidate_lock(mapping); ret = ext4_break_layouts(inode); if (ret) goto out_mmap; /* * Need to round down offset to be aligned with page size boundary * for page size > block size. */ ioffset = round_down(offset, PAGE_SIZE); /* * Write tail of the last page before removed range since it will get * removed from the page cache below. */ ret = filemap_write_and_wait_range(mapping, ioffset, offset); if (ret) goto out_mmap; /* * Write data that will be shifted to preserve them when discarding * page cache below. We are also protected from pages becoming dirty * by i_rwsem and invalidate_lock. */ ret = filemap_write_and_wait_range(mapping, offset + len, LLONG_MAX); if (ret) goto out_mmap; truncate_pagecache(inode, ioffset); credits = ext4_writepage_trans_blocks(inode); handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits); if (IS_ERR(handle)) { ret = PTR_ERR(handle); goto out_mmap; } ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_FALLOC_RANGE, handle); down_write(&EXT4_I(inode)->i_data_sem); ext4_discard_preallocations(inode); ext4_es_remove_extent(inode, punch_start, EXT_MAX_BLOCKS - punch_start); ret = ext4_ext_remove_space(inode, punch_start, punch_stop - 1); if (ret) { up_write(&EXT4_I(inode)->i_data_sem); goto out_stop; } ext4_discard_preallocations(inode); ret = ext4_ext_shift_extents(inode, handle, punch_stop, punch_stop - punch_start, SHIFT_LEFT); if (ret) { up_write(&EXT4_I(inode)->i_data_sem); goto out_stop; } new_size = inode->i_size - len; i_size_write(inode, new_size); EXT4_I(inode)->i_disksize = new_size; up_write(&EXT4_I(inode)->i_data_sem); if (IS_SYNC(inode)) ext4_handle_sync(handle); inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); ret = ext4_mark_inode_dirty(handle, inode); ext4_update_inode_fsync_trans(handle, inode, 1); out_stop: ext4_journal_stop(handle); out_mmap: filemap_invalidate_unlock(mapping); out_mutex: inode_unlock(inode); return ret; } /* * ext4_insert_range: * This function implements the FALLOC_FL_INSERT_RANGE flag of fallocate. * The data blocks starting from @offset to the EOF are shifted by @len * towards right to create a hole in the @inode. Inode size is increased * by len bytes. * Returns 0 on success, error otherwise. */ static int ext4_insert_range(struct file *file, loff_t offset, loff_t len) { struct inode *inode = file_inode(file); struct super_block *sb = inode->i_sb; struct address_space *mapping = inode->i_mapping; handle_t *handle; struct ext4_ext_path *path; struct ext4_extent *extent; ext4_lblk_t offset_lblk, len_lblk, ee_start_lblk = 0; unsigned int credits, ee_len; int ret = 0, depth, split_flag = 0; loff_t ioffset; /* * We need to test this early because xfstests assumes that an * insert range of (0, 1) will return EOPNOTSUPP if the file * system does not support insert range. */ if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) return -EOPNOTSUPP; /* Insert range works only on fs cluster size aligned regions. */ if (!IS_ALIGNED(offset | len, EXT4_CLUSTER_SIZE(sb))) return -EINVAL; trace_ext4_insert_range(inode, offset, len); offset_lblk = offset >> EXT4_BLOCK_SIZE_BITS(sb); len_lblk = len >> EXT4_BLOCK_SIZE_BITS(sb); inode_lock(inode); /* Currently just for extent based files */ if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { ret = -EOPNOTSUPP; goto out_mutex; } /* Check whether the maximum file size would be exceeded */ if (len > inode->i_sb->s_maxbytes - inode->i_size) { ret = -EFBIG; goto out_mutex; } /* Offset must be less than i_size */ if (offset >= inode->i_size) { ret = -EINVAL; goto out_mutex; } /* Wait for existing dio to complete */ inode_dio_wait(inode); ret = file_modified(file); if (ret) goto out_mutex; /* * Prevent page faults from reinstantiating pages we have released from * page cache. */ filemap_invalidate_lock(mapping); ret = ext4_break_layouts(inode); if (ret) goto out_mmap; /* * Need to round down to align start offset to page size boundary * for page size > block size. */ ioffset = round_down(offset, PAGE_SIZE); /* Write out all dirty pages */ ret = filemap_write_and_wait_range(inode->i_mapping, ioffset, LLONG_MAX); if (ret) goto out_mmap; truncate_pagecache(inode, ioffset); credits = ext4_writepage_trans_blocks(inode); handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits); if (IS_ERR(handle)) { ret = PTR_ERR(handle); goto out_mmap; } ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_FALLOC_RANGE, handle); /* Expand file to avoid data loss if there is error while shifting */ inode->i_size += len; EXT4_I(inode)->i_disksize += len; inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); ret = ext4_mark_inode_dirty(handle, inode); if (ret) goto out_stop; down_write(&EXT4_I(inode)->i_data_sem); ext4_discard_preallocations(inode); path = ext4_find_extent(inode, offset_lblk, NULL, 0); if (IS_ERR(path)) { up_write(&EXT4_I(inode)->i_data_sem); goto out_stop; } depth = ext_depth(inode); extent = path[depth].p_ext; if (extent) { ee_start_lblk = le32_to_cpu(extent->ee_block); ee_len = ext4_ext_get_actual_len(extent); /* * If offset_lblk is not the starting block of extent, split * the extent @offset_lblk */ if ((offset_lblk > ee_start_lblk) && (offset_lblk < (ee_start_lblk + ee_len))) { if (ext4_ext_is_unwritten(extent)) split_flag = EXT4_EXT_MARK_UNWRIT1 | EXT4_EXT_MARK_UNWRIT2; ret = ext4_split_extent_at(handle, inode, &path, offset_lblk, split_flag, EXT4_EX_NOCACHE | EXT4_GET_BLOCKS_PRE_IO | EXT4_GET_BLOCKS_METADATA_NOFAIL); } ext4_free_ext_path(path); if (ret < 0) { up_write(&EXT4_I(inode)->i_data_sem); goto out_stop; } } else { ext4_free_ext_path(path); } ext4_es_remove_extent(inode, offset_lblk, EXT_MAX_BLOCKS - offset_lblk); /* * if offset_lblk lies in a hole which is at start of file, use * ee_start_lblk to shift extents */ ret = ext4_ext_shift_extents(inode, handle, max(ee_start_lblk, offset_lblk), len_lblk, SHIFT_RIGHT); up_write(&EXT4_I(inode)->i_data_sem); if (IS_SYNC(inode)) ext4_handle_sync(handle); if (ret >= 0) ext4_update_inode_fsync_trans(handle, inode, 1); out_stop: ext4_journal_stop(handle); out_mmap: filemap_invalidate_unlock(mapping); out_mutex: inode_unlock(inode); return ret; } /** * ext4_swap_extents() - Swap extents between two inodes * @handle: handle for this transaction * @inode1: First inode * @inode2: Second inode * @lblk1: Start block for first inode * @lblk2: Start block for second inode * @count: Number of blocks to swap * @unwritten: Mark second inode's extents as unwritten after swap * @erp: Pointer to save error value * * This helper routine does exactly what is promise "swap extents". All other * stuff such as page-cache locking consistency, bh mapping consistency or * extent's data copying must be performed by caller. * Locking: * i_rwsem is held for both inodes * i_data_sem is locked for write for both inodes * Assumptions: * All pages from requested range are locked for both inodes */ int ext4_swap_extents(handle_t *handle, struct inode *inode1, struct inode *inode2, ext4_lblk_t lblk1, ext4_lblk_t lblk2, ext4_lblk_t count, int unwritten, int *erp) { struct ext4_ext_path *path1 = NULL; struct ext4_ext_path *path2 = NULL; int replaced_count = 0; BUG_ON(!rwsem_is_locked(&EXT4_I(inode1)->i_data_sem)); BUG_ON(!rwsem_is_locked(&EXT4_I(inode2)->i_data_sem)); BUG_ON(!inode_is_locked(inode1)); BUG_ON(!inode_is_locked(inode2)); ext4_es_remove_extent(inode1, lblk1, count); ext4_es_remove_extent(inode2, lblk2, count); while (count) { struct ext4_extent *ex1, *ex2, tmp_ex; ext4_lblk_t e1_blk, e2_blk; int e1_len, e2_len, len; int split = 0; path1 = ext4_find_extent(inode1, lblk1, NULL, EXT4_EX_NOCACHE); if (IS_ERR(path1)) { *erp = PTR_ERR(path1); path1 = NULL; finish: count = 0; goto repeat; } path2 = ext4_find_extent(inode2, lblk2, NULL, EXT4_EX_NOCACHE); if (IS_ERR(path2)) { *erp = PTR_ERR(path2); path2 = NULL; goto finish; } ex1 = path1[path1->p_depth].p_ext; ex2 = path2[path2->p_depth].p_ext; /* Do we have something to swap ? */ if (unlikely(!ex2 || !ex1)) goto finish; e1_blk = le32_to_cpu(ex1->ee_block); e2_blk = le32_to_cpu(ex2->ee_block); e1_len = ext4_ext_get_actual_len(ex1); e2_len = ext4_ext_get_actual_len(ex2); /* Hole handling */ if (!in_range(lblk1, e1_blk, e1_len) || !in_range(lblk2, e2_blk, e2_len)) { ext4_lblk_t next1, next2; /* if hole after extent, then go to next extent */ next1 = ext4_ext_next_allocated_block(path1); next2 = ext4_ext_next_allocated_block(path2); /* If hole before extent, then shift to that extent */ if (e1_blk > lblk1) next1 = e1_blk; if (e2_blk > lblk2) next2 = e2_blk; /* Do we have something to swap */ if (next1 == EXT_MAX_BLOCKS || next2 == EXT_MAX_BLOCKS) goto finish; /* Move to the rightest boundary */ len = next1 - lblk1; if (len < next2 - lblk2) len = next2 - lblk2; if (len > count) len = count; lblk1 += len; lblk2 += len; count -= len; goto repeat; } /* Prepare left boundary */ if (e1_blk < lblk1) { split = 1; *erp = ext4_force_split_extent_at(handle, inode1, &path1, lblk1, 0); if (unlikely(*erp)) goto finish; } if (e2_blk < lblk2) { split = 1; *erp = ext4_force_split_extent_at(handle, inode2, &path2, lblk2, 0); if (unlikely(*erp)) goto finish; } /* ext4_split_extent_at() may result in leaf extent split, * path must to be revalidated. */ if (split) goto repeat; /* Prepare right boundary */ len = count; if (len > e1_blk + e1_len - lblk1) len = e1_blk + e1_len - lblk1; if (len > e2_blk + e2_len - lblk2) len = e2_blk + e2_len - lblk2; if (len != e1_len) { split = 1; *erp = ext4_force_split_extent_at(handle, inode1, &path1, lblk1 + len, 0); if (unlikely(*erp)) goto finish; } if (len != e2_len) { split = 1; *erp = ext4_force_split_extent_at(handle, inode2, &path2, lblk2 + len, 0); if (*erp) goto finish; } /* ext4_split_extent_at() may result in leaf extent split, * path must to be revalidated. */ if (split) goto repeat; BUG_ON(e2_len != e1_len); *erp = ext4_ext_get_access(handle, inode1, path1 + path1->p_depth); if (unlikely(*erp)) goto finish; *erp = ext4_ext_get_access(handle, inode2, path2 + path2->p_depth); if (unlikely(*erp)) goto finish; /* Both extents are fully inside boundaries. Swap it now */ tmp_ex = *ex1; ext4_ext_store_pblock(ex1, ext4_ext_pblock(ex2)); ext4_ext_store_pblock(ex2, ext4_ext_pblock(&tmp_ex)); ex1->ee_len = cpu_to_le16(e2_len); ex2->ee_len = cpu_to_le16(e1_len); if (unwritten) ext4_ext_mark_unwritten(ex2); if (ext4_ext_is_unwritten(&tmp_ex)) ext4_ext_mark_unwritten(ex1); ext4_ext_try_to_merge(handle, inode2, path2, ex2); ext4_ext_try_to_merge(handle, inode1, path1, ex1); *erp = ext4_ext_dirty(handle, inode2, path2 + path2->p_depth); if (unlikely(*erp)) goto finish; *erp = ext4_ext_dirty(handle, inode1, path1 + path1->p_depth); /* * Looks scarry ah..? second inode already points to new blocks, * and it was successfully dirtied. But luckily error may happen * only due to journal error, so full transaction will be * aborted anyway. */ if (unlikely(*erp)) goto finish; lblk1 += len; lblk2 += len; replaced_count += len; count -= len; repeat: ext4_free_ext_path(path1); ext4_free_ext_path(path2); path1 = path2 = NULL; } return replaced_count; } /* * ext4_clu_mapped - determine whether any block in a logical cluster has * been mapped to a physical cluster * * @inode - file containing the logical cluster * @lclu - logical cluster of interest * * Returns 1 if any block in the logical cluster is mapped, signifying * that a physical cluster has been allocated for it. Otherwise, * returns 0. Can also return negative error codes. Derived from * ext4_ext_map_blocks(). */ int ext4_clu_mapped(struct inode *inode, ext4_lblk_t lclu) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); struct ext4_ext_path *path; int depth, mapped = 0, err = 0; struct ext4_extent *extent; ext4_lblk_t first_lblk, first_lclu, last_lclu; /* * if data can be stored inline, the logical cluster isn't * mapped - no physical clusters have been allocated, and the * file has no extents */ if (ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA) || ext4_has_inline_data(inode)) return 0; /* search for the extent closest to the first block in the cluster */ path = ext4_find_extent(inode, EXT4_C2B(sbi, lclu), NULL, 0); if (IS_ERR(path)) { err = PTR_ERR(path); path = NULL; goto out; } depth = ext_depth(inode); /* * A consistent leaf must not be empty. This situation is possible, * though, _during_ tree modification, and it's why an assert can't * be put in ext4_find_extent(). */ if (unlikely(path[depth].p_ext == NULL && depth != 0)) { EXT4_ERROR_INODE(inode, "bad extent address - lblock: %lu, depth: %d, pblock: %lld", (unsigned long) EXT4_C2B(sbi, lclu), depth, path[depth].p_block); err = -EFSCORRUPTED; goto out; } extent = path[depth].p_ext; /* can't be mapped if the extent tree is empty */ if (extent == NULL) goto out; first_lblk = le32_to_cpu(extent->ee_block); first_lclu = EXT4_B2C(sbi, first_lblk); /* * Three possible outcomes at this point - found extent spanning * the target cluster, to the left of the target cluster, or to the * right of the target cluster. The first two cases are handled here. * The last case indicates the target cluster is not mapped. */ if (lclu >= first_lclu) { last_lclu = EXT4_B2C(sbi, first_lblk + ext4_ext_get_actual_len(extent) - 1); if (lclu <= last_lclu) { mapped = 1; } else { first_lblk = ext4_ext_next_allocated_block(path); first_lclu = EXT4_B2C(sbi, first_lblk); if (lclu == first_lclu) mapped = 1; } } out: ext4_free_ext_path(path); return err ? err : mapped; } /* * Updates physical block address and unwritten status of extent * starting at lblk start and of len. If such an extent doesn't exist, * this function splits the extent tree appropriately to create an * extent like this. This function is called in the fast commit * replay path. Returns 0 on success and error on failure. */ int ext4_ext_replay_update_ex(struct inode *inode, ext4_lblk_t start, int len, int unwritten, ext4_fsblk_t pblk) { struct ext4_ext_path *path = NULL, *ppath; struct ext4_extent *ex; int ret; path = ext4_find_extent(inode, start, NULL, 0); if (IS_ERR(path)) return PTR_ERR(path); ex = path[path->p_depth].p_ext; if (!ex) { ret = -EFSCORRUPTED; goto out; } if (le32_to_cpu(ex->ee_block) != start || ext4_ext_get_actual_len(ex) != len) { /* We need to split this extent to match our extent first */ ppath = path; down_write(&EXT4_I(inode)->i_data_sem); ret = ext4_force_split_extent_at(NULL, inode, &ppath, start, 1); up_write(&EXT4_I(inode)->i_data_sem); if (ret) goto out; kfree(path); path = ext4_find_extent(inode, start, NULL, 0); if (IS_ERR(path)) return -1; ppath = path; ex = path[path->p_depth].p_ext; WARN_ON(le32_to_cpu(ex->ee_block) != start); if (ext4_ext_get_actual_len(ex) != len) { down_write(&EXT4_I(inode)->i_data_sem); ret = ext4_force_split_extent_at(NULL, inode, &ppath, start + len, 1); up_write(&EXT4_I(inode)->i_data_sem); if (ret) goto out; kfree(path); path = ext4_find_extent(inode, start, NULL, 0); if (IS_ERR(path)) return -EINVAL; ex = path[path->p_depth].p_ext; } } if (unwritten) ext4_ext_mark_unwritten(ex); else ext4_ext_mark_initialized(ex); ext4_ext_store_pblock(ex, pblk); down_write(&EXT4_I(inode)->i_data_sem); ret = ext4_ext_dirty(NULL, inode, &path[path->p_depth]); up_write(&EXT4_I(inode)->i_data_sem); out: ext4_free_ext_path(path); ext4_mark_inode_dirty(NULL, inode); return ret; } /* Try to shrink the extent tree */ void ext4_ext_replay_shrink_inode(struct inode *inode, ext4_lblk_t end) { struct ext4_ext_path *path = NULL; struct ext4_extent *ex; ext4_lblk_t old_cur, cur = 0; while (cur < end) { path = ext4_find_extent(inode, cur, NULL, 0); if (IS_ERR(path)) return; ex = path[path->p_depth].p_ext; if (!ex) { ext4_free_ext_path(path); ext4_mark_inode_dirty(NULL, inode); return; } old_cur = cur; cur = le32_to_cpu(ex->ee_block) + ext4_ext_get_actual_len(ex); if (cur <= old_cur) cur = old_cur + 1; ext4_ext_try_to_merge(NULL, inode, path, ex); down_write(&EXT4_I(inode)->i_data_sem); ext4_ext_dirty(NULL, inode, &path[path->p_depth]); up_write(&EXT4_I(inode)->i_data_sem); ext4_mark_inode_dirty(NULL, inode); ext4_free_ext_path(path); } } /* Check if *cur is a hole and if it is, skip it */ static int skip_hole(struct inode *inode, ext4_lblk_t *cur) { int ret; struct ext4_map_blocks map; map.m_lblk = *cur; map.m_len = ((inode->i_size) >> inode->i_sb->s_blocksize_bits) - *cur; ret = ext4_map_blocks(NULL, inode, &map, 0); if (ret < 0) return ret; if (ret != 0) return 0; *cur = *cur + map.m_len; return 0; } /* Count number of blocks used by this inode and update i_blocks */ int ext4_ext_replay_set_iblocks(struct inode *inode) { struct ext4_ext_path *path = NULL, *path2 = NULL; struct ext4_extent *ex; ext4_lblk_t cur = 0, end; int numblks = 0, i, ret = 0; ext4_fsblk_t cmp1, cmp2; struct ext4_map_blocks map; /* Determin the size of the file first */ path = ext4_find_extent(inode, EXT_MAX_BLOCKS - 1, NULL, EXT4_EX_NOCACHE); if (IS_ERR(path)) return PTR_ERR(path); ex = path[path->p_depth].p_ext; if (!ex) { ext4_free_ext_path(path); goto out; } end = le32_to_cpu(ex->ee_block) + ext4_ext_get_actual_len(ex); ext4_free_ext_path(path); /* Count the number of data blocks */ cur = 0; while (cur < end) { map.m_lblk = cur; map.m_len = end - cur; ret = ext4_map_blocks(NULL, inode, &map, 0); if (ret < 0) break; if (ret > 0) numblks += ret; cur = cur + map.m_len; } /* * Count the number of extent tree blocks. We do it by looking up * two successive extents and determining the difference between * their paths. When path is different for 2 successive extents * we compare the blocks in the path at each level and increment * iblocks by total number of differences found. */ cur = 0; ret = skip_hole(inode, &cur); if (ret < 0) goto out; path = ext4_find_extent(inode, cur, NULL, 0); if (IS_ERR(path)) goto out; numblks += path->p_depth; ext4_free_ext_path(path); while (cur < end) { path = ext4_find_extent(inode, cur, NULL, 0); if (IS_ERR(path)) break; ex = path[path->p_depth].p_ext; if (!ex) { ext4_free_ext_path(path); return 0; } cur = max(cur + 1, le32_to_cpu(ex->ee_block) + ext4_ext_get_actual_len(ex)); ret = skip_hole(inode, &cur); if (ret < 0) { ext4_free_ext_path(path); break; } path2 = ext4_find_extent(inode, cur, NULL, 0); if (IS_ERR(path2)) { ext4_free_ext_path(path); break; } for (i = 0; i <= max(path->p_depth, path2->p_depth); i++) { cmp1 = cmp2 = 0; if (i <= path->p_depth) cmp1 = path[i].p_bh ? path[i].p_bh->b_blocknr : 0; if (i <= path2->p_depth) cmp2 = path2[i].p_bh ? path2[i].p_bh->b_blocknr : 0; if (cmp1 != cmp2 && cmp2 != 0) numblks++; } ext4_free_ext_path(path); ext4_free_ext_path(path2); } out: inode->i_blocks = numblks << (inode->i_sb->s_blocksize_bits - 9); ext4_mark_inode_dirty(NULL, inode); return 0; } int ext4_ext_clear_bb(struct inode *inode) { struct ext4_ext_path *path = NULL; struct ext4_extent *ex; ext4_lblk_t cur = 0, end; int j, ret = 0; struct ext4_map_blocks map; if (ext4_test_inode_flag(inode, EXT4_INODE_INLINE_DATA)) return 0; /* Determin the size of the file first */ path = ext4_find_extent(inode, EXT_MAX_BLOCKS - 1, NULL, EXT4_EX_NOCACHE); if (IS_ERR(path)) return PTR_ERR(path); ex = path[path->p_depth].p_ext; if (!ex) { ext4_free_ext_path(path); return 0; } end = le32_to_cpu(ex->ee_block) + ext4_ext_get_actual_len(ex); ext4_free_ext_path(path); cur = 0; while (cur < end) { map.m_lblk = cur; map.m_len = end - cur; ret = ext4_map_blocks(NULL, inode, &map, 0); if (ret < 0) break; if (ret > 0) { path = ext4_find_extent(inode, map.m_lblk, NULL, 0); if (!IS_ERR_OR_NULL(path)) { for (j = 0; j < path->p_depth; j++) { ext4_mb_mark_bb(inode->i_sb, path[j].p_block, 1, false); ext4_fc_record_regions(inode->i_sb, inode->i_ino, 0, path[j].p_block, 1, 1); } ext4_free_ext_path(path); } ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, false); ext4_fc_record_regions(inode->i_sb, inode->i_ino, map.m_lblk, map.m_pblk, map.m_len, 1); } cur = cur + map.m_len; } return 0; } |
704 705 742 10 10 696 743 744 745 742 777 778 777 226 743 709 697 43 728 778 745 47 693 778 728 728 728 46 728 727 728 730 730 730 730 471 729 37 11 1 2 10 12 728 728 30 23 726 2 2 2 19 728 728 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 | // SPDX-License-Identifier: GPL-2.0 /* * linux/fs/ext4/page-io.c * * This contains the new page_io functions for ext4 * * Written by Theodore Ts'o, 2010. */ #include <linux/fs.h> #include <linux/time.h> #include <linux/highuid.h> #include <linux/pagemap.h> #include <linux/quotaops.h> #include <linux/string.h> #include <linux/buffer_head.h> #include <linux/writeback.h> #include <linux/pagevec.h> #include <linux/mpage.h> #include <linux/namei.h> #include <linux/uio.h> #include <linux/bio.h> #include <linux/workqueue.h> #include <linux/kernel.h> #include <linux/slab.h> #include <linux/mm.h> #include <linux/sched/mm.h> #include "ext4_jbd2.h" #include "xattr.h" #include "acl.h" static struct kmem_cache *io_end_cachep; static struct kmem_cache *io_end_vec_cachep; int __init ext4_init_pageio(void) { io_end_cachep = KMEM_CACHE(ext4_io_end, SLAB_RECLAIM_ACCOUNT); if (io_end_cachep == NULL) return -ENOMEM; io_end_vec_cachep = KMEM_CACHE(ext4_io_end_vec, 0); if (io_end_vec_cachep == NULL) { kmem_cache_destroy(io_end_cachep); return -ENOMEM; } return 0; } void ext4_exit_pageio(void) { kmem_cache_destroy(io_end_cachep); kmem_cache_destroy(io_end_vec_cachep); } struct ext4_io_end_vec *ext4_alloc_io_end_vec(ext4_io_end_t *io_end) { struct ext4_io_end_vec *io_end_vec; io_end_vec = kmem_cache_zalloc(io_end_vec_cachep, GFP_NOFS); if (!io_end_vec) return ERR_PTR(-ENOMEM); INIT_LIST_HEAD(&io_end_vec->list); list_add_tail(&io_end_vec->list, &io_end->list_vec); return io_end_vec; } static void ext4_free_io_end_vec(ext4_io_end_t *io_end) { struct ext4_io_end_vec *io_end_vec, *tmp; if (list_empty(&io_end->list_vec)) return; list_for_each_entry_safe(io_end_vec, tmp, &io_end->list_vec, list) { list_del(&io_end_vec->list); kmem_cache_free(io_end_vec_cachep, io_end_vec); } } struct ext4_io_end_vec *ext4_last_io_end_vec(ext4_io_end_t *io_end) { BUG_ON(list_empty(&io_end->list_vec)); return list_last_entry(&io_end->list_vec, struct ext4_io_end_vec, list); } /* * Print an buffer I/O error compatible with the fs/buffer.c. This * provides compatibility with dmesg scrapers that look for a specific * buffer I/O error message. We really need a unified error reporting * structure to userspace ala Digital Unix's uerf system, but it's * probably not going to happen in my lifetime, due to LKML politics... */ static void buffer_io_error(struct buffer_head *bh) { printk_ratelimited(KERN_ERR "Buffer I/O error on device %pg, logical block %llu\n", bh->b_bdev, (unsigned long long)bh->b_blocknr); } static void ext4_finish_bio(struct bio *bio) { struct folio_iter fi; bio_for_each_folio_all(fi, bio) { struct folio *folio = fi.folio; struct folio *io_folio = NULL; struct buffer_head *bh, *head; size_t bio_start = fi.offset; size_t bio_end = bio_start + fi.length; unsigned under_io = 0; unsigned long flags; if (fscrypt_is_bounce_folio(folio)) { io_folio = folio; folio = fscrypt_pagecache_folio(folio); } if (bio->bi_status) { int err = blk_status_to_errno(bio->bi_status); folio_set_error(folio); mapping_set_error(folio->mapping, err); } bh = head = folio_buffers(folio); /* * We check all buffers in the folio under b_uptodate_lock * to avoid races with other end io clearing async_write flags */ spin_lock_irqsave(&head->b_uptodate_lock, flags); do { if (bh_offset(bh) < bio_start || bh_offset(bh) + bh->b_size > bio_end) { if (buffer_async_write(bh)) under_io++; continue; } clear_buffer_async_write(bh); if (bio->bi_status) { set_buffer_write_io_error(bh); buffer_io_error(bh); } } while ((bh = bh->b_this_page) != head); spin_unlock_irqrestore(&head->b_uptodate_lock, flags); if (!under_io) { fscrypt_free_bounce_page(&io_folio->page); folio_end_writeback(folio); } } } static void ext4_release_io_end(ext4_io_end_t *io_end) { struct bio *bio, *next_bio; BUG_ON(!list_empty(&io_end->list)); BUG_ON(io_end->flag & EXT4_IO_END_UNWRITTEN); WARN_ON(io_end->handle); for (bio = io_end->bio; bio; bio = next_bio) { next_bio = bio->bi_private; ext4_finish_bio(bio); bio_put(bio); } ext4_free_io_end_vec(io_end); kmem_cache_free(io_end_cachep, io_end); } /* * Check a range of space and convert unwritten extents to written. Note that * we are protected from truncate touching same part of extent tree by the * fact that truncate code waits for all DIO to finish (thus exclusion from * direct IO is achieved) and also waits for PageWriteback bits. Thus we * cannot get to ext4_ext_truncate() before all IOs overlapping that range are * completed (happens from ext4_free_ioend()). */ static int ext4_end_io_end(ext4_io_end_t *io_end) { struct inode *inode = io_end->inode; handle_t *handle = io_end->handle; int ret = 0; ext4_debug("ext4_end_io_nolock: io_end 0x%p from inode %lu,list->next 0x%p," "list->prev 0x%p\n", io_end, inode->i_ino, io_end->list.next, io_end->list.prev); io_end->handle = NULL; /* Following call will use up the handle */ ret = ext4_convert_unwritten_io_end_vec(handle, io_end); if (ret < 0 && !ext4_forced_shutdown(inode->i_sb)) { ext4_msg(inode->i_sb, KERN_EMERG, "failed to convert unwritten extents to written " "extents -- potential data loss! " "(inode %lu, error %d)", inode->i_ino, ret); } ext4_clear_io_unwritten_flag(io_end); ext4_release_io_end(io_end); return ret; } static void dump_completed_IO(struct inode *inode, struct list_head *head) { #ifdef EXT4FS_DEBUG struct list_head *cur, *before, *after; ext4_io_end_t *io_end, *io_end0, *io_end1; if (list_empty(head)) return; ext4_debug("Dump inode %lu completed io list\n", inode->i_ino); list_for_each_entry(io_end, head, list) { cur = &io_end->list; before = cur->prev; io_end0 = container_of(before, ext4_io_end_t, list); after = cur->next; io_end1 = container_of(after, ext4_io_end_t, list); ext4_debug("io 0x%p from inode %lu,prev 0x%p,next 0x%p\n", io_end, inode->i_ino, io_end0, io_end1); } #endif } /* Add the io_end to per-inode completed end_io list. */ static void ext4_add_complete_io(ext4_io_end_t *io_end) { struct ext4_inode_info *ei = EXT4_I(io_end->inode); struct ext4_sb_info *sbi = EXT4_SB(io_end->inode->i_sb); struct workqueue_struct *wq; unsigned long flags; /* Only reserved conversions from writeback should enter here */ WARN_ON(!(io_end->flag & EXT4_IO_END_UNWRITTEN)); WARN_ON(!io_end->handle && sbi->s_journal); spin_lock_irqsave(&ei->i_completed_io_lock, flags); wq = sbi->rsv_conversion_wq; if (list_empty(&ei->i_rsv_conversion_list)) queue_work(wq, &ei->i_rsv_conversion_work); list_add_tail(&io_end->list, &ei->i_rsv_conversion_list); spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); } static int ext4_do_flush_completed_IO(struct inode *inode, struct list_head *head) { ext4_io_end_t *io_end; struct list_head unwritten; unsigned long flags; struct ext4_inode_info *ei = EXT4_I(inode); int err, ret = 0; spin_lock_irqsave(&ei->i_completed_io_lock, flags); dump_completed_IO(inode, head); list_replace_init(head, &unwritten); spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); while (!list_empty(&unwritten)) { io_end = list_entry(unwritten.next, ext4_io_end_t, list); BUG_ON(!(io_end->flag & EXT4_IO_END_UNWRITTEN)); list_del_init(&io_end->list); err = ext4_end_io_end(io_end); if (unlikely(!ret && err)) ret = err; } return ret; } /* * work on completed IO, to convert unwritten extents to extents */ void ext4_end_io_rsv_work(struct work_struct *work) { struct ext4_inode_info *ei = container_of(work, struct ext4_inode_info, i_rsv_conversion_work); ext4_do_flush_completed_IO(&ei->vfs_inode, &ei->i_rsv_conversion_list); } ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags) { ext4_io_end_t *io_end = kmem_cache_zalloc(io_end_cachep, flags); if (io_end) { io_end->inode = inode; INIT_LIST_HEAD(&io_end->list); INIT_LIST_HEAD(&io_end->list_vec); refcount_set(&io_end->count, 1); } return io_end; } void ext4_put_io_end_defer(ext4_io_end_t *io_end) { if (refcount_dec_and_test(&io_end->count)) { if (!(io_end->flag & EXT4_IO_END_UNWRITTEN) || list_empty(&io_end->list_vec)) { ext4_release_io_end(io_end); return; } ext4_add_complete_io(io_end); } } int ext4_put_io_end(ext4_io_end_t *io_end) { int err = 0; if (refcount_dec_and_test(&io_end->count)) { if (io_end->flag & EXT4_IO_END_UNWRITTEN) { err = ext4_convert_unwritten_io_end_vec(io_end->handle, io_end); io_end->handle = NULL; ext4_clear_io_unwritten_flag(io_end); } ext4_release_io_end(io_end); } return err; } ext4_io_end_t *ext4_get_io_end(ext4_io_end_t *io_end) { refcount_inc(&io_end->count); return io_end; } /* BIO completion function for page writeback */ static void ext4_end_bio(struct bio *bio) { ext4_io_end_t *io_end = bio->bi_private; sector_t bi_sector = bio->bi_iter.bi_sector; if (WARN_ONCE(!io_end, "io_end is NULL: %pg: sector %Lu len %u err %d\n", bio->bi_bdev, (long long) bio->bi_iter.bi_sector, (unsigned) bio_sectors(bio), bio->bi_status)) { ext4_finish_bio(bio); bio_put(bio); return; } bio->bi_end_io = NULL; if (bio->bi_status) { struct inode *inode = io_end->inode; ext4_warning(inode->i_sb, "I/O error %d writing to inode %lu " "starting block %llu)", bio->bi_status, inode->i_ino, (unsigned long long) bi_sector >> (inode->i_blkbits - 9)); mapping_set_error(inode->i_mapping, blk_status_to_errno(bio->bi_status)); } if (io_end->flag & EXT4_IO_END_UNWRITTEN) { /* * Link bio into list hanging from io_end. We have to do it * atomically as bio completions can be racing against each * other. */ bio->bi_private = xchg(&io_end->bio, bio); ext4_put_io_end_defer(io_end); } else { /* * Drop io_end reference early. Inode can get freed once * we finish the bio. */ ext4_put_io_end_defer(io_end); ext4_finish_bio(bio); bio_put(bio); } } void ext4_io_submit(struct ext4_io_submit *io) { struct bio *bio = io->io_bio; if (bio) { if (io->io_wbc->sync_mode == WB_SYNC_ALL) io->io_bio->bi_opf |= REQ_SYNC; submit_bio(io->io_bio); } io->io_bio = NULL; } void ext4_io_submit_init(struct ext4_io_submit *io, struct writeback_control *wbc) { io->io_wbc = wbc; io->io_bio = NULL; io->io_end = NULL; } static void io_submit_init_bio(struct ext4_io_submit *io, struct buffer_head *bh) { struct bio *bio; /* * bio_alloc will _always_ be able to allocate a bio if * __GFP_DIRECT_RECLAIM is set, see comments for bio_alloc_bioset(). */ bio = bio_alloc(bh->b_bdev, BIO_MAX_VECS, REQ_OP_WRITE, GFP_NOIO); fscrypt_set_bio_crypt_ctx_bh(bio, bh, GFP_NOIO); bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9); bio->bi_end_io = ext4_end_bio; bio->bi_private = ext4_get_io_end(io->io_end); io->io_bio = bio; io->io_next_block = bh->b_blocknr; wbc_init_bio(io->io_wbc, bio); } static void io_submit_add_bh(struct ext4_io_submit *io, struct inode *inode, struct folio *folio, struct folio *io_folio, struct buffer_head *bh) { if (io->io_bio && (bh->b_blocknr != io->io_next_block || !fscrypt_mergeable_bio_bh(io->io_bio, bh))) { submit_and_retry: ext4_io_submit(io); } if (io->io_bio == NULL) io_submit_init_bio(io, bh); if (!bio_add_folio(io->io_bio, io_folio, bh->b_size, bh_offset(bh))) goto submit_and_retry; wbc_account_cgroup_owner(io->io_wbc, &folio->page, bh->b_size); io->io_next_block++; } int ext4_bio_write_folio(struct ext4_io_submit *io, struct folio *folio, size_t len) { struct folio *io_folio = folio; struct inode *inode = folio->mapping->host; unsigned block_start; struct buffer_head *bh, *head; int ret = 0; int nr_to_submit = 0; struct writeback_control *wbc = io->io_wbc; bool keep_towrite = false; BUG_ON(!folio_test_locked(folio)); BUG_ON(folio_test_writeback(folio)); folio_clear_error(folio); /* * Comments copied from block_write_full_folio: * * The folio straddles i_size. It must be zeroed out on each and every * writepage invocation because it may be mmapped. "A file is mapped * in multiples of the page size. For a file that is not a multiple of * the page size, the remaining memory is zeroed when mapped, and * writes to that region are not written out to the file." */ if (len < folio_size(folio)) folio_zero_segment(folio, len, folio_size(folio)); /* * In the first loop we prepare and mark buffers to submit. We have to * mark all buffers in the folio before submitting so that * folio_end_writeback() cannot be called from ext4_end_bio() when IO * on the first buffer finishes and we are still working on submitting * the second buffer. */ bh = head = folio_buffers(folio); do { block_start = bh_offset(bh); if (block_start >= len) { clear_buffer_dirty(bh); set_buffer_uptodate(bh); continue; } if (!buffer_dirty(bh) || buffer_delay(bh) || !buffer_mapped(bh) || buffer_unwritten(bh)) { /* A hole? We can safely clear the dirty bit */ if (!buffer_mapped(bh)) clear_buffer_dirty(bh); /* * Keeping dirty some buffer we cannot write? Make sure * to redirty the folio and keep TOWRITE tag so that * racing WB_SYNC_ALL writeback does not skip the folio. * This happens e.g. when doing writeout for * transaction commit or when journalled data is not * yet committed. */ if (buffer_dirty(bh) || (buffer_jbd(bh) && buffer_jbddirty(bh))) { if (!folio_test_dirty(folio)) folio_redirty_for_writepage(wbc, folio); keep_towrite = true; } continue; } if (buffer_new(bh)) clear_buffer_new(bh); set_buffer_async_write(bh); clear_buffer_dirty(bh); nr_to_submit++; } while ((bh = bh->b_this_page) != head); /* Nothing to submit? Just unlock the folio... */ if (!nr_to_submit) return 0; bh = head = folio_buffers(folio); /* * If any blocks are being written to an encrypted file, encrypt them * into a bounce page. For simplicity, just encrypt until the last * block which might be needed. This may cause some unneeded blocks * (e.g. holes) to be unnecessarily encrypted, but this is rare and * can't happen in the common case of blocksize == PAGE_SIZE. */ if (fscrypt_inode_uses_fs_layer_crypto(inode)) { gfp_t gfp_flags = GFP_NOFS; unsigned int enc_bytes = round_up(len, i_blocksize(inode)); struct page *bounce_page; /* * Since bounce page allocation uses a mempool, we can only use * a waiting mask (i.e. request guaranteed allocation) on the * first page of the bio. Otherwise it can deadlock. */ if (io->io_bio) gfp_flags = GFP_NOWAIT | __GFP_NOWARN; retry_encrypt: bounce_page = fscrypt_encrypt_pagecache_blocks(&folio->page, enc_bytes, 0, gfp_flags); if (IS_ERR(bounce_page)) { ret = PTR_ERR(bounce_page); if (ret == -ENOMEM && (io->io_bio || wbc->sync_mode == WB_SYNC_ALL)) { gfp_t new_gfp_flags = GFP_NOFS; if (io->io_bio) ext4_io_submit(io); else new_gfp_flags |= __GFP_NOFAIL; memalloc_retry_wait(gfp_flags); gfp_flags = new_gfp_flags; goto retry_encrypt; } printk_ratelimited(KERN_ERR "%s: ret = %d\n", __func__, ret); folio_redirty_for_writepage(wbc, folio); do { if (buffer_async_write(bh)) { clear_buffer_async_write(bh); set_buffer_dirty(bh); } bh = bh->b_this_page; } while (bh != head); return ret; } io_folio = page_folio(bounce_page); } __folio_start_writeback(folio, keep_towrite); /* Now submit buffers to write */ do { if (!buffer_async_write(bh)) continue; io_submit_add_bh(io, inode, folio, io_folio, bh); } while ((bh = bh->b_this_page) != head); return 0; } |
54 121 56 56 56 55 55 55 55 184 186 57 183 184 185 184 166 185 184 183 185 185 185 85 80 40 39 11 11 11 11 11 11 11 11 58 54 52 10 85 86 11 86 11 11 85 84 86 86 86 56 85 86 57 54 85 85 86 57 11 56 86 132 11 88 86 50 86 156 157 175 146 172 128 115 185 185 185 185 185 136 171 45 44 45 45 45 45 45 45 1 45 45 45 184 185 185 165 184 182 123 125 181 72 72 144 144 144 144 35 35 35 146 139 45 1 45 43 51 18 182 172 129 184 34 118 86 74 1 34 182 182 119 9 118 184 182 184 177 132 182 184 144 56 173 132 69 77 86 184 172 172 132 182 182 41 146 147 147 147 10 37 94 131 131 144 181 184 7 7 7 3 3 3 13 2 13 2 2 2 52 52 59 58 5 1 1 1 1 6 6 6 10 10 10 10 10 10 10 9 10 122 54 124 13 119 3 3 3 1 2 1 1 1 2 3 3 3 3 3 2 1 1 1 1 58 52 5 3 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 57 53 5 3 10 10 10 10 10 10 14 14 10 4 2 2 4 4 5 1 10 10 1 10 16 16 15 16 16 22 22 22 20 22 119 118 57 118 119 112 105 119 119 9 117 9 118 53 54 1 1 1 1 1 1 1 1 70 37 64 63 64 5 70 2 54 1 54 29 42 42 42 42 31 42 6 38 42 31 31 42 42 34 34 34 34 34 31 123 122 123 123 120 11 11 11 11 11 123 54 54 53 23 3 23 23 21 5 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 4716 4717 4718 4719 4720 4721 4722 4723 4724 4725 4726 4727 4728 4729 4730 4731 4732 4733 4734 4735 4736 4737 4738 4739 4740 4741 4742 4743 4744 4745 4746 4747 4748 4749 4750 4751 4752 4753 4754 4755 4756 4757 4758 4759 4760 4761 4762 4763 4764 4765 4766 4767 4768 4769 4770 4771 4772 4773 4774 4775 4776 4777 4778 4779 4780 4781 4782 4783 4784 4785 4786 4787 4788 4789 4790 4791 4792 4793 4794 4795 4796 4797 4798 4799 4800 4801 4802 4803 4804 4805 4806 4807 4808 4809 4810 4811 4812 4813 4814 4815 4816 4817 4818 4819 4820 4821 4822 4823 4824 4825 4826 4827 4828 4829 4830 4831 4832 4833 4834 4835 4836 4837 4838 4839 4840 4841 4842 4843 4844 4845 4846 4847 4848 4849 4850 4851 4852 4853 4854 4855 4856 4857 4858 4859 4860 4861 4862 4863 4864 4865 4866 4867 4868 4869 4870 4871 4872 4873 4874 4875 4876 4877 4878 4879 4880 4881 4882 4883 4884 4885 4886 4887 4888 4889 4890 4891 4892 4893 4894 4895 4896 4897 4898 4899 4900 4901 4902 4903 4904 4905 4906 4907 4908 4909 4910 4911 4912 4913 4914 4915 4916 4917 4918 4919 4920 4921 4922 4923 4924 4925 4926 4927 4928 4929 4930 4931 4932 4933 4934 4935 4936 4937 4938 4939 4940 4941 4942 4943 4944 4945 4946 4947 4948 4949 4950 4951 4952 4953 4954 4955 4956 4957 4958 4959 4960 4961 4962 4963 4964 4965 4966 4967 4968 4969 4970 4971 4972 4973 4974 4975 4976 4977 4978 4979 4980 4981 4982 4983 4984 4985 4986 4987 4988 4989 4990 4991 4992 4993 4994 4995 4996 4997 4998 4999 5000 5001 5002 5003 5004 5005 5006 5007 5008 5009 5010 5011 5012 5013 5014 5015 5016 5017 5018 5019 5020 5021 5022 5023 5024 5025 5026 5027 5028 5029 5030 5031 5032 5033 5034 5035 5036 5037 5038 5039 5040 5041 5042 5043 5044 5045 5046 5047 5048 5049 5050 5051 5052 5053 5054 5055 5056 5057 5058 5059 5060 5061 5062 5063 5064 5065 5066 5067 5068 5069 5070 5071 5072 5073 5074 5075 5076 5077 5078 5079 5080 5081 5082 5083 5084 5085 5086 5087 5088 5089 5090 5091 5092 5093 5094 5095 5096 | // SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2007,2008 Oracle. All rights reserved. */ #include <linux/sched.h> #include <linux/slab.h> #include <linux/rbtree.h> #include <linux/mm.h> #include <linux/error-injection.h> #include "messages.h" #include "ctree.h" #include "disk-io.h" #include "transaction.h" #include "print-tree.h" #include "locking.h" #include "volumes.h" #include "qgroup.h" #include "tree-mod-log.h" #include "tree-checker.h" #include "fs.h" #include "accessors.h" #include "extent-tree.h" #include "relocation.h" #include "file-item.h" static struct kmem_cache *btrfs_path_cachep; static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, int level); static int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root *root, const struct btrfs_key *ins_key, struct btrfs_path *path, int data_size, int extend); static int push_node_left(struct btrfs_trans_handle *trans, struct extent_buffer *dst, struct extent_buffer *src, int empty); static int balance_node_right(struct btrfs_trans_handle *trans, struct extent_buffer *dst_buf, struct extent_buffer *src_buf); static const struct btrfs_csums { u16 size; const char name[10]; const char driver[12]; } btrfs_csums[] = { [BTRFS_CSUM_TYPE_CRC32] = { .size = 4, .name = "crc32c" }, [BTRFS_CSUM_TYPE_XXHASH] = { .size = 8, .name = "xxhash64" }, [BTRFS_CSUM_TYPE_SHA256] = { .size = 32, .name = "sha256" }, [BTRFS_CSUM_TYPE_BLAKE2] = { .size = 32, .name = "blake2b", .driver = "blake2b-256" }, }; /* * The leaf data grows from end-to-front in the node. this returns the address * of the start of the last item, which is the stop of the leaf data stack. */ static unsigned int leaf_data_end(const struct extent_buffer *leaf) { u32 nr = btrfs_header_nritems(leaf); if (nr == 0) return BTRFS_LEAF_DATA_SIZE(leaf->fs_info); return btrfs_item_offset(leaf, nr - 1); } /* * Move data in a @leaf (using memmove, safe for overlapping ranges). * * @leaf: leaf that we're doing a memmove on * @dst_offset: item data offset we're moving to * @src_offset: item data offset were' moving from * @len: length of the data we're moving * * Wrapper around memmove_extent_buffer() that takes into account the header on * the leaf. The btrfs_item offset's start directly after the header, so we * have to adjust any offsets to account for the header in the leaf. This * handles that math to simplify the callers. */ static inline void memmove_leaf_data(const struct extent_buffer *leaf, unsigned long dst_offset, unsigned long src_offset, unsigned long len) { memmove_extent_buffer(leaf, btrfs_item_nr_offset(leaf, 0) + dst_offset, btrfs_item_nr_offset(leaf, 0) + src_offset, len); } /* * Copy item data from @src into @dst at the given @offset. * * @dst: destination leaf that we're copying into * @src: source leaf that we're copying from * @dst_offset: item data offset we're copying to * @src_offset: item data offset were' copying from * @len: length of the data we're copying * * Wrapper around copy_extent_buffer() that takes into account the header on * the leaf. The btrfs_item offset's start directly after the header, so we * have to adjust any offsets to account for the header in the leaf. This * handles that math to simplify the callers. */ static inline void copy_leaf_data(const struct extent_buffer *dst, const struct extent_buffer *src, unsigned long dst_offset, unsigned long src_offset, unsigned long len) { copy_extent_buffer(dst, src, btrfs_item_nr_offset(dst, 0) + dst_offset, btrfs_item_nr_offset(src, 0) + src_offset, len); } /* * Move items in a @leaf (using memmove). * * @dst: destination leaf for the items * @dst_item: the item nr we're copying into * @src_item: the item nr we're copying from * @nr_items: the number of items to copy * * Wrapper around memmove_extent_buffer() that does the math to get the * appropriate offsets into the leaf from the item numbers. */ static inline void memmove_leaf_items(const struct extent_buffer *leaf, int dst_item, int src_item, int nr_items) { memmove_extent_buffer(leaf, btrfs_item_nr_offset(leaf, dst_item), btrfs_item_nr_offset(leaf, src_item), nr_items * sizeof(struct btrfs_item)); } /* * Copy items from @src into @dst at the given @offset. * * @dst: destination leaf for the items * @src: source leaf for the items * @dst_item: the item nr we're copying into * @src_item: the item nr we're copying from * @nr_items: the number of items to copy * * Wrapper around copy_extent_buffer() that does the math to get the * appropriate offsets into the leaf from the item numbers. */ static inline void copy_leaf_items(const struct extent_buffer *dst, const struct extent_buffer *src, int dst_item, int src_item, int nr_items) { copy_extent_buffer(dst, src, btrfs_item_nr_offset(dst, dst_item), btrfs_item_nr_offset(src, src_item), nr_items * sizeof(struct btrfs_item)); } /* This exists for btrfs-progs usages. */ u16 btrfs_csum_type_size(u16 type) { return btrfs_csums[type].size; } int btrfs_super_csum_size(const struct btrfs_super_block *s) { u16 t = btrfs_super_csum_type(s); /* * csum type is validated at mount time */ return btrfs_csum_type_size(t); } const char *btrfs_super_csum_name(u16 csum_type) { /* csum type is validated at mount time */ return btrfs_csums[csum_type].name; } /* * Return driver name if defined, otherwise the name that's also a valid driver * name */ const char *btrfs_super_csum_driver(u16 csum_type) { /* csum type is validated at mount time */ return btrfs_csums[csum_type].driver[0] ? btrfs_csums[csum_type].driver : btrfs_csums[csum_type].name; } size_t __attribute_const__ btrfs_get_num_csums(void) { return ARRAY_SIZE(btrfs_csums); } struct btrfs_path *btrfs_alloc_path(void) { might_sleep(); return kmem_cache_zalloc(btrfs_path_cachep, GFP_NOFS); } /* this also releases the path */ void btrfs_free_path(struct btrfs_path *p) { if (!p) return; btrfs_release_path(p); kmem_cache_free(btrfs_path_cachep, p); } /* * path release drops references on the extent buffers in the path * and it drops any locks held by this path * * It is safe to call this on paths that no locks or extent buffers held. */ noinline void btrfs_release_path(struct btrfs_path *p) { int i; for (i = 0; i < BTRFS_MAX_LEVEL; i++) { p->slots[i] = 0; if (!p->nodes[i]) continue; if (p->locks[i]) { btrfs_tree_unlock_rw(p->nodes[i], p->locks[i]); p->locks[i] = 0; } free_extent_buffer(p->nodes[i]); p->nodes[i] = NULL; } } /* * We want the transaction abort to print stack trace only for errors where the * cause could be a bug, eg. due to ENOSPC, and not for common errors that are * caused by external factors. */ bool __cold abort_should_print_stack(int error) { switch (error) { case -EIO: case -EROFS: case -ENOMEM: return false; } return true; } /* * safely gets a reference on the root node of a tree. A lock * is not taken, so a concurrent writer may put a different node * at the root of the tree. See btrfs_lock_root_node for the * looping required. * * The extent buffer returned by this has a reference taken, so * it won't disappear. It may stop being the root of the tree * at any time because there are no locks held. */ struct extent_buffer *btrfs_root_node(struct btrfs_root *root) { struct extent_buffer *eb; while (1) { rcu_read_lock(); eb = rcu_dereference(root->node); /* * RCU really hurts here, we could free up the root node because * it was COWed but we may not get the new root node yet so do * the inc_not_zero dance and if it doesn't work then * synchronize_rcu and try again. */ if (atomic_inc_not_zero(&eb->refs)) { rcu_read_unlock(); break; } rcu_read_unlock(); synchronize_rcu(); } return eb; } /* * Cowonly root (not-shareable trees, everything not subvolume or reloc roots), * just get put onto a simple dirty list. Transaction walks this list to make * sure they get properly updated on disk. */ static void add_root_to_dirty_list(struct btrfs_root *root) { struct btrfs_fs_info *fs_info = root->fs_info; if (test_bit(BTRFS_ROOT_DIRTY, &root->state) || !test_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state)) return; spin_lock(&fs_info->trans_lock); if (!test_and_set_bit(BTRFS_ROOT_DIRTY, &root->state)) { /* Want the extent tree to be the last on the list */ if (root->root_key.objectid == BTRFS_EXTENT_TREE_OBJECTID) list_move_tail(&root->dirty_list, &fs_info->dirty_cowonly_roots); else list_move(&root->dirty_list, &fs_info->dirty_cowonly_roots); } spin_unlock(&fs_info->trans_lock); } /* * used by snapshot creation to make a copy of a root for a tree with * a given objectid. The buffer with the new root node is returned in * cow_ret, and this func returns zero on success or a negative error code. */ int btrfs_copy_root(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf, struct extent_buffer **cow_ret, u64 new_root_objectid) { struct btrfs_fs_info *fs_info = root->fs_info; struct extent_buffer *cow; int ret = 0; int level; struct btrfs_disk_key disk_key; u64 reloc_src_root = 0; WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) && trans->transid != fs_info->running_transaction->transid); WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) && trans->transid != root->last_trans); level = btrfs_header_level(buf); if (level == 0) btrfs_item_key(buf, &disk_key, 0); else btrfs_node_key(buf, &disk_key, 0); if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID) reloc_src_root = btrfs_header_owner(buf); cow = btrfs_alloc_tree_block(trans, root, 0, new_root_objectid, &disk_key, level, buf->start, 0, reloc_src_root, BTRFS_NESTING_NEW_ROOT); if (IS_ERR(cow)) return PTR_ERR(cow); copy_extent_buffer_full(cow, buf); btrfs_set_header_bytenr(cow, cow->start); btrfs_set_header_generation(cow, trans->transid); btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV); btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN | BTRFS_HEADER_FLAG_RELOC); if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID) btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC); else btrfs_set_header_owner(cow, new_root_objectid); write_extent_buffer_fsid(cow, fs_info->fs_devices->metadata_uuid); WARN_ON(btrfs_header_generation(buf) > trans->transid); if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID) ret = btrfs_inc_ref(trans, root, cow, 1); else ret = btrfs_inc_ref(trans, root, cow, 0); if (ret) { btrfs_tree_unlock(cow); free_extent_buffer(cow); btrfs_abort_transaction(trans, ret); return ret; } btrfs_mark_buffer_dirty(trans, cow); *cow_ret = cow; return 0; } /* * check if the tree block can be shared by multiple trees */ bool btrfs_block_can_be_shared(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf) { const u64 buf_gen = btrfs_header_generation(buf); /* * Tree blocks not in shareable trees and tree roots are never shared. * If a block was allocated after the last snapshot and the block was * not allocated by tree relocation, we know the block is not shared. */ if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) return false; if (buf == root->node) return false; if (buf_gen > btrfs_root_last_snapshot(&root->root_item) && !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)) return false; if (buf != root->commit_root) return true; /* * An extent buffer that used to be the commit root may still be shared * because the tree height may have increased and it became a child of a * higher level root. This can happen when snapshotting a subvolume * created in the current transaction. */ if (buf_gen == trans->transid) return true; return false; } static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf, struct extent_buffer *cow, int *last_ref) { struct btrfs_fs_info *fs_info = root->fs_info; u64 refs; u64 owner; u64 flags; u64 new_flags = 0; int ret; /* * Backrefs update rules: * * Always use full backrefs for extent pointers in tree block * allocated by tree relocation. * * If a shared tree block is no longer referenced by its owner * tree (btrfs_header_owner(buf) == root->root_key.objectid), * use full backrefs for extent pointers in tree block. * * If a tree block is been relocating * (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID), * use full backrefs for extent pointers in tree block. * The reason for this is some operations (such as drop tree) * are only allowed for blocks use full backrefs. */ if (btrfs_block_can_be_shared(trans, root, buf)) { ret = btrfs_lookup_extent_info(trans, fs_info, buf->start, btrfs_header_level(buf), 1, &refs, &flags, NULL); if (ret) return ret; if (unlikely(refs == 0)) { btrfs_crit(fs_info, "found 0 references for tree block at bytenr %llu level %d root %llu", buf->start, btrfs_header_level(buf), btrfs_root_id(root)); ret = -EUCLEAN; btrfs_abort_transaction(trans, ret); return ret; } } else { refs = 1; if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID || btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV) flags = BTRFS_BLOCK_FLAG_FULL_BACKREF; else flags = 0; } owner = btrfs_header_owner(buf); BUG_ON(owner == BTRFS_TREE_RELOC_OBJECTID && !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)); if (refs > 1) { if ((owner == root->root_key.objectid || root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) && !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) { ret = btrfs_inc_ref(trans, root, buf, 1); if (ret) return ret; if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) { ret = btrfs_dec_ref(trans, root, buf, 0); if (ret) return ret; ret = btrfs_inc_ref(trans, root, cow, 1); if (ret) return ret; } new_flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF; } else { if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) ret = btrfs_inc_ref(trans, root, cow, 1); else ret = btrfs_inc_ref(trans, root, cow, 0); if (ret) return ret; } if (new_flags != 0) { ret = btrfs_set_disk_extent_flags(trans, buf, new_flags); if (ret) return ret; } } else { if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF) { if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) ret = btrfs_inc_ref(trans, root, cow, 1); else ret = btrfs_inc_ref(trans, root, cow, 0); if (ret) return ret; ret = btrfs_dec_ref(trans, root, buf, 1); if (ret) return ret; } btrfs_clear_buffer_dirty(trans, buf); *last_ref = 1; } return 0; } /* * does the dirty work in cow of a single block. The parent block (if * supplied) is updated to point to the new cow copy. The new buffer is marked * dirty and returned locked. If you modify the block it needs to be marked * dirty again. * * search_start -- an allocation hint for the new block * * empty_size -- a hint that you plan on doing more cow. This is the size in * bytes the allocator should try to find free next to the block it returns. * This is just a hint and may be ignored by the allocator. */ int btrfs_force_cow_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf, struct extent_buffer *parent, int parent_slot, struct extent_buffer **cow_ret, u64 search_start, u64 empty_size, enum btrfs_lock_nesting nest) { struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_disk_key disk_key; struct extent_buffer *cow; int level, ret; int last_ref = 0; int unlock_orig = 0; u64 parent_start = 0; u64 reloc_src_root = 0; if (*cow_ret == buf) unlock_orig = 1; btrfs_assert_tree_write_locked(buf); WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) && trans->transid != fs_info->running_transaction->transid); WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) && trans->transid != root->last_trans); level = btrfs_header_level(buf); if (level == 0) btrfs_item_key(buf, &disk_key, 0); else btrfs_node_key(buf, &disk_key, 0); if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) { if (parent) parent_start = parent->start; reloc_src_root = btrfs_header_owner(buf); } cow = btrfs_alloc_tree_block(trans, root, parent_start, root->root_key.objectid, &disk_key, level, search_start, empty_size, reloc_src_root, nest); if (IS_ERR(cow)) return PTR_ERR(cow); /* cow is set to blocking by btrfs_init_new_buffer */ copy_extent_buffer_full(cow, buf); btrfs_set_header_bytenr(cow, cow->start); btrfs_set_header_generation(cow, trans->transid); btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV); btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN | BTRFS_HEADER_FLAG_RELOC); if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC); else btrfs_set_header_owner(cow, root->root_key.objectid); write_extent_buffer_fsid(cow, fs_info->fs_devices->metadata_uuid); ret = update_ref_for_cow(trans, root, buf, cow, &last_ref); if (ret) { btrfs_tree_unlock(cow); free_extent_buffer(cow); btrfs_abort_transaction(trans, ret); return ret; } if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) { ret = btrfs_reloc_cow_block(trans, root, buf, cow); if (ret) { btrfs_tree_unlock(cow); free_extent_buffer(cow); btrfs_abort_transaction(trans, ret); return ret; } } if (buf == root->node) { WARN_ON(parent && parent != buf); if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID || btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV) parent_start = buf->start; ret = btrfs_tree_mod_log_insert_root(root->node, cow, true); if (ret < 0) { btrfs_tree_unlock(cow); free_extent_buffer(cow); btrfs_abort_transaction(trans, ret); return ret; } atomic_inc(&cow->refs); rcu_assign_pointer(root->node, cow); btrfs_free_tree_block(trans, btrfs_root_id(root), buf, parent_start, last_ref); free_extent_buffer(buf); add_root_to_dirty_list(root); } else { WARN_ON(trans->transid != btrfs_header_generation(parent)); ret = btrfs_tree_mod_log_insert_key(parent, parent_slot, BTRFS_MOD_LOG_KEY_REPLACE); if (ret) { btrfs_tree_unlock(cow); free_extent_buffer(cow); btrfs_abort_transaction(trans, ret); return ret; } btrfs_set_node_blockptr(parent, parent_slot, cow->start); btrfs_set_node_ptr_generation(parent, parent_slot, trans->transid); btrfs_mark_buffer_dirty(trans, parent); if (last_ref) { ret = btrfs_tree_mod_log_free_eb(buf); if (ret) { btrfs_tree_unlock(cow); free_extent_buffer(cow); btrfs_abort_transaction(trans, ret); return ret; } } btrfs_free_tree_block(trans, btrfs_root_id(root), buf, parent_start, last_ref); } if (unlock_orig) btrfs_tree_unlock(buf); free_extent_buffer_stale(buf); btrfs_mark_buffer_dirty(trans, cow); *cow_ret = cow; return 0; } static inline int should_cow_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf) { if (btrfs_is_testing(root->fs_info)) return 0; /* Ensure we can see the FORCE_COW bit */ smp_mb__before_atomic(); /* * We do not need to cow a block if * 1) this block is not created or changed in this transaction; * 2) this block does not belong to TREE_RELOC tree; * 3) the root is not forced COW. * * What is forced COW: * when we create snapshot during committing the transaction, * after we've finished copying src root, we must COW the shared * block to ensure the metadata consistency. */ if (btrfs_header_generation(buf) == trans->transid && !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN) && !(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID && btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)) && !test_bit(BTRFS_ROOT_FORCE_COW, &root->state)) return 0; return 1; } /* * COWs a single block, see btrfs_force_cow_block() for the real work. * This version of it has extra checks so that a block isn't COWed more than * once per transaction, as long as it hasn't been written yet */ int btrfs_cow_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf, struct extent_buffer *parent, int parent_slot, struct extent_buffer **cow_ret, enum btrfs_lock_nesting nest) { struct btrfs_fs_info *fs_info = root->fs_info; u64 search_start; int ret; if (unlikely(test_bit(BTRFS_ROOT_DELETING, &root->state))) { btrfs_abort_transaction(trans, -EUCLEAN); btrfs_crit(fs_info, "attempt to COW block %llu on root %llu that is being deleted", buf->start, btrfs_root_id(root)); return -EUCLEAN; } /* * COWing must happen through a running transaction, which always * matches the current fs generation (it's a transaction with a state * less than TRANS_STATE_UNBLOCKED). If it doesn't, then turn the fs * into error state to prevent the commit of any transaction. */ if (unlikely(trans->transaction != fs_info->running_transaction || trans->transid != fs_info->generation)) { btrfs_abort_transaction(trans, -EUCLEAN); btrfs_crit(fs_info, "unexpected transaction when attempting to COW block %llu on root %llu, transaction %llu running transaction %llu fs generation %llu", buf->start, btrfs_root_id(root), trans->transid, fs_info->running_transaction->transid, fs_info->generation); return -EUCLEAN; } if (!should_cow_block(trans, root, buf)) { *cow_ret = buf; return 0; } search_start = round_down(buf->start, SZ_1G); /* * Before CoWing this block for later modification, check if it's * the subtree root and do the delayed subtree trace if needed. * * Also We don't care about the error, as it's handled internally. */ btrfs_qgroup_trace_subtree_after_cow(trans, root, buf); ret = btrfs_force_cow_block(trans, root, buf, parent, parent_slot, cow_ret, search_start, 0, nest); trace_btrfs_cow_block(root, buf, *cow_ret); return ret; } ALLOW_ERROR_INJECTION(btrfs_cow_block, ERRNO); /* * same as comp_keys only with two btrfs_key's */ int __pure btrfs_comp_cpu_keys(const struct btrfs_key *k1, const struct btrfs_key *k2) { if (k1->objectid > k2->objectid) return 1; if (k1->objectid < k2->objectid) return -1; if (k1->type > k2->type) return 1; if (k1->type < k2->type) return -1; if (k1->offset > k2->offset) return 1; if (k1->offset < k2->offset) return -1; return 0; } /* * Search for a key in the given extent_buffer. * * The lower boundary for the search is specified by the slot number @first_slot. * Use a value of 0 to search over the whole extent buffer. Works for both * leaves and nodes. * * The slot in the extent buffer is returned via @slot. If the key exists in the * extent buffer, then @slot will point to the slot where the key is, otherwise * it points to the slot where you would insert the key. * * Slot may point to the total number of items (i.e. one position beyond the last * key) if the key is bigger than the last key in the extent buffer. */ int btrfs_bin_search(struct extent_buffer *eb, int first_slot, const struct btrfs_key *key, int *slot) { unsigned long p; int item_size; /* * Use unsigned types for the low and high slots, so that we get a more * efficient division in the search loop below. */ u32 low = first_slot; u32 high = btrfs_header_nritems(eb); int ret; const int key_size = sizeof(struct btrfs_disk_key); if (unlikely(low > high)) { btrfs_err(eb->fs_info, "%s: low (%u) > high (%u) eb %llu owner %llu level %d", __func__, low, high, eb->start, btrfs_header_owner(eb), btrfs_header_level(eb)); return -EINVAL; } if (btrfs_header_level(eb) == 0) { p = offsetof(struct btrfs_leaf, items); item_size = sizeof(struct btrfs_item); } else { p = offsetof(struct btrfs_node, ptrs); item_size = sizeof(struct btrfs_key_ptr); } while (low < high) { const int unit_size = folio_size(eb->folios[0]); unsigned long oil; unsigned long offset; struct btrfs_disk_key *tmp; struct btrfs_disk_key unaligned; int mid; mid = (low + high) / 2; offset = p + mid * item_size; oil = get_eb_offset_in_folio(eb, offset); if (oil + key_size <= unit_size) { const unsigned long idx = get_eb_folio_index(eb, offset); char *kaddr = folio_address(eb->folios[idx]); oil = get_eb_offset_in_folio(eb, offset); tmp = (struct btrfs_disk_key *)(kaddr + oil); } else { read_extent_buffer(eb, &unaligned, offset, key_size); tmp = &unaligned; } ret = btrfs_comp_keys(tmp, key); if (ret < 0) low = mid + 1; else if (ret > 0) high = mid; else { *slot = mid; return 0; } } *slot = low; return 1; } static void root_add_used_bytes(struct btrfs_root *root) { spin_lock(&root->accounting_lock); btrfs_set_root_used(&root->root_item, btrfs_root_used(&root->root_item) + root->fs_info->nodesize); spin_unlock(&root->accounting_lock); } static void root_sub_used_bytes(struct btrfs_root *root) { spin_lock(&root->accounting_lock); btrfs_set_root_used(&root->root_item, btrfs_root_used(&root->root_item) - root->fs_info->nodesize); spin_unlock(&root->accounting_lock); } /* given a node and slot number, this reads the blocks it points to. The * extent buffer is returned with a reference taken (but unlocked). */ struct extent_buffer *btrfs_read_node_slot(struct extent_buffer *parent, int slot) { int level = btrfs_header_level(parent); struct btrfs_tree_parent_check check = { 0 }; struct extent_buffer *eb; if (slot < 0 || slot >= btrfs_header_nritems(parent)) return ERR_PTR(-ENOENT); ASSERT(level); check.level = level - 1; check.transid = btrfs_node_ptr_generation(parent, slot); check.owner_root = btrfs_header_owner(parent); check.has_first_key = true; btrfs_node_key_to_cpu(parent, &check.first_key, slot); eb = read_tree_block(parent->fs_info, btrfs_node_blockptr(parent, slot), &check); if (IS_ERR(eb)) return eb; if (!extent_buffer_uptodate(eb)) { free_extent_buffer(eb); return ERR_PTR(-EIO); } return eb; } /* * node level balancing, used to make sure nodes are in proper order for * item deletion. We balance from the top down, so we have to make sure * that a deletion won't leave an node completely empty later on. */ static noinline int balance_level(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, int level) { struct btrfs_fs_info *fs_info = root->fs_info; struct extent_buffer *right = NULL; struct extent_buffer *mid; struct extent_buffer *left = NULL; struct extent_buffer *parent = NULL; int ret = 0; int wret; int pslot; int orig_slot = path->slots[level]; u64 orig_ptr; ASSERT(level > 0); mid = path->nodes[level]; WARN_ON(path->locks[level] != BTRFS_WRITE_LOCK); WARN_ON(btrfs_header_generation(mid) != trans->transid); orig_ptr = btrfs_node_blockptr(mid, orig_slot); if (level < BTRFS_MAX_LEVEL - 1) { parent = path->nodes[level + 1]; pslot = path->slots[level + 1]; } /* * deal with the case where there is only one pointer in the root * by promoting the node below to a root */ if (!parent) { struct extent_buffer *child; if (btrfs_header_nritems(mid) != 1) return 0; /* promote the child to a root */ child = btrfs_read_node_slot(mid, 0); if (IS_ERR(child)) { ret = PTR_ERR(child); goto out; } btrfs_tree_lock(child); ret = btrfs_cow_block(trans, root, child, mid, 0, &child, BTRFS_NESTING_COW); if (ret) { btrfs_tree_unlock(child); free_extent_buffer(child); goto out; } ret = btrfs_tree_mod_log_insert_root(root->node, child, true); if (ret < 0) { btrfs_tree_unlock(child); free_extent_buffer(child); btrfs_abort_transaction(trans, ret); goto out; } rcu_assign_pointer(root->node, child); add_root_to_dirty_list(root); btrfs_tree_unlock(child); path->locks[level] = 0; path->nodes[level] = NULL; btrfs_clear_buffer_dirty(trans, mid); btrfs_tree_unlock(mid); /* once for the path */ free_extent_buffer(mid); root_sub_used_bytes(root); btrfs_free_tree_block(trans, btrfs_root_id(root), mid, 0, 1); /* once for the root ptr */ free_extent_buffer_stale(mid); return 0; } if (btrfs_header_nritems(mid) > BTRFS_NODEPTRS_PER_BLOCK(fs_info) / 4) return 0; if (pslot) { left = btrfs_read_node_slot(parent, pslot - 1); if (IS_ERR(left)) { ret = PTR_ERR(left); left = NULL; goto out; } __btrfs_tree_lock(left, BTRFS_NESTING_LEFT); wret = btrfs_cow_block(trans, root, left, parent, pslot - 1, &left, BTRFS_NESTING_LEFT_COW); if (wret) { ret = wret; goto out; } } if (pslot + 1 < btrfs_header_nritems(parent)) { right = btrfs_read_node_slot(parent, pslot + 1); if (IS_ERR(right)) { ret = PTR_ERR(right); right = NULL; goto out; } __btrfs_tree_lock(right, BTRFS_NESTING_RIGHT); wret = btrfs_cow_block(trans, root, right, parent, pslot + 1, &right, BTRFS_NESTING_RIGHT_COW); if (wret) { ret = wret; goto out; } } /* first, try to make some room in the middle buffer */ if (left) { orig_slot += btrfs_header_nritems(left); wret = push_node_left(trans, left, mid, 1); if (wret < 0) ret = wret; } /* * then try to empty the right most buffer into the middle */ if (right) { wret = push_node_left(trans, mid, right, 1); if (wret < 0 && wret != -ENOSPC) ret = wret; if (btrfs_header_nritems(right) == 0) { btrfs_clear_buffer_dirty(trans, right); btrfs_tree_unlock(right); ret = btrfs_del_ptr(trans, root, path, level + 1, pslot + 1); if (ret < 0) { free_extent_buffer_stale(right); right = NULL; goto out; } root_sub_used_bytes(root); btrfs_free_tree_block(trans, btrfs_root_id(root), right, 0, 1); free_extent_buffer_stale(right); right = NULL; } else { struct btrfs_disk_key right_key; btrfs_node_key(right, &right_key, 0); ret = btrfs_tree_mod_log_insert_key(parent, pslot + 1, BTRFS_MOD_LOG_KEY_REPLACE); if (ret < 0) { btrfs_abort_transaction(trans, ret); goto out; } btrfs_set_node_key(parent, &right_key, pslot + 1); btrfs_mark_buffer_dirty(trans, parent); } } if (btrfs_header_nritems(mid) == 1) { /* * we're not allowed to leave a node with one item in the * tree during a delete. A deletion from lower in the tree * could try to delete the only pointer in this node. * So, pull some keys from the left. * There has to be a left pointer at this point because * otherwise we would have pulled some pointers from the * right */ if (unlikely(!left)) { btrfs_crit(fs_info, "missing left child when middle child only has 1 item, parent bytenr %llu level %d mid bytenr %llu root %llu", parent->start, btrfs_header_level(parent), mid->start, btrfs_root_id(root)); ret = -EUCLEAN; btrfs_abort_transaction(trans, ret); goto out; } wret = balance_node_right(trans, mid, left); if (wret < 0) { ret = wret; goto out; } if (wret == 1) { wret = push_node_left(trans, left, mid, 1); if (wret < 0) ret = wret; } BUG_ON(wret == 1); } if (btrfs_header_nritems(mid) == 0) { btrfs_clear_buffer_dirty(trans, mid); btrfs_tree_unlock(mid); ret = btrfs_del_ptr(trans, root, path, level + 1, pslot); if (ret < 0) { free_extent_buffer_stale(mid); mid = NULL; goto out; } root_sub_used_bytes(root); btrfs_free_tree_block(trans, btrfs_root_id(root), mid, 0, 1); free_extent_buffer_stale(mid); mid = NULL; } else { /* update the parent key to reflect our changes */ struct btrfs_disk_key mid_key; btrfs_node_key(mid, &mid_key, 0); ret = btrfs_tree_mod_log_insert_key(parent, pslot, BTRFS_MOD_LOG_KEY_REPLACE); if (ret < 0) { btrfs_abort_transaction(trans, ret); goto out; } btrfs_set_node_key(parent, &mid_key, pslot); btrfs_mark_buffer_dirty(trans, parent); } /* update the path */ if (left) { if (btrfs_header_nritems(left) > orig_slot) { atomic_inc(&left->refs); /* left was locked after cow */ path->nodes[level] = left; path->slots[level + 1] -= 1; path->slots[level] = orig_slot; if (mid) { btrfs_tree_unlock(mid); free_extent_buffer(mid); } } else { orig_slot -= btrfs_header_nritems(left); path->slots[level] = orig_slot; } } /* double check we haven't messed things up */ if (orig_ptr != btrfs_node_blockptr(path->nodes[level], path->slots[level])) BUG(); out: if (right) { btrfs_tree_unlock(right); free_extent_buffer(right); } if (left) { if (path->nodes[level] != left) btrfs_tree_unlock(left); free_extent_buffer(left); } return ret; } /* Node balancing for insertion. Here we only split or push nodes around * when they are completely full. This is also done top down, so we * have to be pessimistic. */ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, int level) { struct btrfs_fs_info *fs_info = root->fs_info; struct extent_buffer *right = NULL; struct extent_buffer *mid; struct extent_buffer *left = NULL; struct extent_buffer *parent = NULL; int ret = 0; int wret; int pslot; int orig_slot = path->slots[level]; if (level == 0) return 1; mid = path->nodes[level]; WARN_ON(btrfs_header_generation(mid) != trans->transid); if (level < BTRFS_MAX_LEVEL - 1) { parent = path->nodes[level + 1]; pslot = path->slots[level + 1]; } if (!parent) return 1; /* first, try to make some room in the middle buffer */ if (pslot) { u32 left_nr; left = btrfs_read_node_slot(parent, pslot - 1); if (IS_ERR(left)) return PTR_ERR(left); __btrfs_tree_lock(left, BTRFS_NESTING_LEFT); left_nr = btrfs_header_nritems(left); if (left_nr >= BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 1) { wret = 1; } else { ret = btrfs_cow_block(trans, root, left, parent, pslot - 1, &left, BTRFS_NESTING_LEFT_COW); if (ret) wret = 1; else { wret = push_node_left(trans, left, mid, 0); } } if (wret < 0) ret = wret; if (wret == 0) { struct btrfs_disk_key disk_key; orig_slot += left_nr; btrfs_node_key(mid, &disk_key, 0); ret = btrfs_tree_mod_log_insert_key(parent, pslot, BTRFS_MOD_LOG_KEY_REPLACE); if (ret < 0) { btrfs_tree_unlock(left); free_extent_buffer(left); btrfs_abort_transaction(trans, ret); return ret; } btrfs_set_node_key(parent, &disk_key, pslot); btrfs_mark_buffer_dirty(trans, parent); if (btrfs_header_nritems(left) > orig_slot) { path->nodes[level] = left; path->slots[level + 1] -= 1; path->slots[level] = orig_slot; btrfs_tree_unlock(mid); free_extent_buffer(mid); } else { orig_slot -= btrfs_header_nritems(left); path->slots[level] = orig_slot; btrfs_tree_unlock(left); free_extent_buffer(left); } return 0; } btrfs_tree_unlock(left); free_extent_buffer(left); } /* * then try to empty the right most buffer into the middle */ if (pslot + 1 < btrfs_header_nritems(parent)) { u32 right_nr; right = btrfs_read_node_slot(parent, pslot + 1); if (IS_ERR(right)) return PTR_ERR(right); __btrfs_tree_lock(right, BTRFS_NESTING_RIGHT); right_nr = btrfs_header_nritems(right); if (right_nr >= BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 1) { wret = 1; } else { ret = btrfs_cow_block(trans, root, right, parent, pslot + 1, &right, BTRFS_NESTING_RIGHT_COW); if (ret) wret = 1; else { wret = balance_node_right(trans, right, mid); } } if (wret < 0) ret = wret; if (wret == 0) { struct btrfs_disk_key disk_key; btrfs_node_key(right, &disk_key, 0); ret = btrfs_tree_mod_log_insert_key(parent, pslot + 1, BTRFS_MOD_LOG_KEY_REPLACE); if (ret < 0) { btrfs_tree_unlock(right); free_extent_buffer(right); btrfs_abort_transaction(trans, ret); return ret; } btrfs_set_node_key(parent, &disk_key, pslot + 1); btrfs_mark_buffer_dirty(trans, parent); if (btrfs_header_nritems(mid) <= orig_slot) { path->nodes[level] = right; path->slots[level + 1] += 1; path->slots[level] = orig_slot - btrfs_header_nritems(mid); btrfs_tree_unlock(mid); free_extent_buffer(mid); } else { btrfs_tree_unlock(right); free_extent_buffer(right); } return 0; } btrfs_tree_unlock(right); free_extent_buffer(right); } return 1; } /* * readahead one full node of leaves, finding things that are close * to the block in 'slot', and triggering ra on them. */ static void reada_for_search(struct btrfs_fs_info *fs_info, struct btrfs_path *path, int level, int slot, u64 objectid) { struct extent_buffer *node; struct btrfs_disk_key disk_key; u32 nritems; u64 search; u64 target; u64 nread = 0; u64 nread_max; u32 nr; u32 blocksize; u32 nscan = 0; if (level != 1 && path->reada != READA_FORWARD_ALWAYS) return; if (!path->nodes[level]) return; node = path->nodes[level]; /* * Since the time between visiting leaves is much shorter than the time * between visiting nodes, limit read ahead of nodes to 1, to avoid too * much IO at once (possibly random). */ if (path->reada == READA_FORWARD_ALWAYS) { if (level > 1) nread_max = node->fs_info->nodesize; else nread_max = SZ_128K; } else { nread_max = SZ_64K; } search = btrfs_node_blockptr(node, slot); blocksize = fs_info->nodesize; if (path->reada != READA_FORWARD_ALWAYS) { struct extent_buffer *eb; eb = find_extent_buffer(fs_info, search); if (eb) { free_extent_buffer(eb); return; } } target = search; nritems = btrfs_header_nritems(node); nr = slot; while (1) { if (path->reada == READA_BACK) { if (nr == 0) break; nr--; } else if (path->reada == READA_FORWARD || path->reada == READA_FORWARD_ALWAYS) { nr++; if (nr >= nritems) break; } if (path->reada == READA_BACK && objectid) { btrfs_node_key(node, &disk_key, nr); if (btrfs_disk_key_objectid(&disk_key) != objectid) break; } search = btrfs_node_blockptr(node, nr); if (path->reada == READA_FORWARD_ALWAYS || (search <= target && target - search <= 65536) || (search > target && search - target <= 65536)) { btrfs_readahead_node_child(node, nr); nread += blocksize; } nscan++; if (nread > nread_max || nscan > 32) break; } } static noinline void reada_for_balance(struct btrfs_path *path, int level) { struct extent_buffer *parent; int slot; int nritems; parent = path->nodes[level + 1]; if (!parent) return; nritems = btrfs_header_nritems(parent); slot = path->slots[level + 1]; if (slot > 0) btrfs_readahead_node_child(parent, slot - 1); if (slot + 1 < nritems) btrfs_readahead_node_child(parent, slot + 1); } /* * when we walk down the tree, it is usually safe to unlock the higher layers * in the tree. The exceptions are when our path goes through slot 0, because * operations on the tree might require changing key pointers higher up in the * tree. * * callers might also have set path->keep_locks, which tells this code to keep * the lock if the path points to the last slot in the block. This is part of * walking through the tree, and selecting the next slot in the higher block. * * lowest_unlock sets the lowest level in the tree we're allowed to unlock. so * if lowest_unlock is 1, level 0 won't be unlocked */ static noinline void unlock_up(struct btrfs_path *path, int level, int lowest_unlock, int min_write_lock_level, int *write_lock_level) { int i; int skip_level = level; bool check_skip = true; for (i = level; i < BTRFS_MAX_LEVEL; i++) { if (!path->nodes[i]) break; if (!path->locks[i]) break; if (check_skip) { if (path->slots[i] == 0) { skip_level = i + 1; continue; } if (path->keep_locks) { u32 nritems; nritems = btrfs_header_nritems(path->nodes[i]); if (nritems < 1 || path->slots[i] >= nritems - 1) { skip_level = i + 1; continue; } } } if (i >= lowest_unlock && i > skip_level) { check_skip = false; btrfs_tree_unlock_rw(path->nodes[i], path->locks[i]); path->locks[i] = 0; if (write_lock_level && i > min_write_lock_level && i <= *write_lock_level) { *write_lock_level = i - 1; } } } } /* * Helper function for btrfs_search_slot() and other functions that do a search * on a btree. The goal is to find a tree block in the cache (the radix tree at * fs_info->buffer_radix), but if we can't find it, or it's not up to date, read * its pages from disk. * * Returns -EAGAIN, with the path unlocked, if the caller needs to repeat the * whole btree search, starting again from the current root node. */ static int read_block_for_search(struct btrfs_root *root, struct btrfs_path *p, struct extent_buffer **eb_ret, int level, int slot, const struct btrfs_key *key) { struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_tree_parent_check check = { 0 }; u64 blocknr; u64 gen; struct extent_buffer *tmp; int ret; int parent_level; bool unlock_up; unlock_up = ((level + 1 < BTRFS_MAX_LEVEL) && p->locks[level + 1]); blocknr = btrfs_node_blockptr(*eb_ret, slot); gen = btrfs_node_ptr_generation(*eb_ret, slot); parent_level = btrfs_header_level(*eb_ret); btrfs_node_key_to_cpu(*eb_ret, &check.first_key, slot); check.has_first_key = true; check.level = parent_level - 1; check.transid = gen; check.owner_root = root->root_key.objectid; /* * If we need to read an extent buffer from disk and we are holding locks * on upper level nodes, we unlock all the upper nodes before reading the * extent buffer, and then return -EAGAIN to the caller as it needs to * restart the search. We don't release the lock on the current level * because we need to walk this node to figure out which blocks to read. */ tmp = find_extent_buffer(fs_info, blocknr); if (tmp) { if (p->reada == READA_FORWARD_ALWAYS) reada_for_search(fs_info, p, level, slot, key->objectid); /* first we do an atomic uptodate check */ if (btrfs_buffer_uptodate(tmp, gen, 1) > 0) { /* * Do extra check for first_key, eb can be stale due to * being cached, read from scrub, or have multiple * parents (shared tree blocks). */ if (btrfs_verify_level_key(tmp, parent_level - 1, &check.first_key, gen)) { free_extent_buffer(tmp); return -EUCLEAN; } *eb_ret = tmp; return 0; } if (p->nowait) { free_extent_buffer(tmp); return -EAGAIN; } if (unlock_up) btrfs_unlock_up_safe(p, level + 1); /* now we're allowed to do a blocking uptodate check */ ret = btrfs_read_extent_buffer(tmp, &check); if (ret) { free_extent_buffer(tmp); btrfs_release_path(p); return -EIO; } if (btrfs_check_eb_owner(tmp, root->root_key.objectid)) { free_extent_buffer(tmp); btrfs_release_path(p); return -EUCLEAN; } if (unlock_up) ret = -EAGAIN; goto out; } else if (p->nowait) { return -EAGAIN; } if (unlock_up) { btrfs_unlock_up_safe(p, level + 1); ret = -EAGAIN; } else { ret = 0; } if (p->reada != READA_NONE) reada_for_search(fs_info, p, level, slot, key->objectid); tmp = read_tree_block(fs_info, blocknr, &check); if (IS_ERR(tmp)) { btrfs_release_path(p); return PTR_ERR(tmp); } /* * If the read above didn't mark this buffer up to date, * it will never end up being up to date. Set ret to EIO now * and give up so that our caller doesn't loop forever * on our EAGAINs. */ if (!extent_buffer_uptodate(tmp)) ret = -EIO; out: if (ret == 0) { *eb_ret = tmp; } else { free_extent_buffer(tmp); btrfs_release_path(p); } return ret; } /* * helper function for btrfs_search_slot. This does all of the checks * for node-level blocks and does any balancing required based on * the ins_len. * * If no extra work was required, zero is returned. If we had to * drop the path, -EAGAIN is returned and btrfs_search_slot must * start over */ static int setup_nodes_for_search(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *p, struct extent_buffer *b, int level, int ins_len, int *write_lock_level) { struct btrfs_fs_info *fs_info = root->fs_info; int ret = 0; if ((p->search_for_split || ins_len > 0) && btrfs_header_nritems(b) >= BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 3) { if (*write_lock_level < level + 1) { *write_lock_level = level + 1; btrfs_release_path(p); return -EAGAIN; } reada_for_balance(p, level); ret = split_node(trans, root, p, level); b = p->nodes[level]; } else if (ins_len < 0 && btrfs_header_nritems(b) < BTRFS_NODEPTRS_PER_BLOCK(fs_info) / 2) { if (*write_lock_level < level + 1) { *write_lock_level = level + 1; btrfs_release_path(p); return -EAGAIN; } reada_for_balance(p, level); ret = balance_level(trans, root, p, level); if (ret) return ret; b = p->nodes[level]; if (!b) { btrfs_release_path(p); return -EAGAIN; } BUG_ON(btrfs_header_nritems(b) == 1); } return ret; } int btrfs_find_item(struct btrfs_root *fs_root, struct btrfs_path *path, u64 iobjectid, u64 ioff, u8 key_type, struct btrfs_key *found_key) { int ret; struct btrfs_key key; struct extent_buffer *eb; ASSERT(path); ASSERT(found_key); key.type = key_type; key.objectid = iobjectid; key.offset = ioff; ret = btrfs_search_slot(NULL, fs_root, &key, path, 0, 0); if (ret < 0) return ret; eb = path->nodes[0]; if (ret && path->slots[0] >= btrfs_header_nritems(eb)) { ret = btrfs_next_leaf(fs_root, path); if (ret) return ret; eb = path->nodes[0]; } btrfs_item_key_to_cpu(eb, found_key, path->slots[0]); if (found_key->type != key.type || found_key->objectid != key.objectid) return 1; return 0; } static struct extent_buffer *btrfs_search_slot_get_root(struct btrfs_root *root, struct btrfs_path *p, int write_lock_level) { struct extent_buffer *b; int root_lock = 0; int level = 0; if (p->search_commit_root) { b = root->commit_root; atomic_inc(&b->refs); level = btrfs_header_level(b); /* * Ensure that all callers have set skip_locking when * p->search_commit_root = 1. */ ASSERT(p->skip_locking == 1); goto out; } if (p->skip_locking) { b = btrfs_root_node(root); level = btrfs_header_level(b); goto out; } /* We try very hard to do read locks on the root */ root_lock = BTRFS_READ_LOCK; /* * If the level is set to maximum, we can skip trying to get the read * lock. */ if (write_lock_level < BTRFS_MAX_LEVEL) { /* * We don't know the level of the root node until we actually * have it read locked */ if (p->nowait) { b = btrfs_try_read_lock_root_node(root); if (IS_ERR(b)) return b; } else { b = btrfs_read_lock_root_node(root); } level = btrfs_header_level(b); if (level > write_lock_level) goto out; /* Whoops, must trade for write lock */ btrfs_tree_read_unlock(b); free_extent_buffer(b); } b = btrfs_lock_root_node(root); root_lock = BTRFS_WRITE_LOCK; /* The level might have changed, check again */ level = btrfs_header_level(b); out: /* * The root may have failed to write out at some point, and thus is no * longer valid, return an error in this case. */ if (!extent_buffer_uptodate(b)) { if (root_lock) btrfs_tree_unlock_rw(b, root_lock); free_extent_buffer(b); return ERR_PTR(-EIO); } p->nodes[level] = b; if (!p->skip_locking) p->locks[level] = root_lock; /* * Callers are responsible for dropping b's references. */ return b; } /* * Replace the extent buffer at the lowest level of the path with a cloned * version. The purpose is to be able to use it safely, after releasing the * commit root semaphore, even if relocation is happening in parallel, the * transaction used for relocation is committed and the extent buffer is * reallocated in the next transaction. * * This is used in a context where the caller does not prevent transaction * commits from happening, either by holding a transaction handle or holding * some lock, while it's doing searches through a commit root. * At the moment it's only used for send operations. */ static int finish_need_commit_sem_search(struct btrfs_path *path) { const int i = path->lowest_level; const int slot = path->slots[i]; struct extent_buffer *lowest = path->nodes[i]; struct extent_buffer *clone; ASSERT(path->need_commit_sem); if (!lowest) return 0; lockdep_assert_held_read(&lowest->fs_info->commit_root_sem); clone = btrfs_clone_extent_buffer(lowest); if (!clone) return -ENOMEM; btrfs_release_path(path); path->nodes[i] = clone; path->slots[i] = slot; return 0; } static inline int search_for_key_slot(struct extent_buffer *eb, int search_low_slot, const struct btrfs_key *key, int prev_cmp, int *slot) { /* * If a previous call to btrfs_bin_search() on a parent node returned an * exact match (prev_cmp == 0), we can safely assume the target key will * always be at slot 0 on lower levels, since each key pointer * (struct btrfs_key_ptr) refers to the lowest key accessible from the * subtree it points to. Thus we can skip searching lower levels. */ if (prev_cmp == 0) { *slot = 0; return 0; } return btrfs_bin_search(eb, search_low_slot, key, slot); } static int search_leaf(struct btrfs_trans_handle *trans, struct btrfs_root *root, const struct btrfs_key *key, struct btrfs_path *path, int ins_len, int prev_cmp) { struct extent_buffer *leaf = path->nodes[0]; int leaf_free_space = -1; int search_low_slot = 0; int ret; bool do_bin_search = true; /* * If we are doing an insertion, the leaf has enough free space and the * destination slot for the key is not slot 0, then we can unlock our * write lock on the parent, and any other upper nodes, before doing the * binary search on the leaf (with search_for_key_slot()), allowing other * tasks to lock the parent and any other upper nodes. */ if (ins_len > 0) { /* * Cache the leaf free space, since we will need it later and it * will not change until then. */ leaf_free_space = btrfs_leaf_free_space(leaf); /* * !path->locks[1] means we have a single node tree, the leaf is * the root of the tree. */ if (path->locks[1] && leaf_free_space >= ins_len) { struct btrfs_disk_key first_key; ASSERT(btrfs_header_nritems(leaf) > 0); btrfs_item_key(leaf, &first_key, 0); /* * Doing the extra comparison with the first key is cheap, * taking into account that the first key is very likely * already in a cache line because it immediately follows * the extent buffer's header and we have recently accessed * the header's level field. */ ret = btrfs_comp_keys(&first_key, key); if (ret < 0) { /* * The first key is smaller than the key we want * to insert, so we are safe to unlock all upper * nodes and we have to do the binary search. * * We do use btrfs_unlock_up_safe() and not * unlock_up() because the later does not unlock * nodes with a slot of 0 - we can safely unlock * any node even if its slot is 0 since in this * case the key does not end up at slot 0 of the * leaf and there's no need to split the leaf. */ btrfs_unlock_up_safe(path, 1); search_low_slot = 1; } else { /* * The first key is >= then the key we want to * insert, so we can skip the binary search as * the target key will be at slot 0. * * We can not unlock upper nodes when the key is * less than the first key, because we will need * to update the key at slot 0 of the parent node * and possibly of other upper nodes too. * If the key matches the first key, then we can * unlock all the upper nodes, using * btrfs_unlock_up_safe() instead of unlock_up() * as stated above. */ if (ret == 0) btrfs_unlock_up_safe(path, 1); /* * ret is already 0 or 1, matching the result of * a btrfs_bin_search() call, so there is no need * to adjust it. */ do_bin_search = false; path->slots[0] = 0; } } } if (do_bin_search) { ret = search_for_key_slot(leaf, search_low_slot, key, prev_cmp, &path->slots[0]); if (ret < 0) return ret; } if (ins_len > 0) { /* * Item key already exists. In this case, if we are allowed to * insert the item (for example, in dir_item case, item key * collision is allowed), it will be merged with the original * item. Only the item size grows, no new btrfs item will be * added. If search_for_extension is not set, ins_len already * accounts the size btrfs_item, deduct it here so leaf space * check will be correct. */ if (ret == 0 && !path->search_for_extension) { ASSERT(ins_len >= sizeof(struct btrfs_item)); ins_len -= sizeof(struct btrfs_item); } ASSERT(leaf_free_space >= 0); if (leaf_free_space < ins_len) { int err; err = split_leaf(trans, root, key, path, ins_len, (ret == 0)); ASSERT(err <= 0); if (WARN_ON(err > 0)) err = -EUCLEAN; if (err) ret = err; } } return ret; } /* * Look for a key in a tree and perform necessary modifications to preserve * tree invariants. * * @trans: Handle of transaction, used when modifying the tree * @p: Holds all btree nodes along the search path * @root: The root node of the tree * @key: The key we are looking for * @ins_len: Indicates purpose of search: * >0 for inserts it's size of item inserted (*) * <0 for deleti |